Implemented tree delta compression in fast-import.
author Shawn O. Pearce <spearce@spearce.org>
Mon, 28 Aug 2006 16:22:50 +0000 (12:22 -0400)
committer Shawn O. Pearce <spearce@spearce.org>
Sun, 14 Jan 2007 07:15:10 +0000 (02:15 -0500)
We now store for every tree entry two modes and two sha1 values:
the base (aka "version 0") and the current/new (aka "version 1").
When we generate a tree object we also regenerate the prior version
object and use that as our base object for a delta.  This strategy
saves a significant amount of memory as we can continue to use the
atom pool for file/directory names and only increases each tree
entry by an additional 24 bytes of memory.

Branches should automatically delta against their ancestor tree,
unless the ancestor tree is already at the delta chain limit.

Signed-off-by: Shawn O. Pearce <spearce@spearce.org>
fast-import.c

index b1b2382560e24b9e8f148eeb8c09c9c19c0ac482..6b011204150111d40cf8eb50117b24236eb214c8 100644 (file)
@@ -132,7 +132,7 @@ struct mark_set
 struct last_object
 {
        void *data;
-       unsigned int len;
+       unsigned long len;
        unsigned int depth;
        unsigned char sha1[20];
 };
@@ -157,14 +157,18 @@ struct tree_entry
 {
        struct tree_content *tree;
        struct atom_str* name;
-       unsigned int mode;
-       unsigned char sha1[20];
+       struct tree_entry_ms
+       {
+               unsigned int mode;
+               unsigned char sha1[20];
+       } versions[2];
 };
 
 struct tree_content
 {
        unsigned int entry_capacity; /* must match avail_tree_content */
        unsigned int entry_count;
+       unsigned int delta_depth;
        struct tree_entry *entries[FLEX_ARRAY]; /* more */
 };
 
@@ -203,6 +207,7 @@ static unsigned long duplicate_count;
 static unsigned long marks_set_count;
 static unsigned long object_count_by_type[9];
 static unsigned long duplicate_count_by_type[9];
+static unsigned long delta_count_by_type[9];
 
 /* Memory pools */
 static size_t mem_pool_alloc = 2*1024*1024 - sizeof(struct mem_pool);
@@ -224,7 +229,7 @@ static unsigned long pack_mlen = 128*1024*1024;
 static unsigned long page_size;
 
 /* Table of objects we've written. */
-static unsigned int object_entry_alloc = 1000;
+static unsigned int object_entry_alloc = 5000;
 static struct object_entry_pool *blocks;
 static struct object_entry *object_table[1 << 16];
 static struct mark_set *marks;
@@ -486,6 +491,7 @@ static struct tree_content* new_tree_content(unsigned int cnt)
 
        t = (struct tree_content*)f;
        t->entry_count = 0;
+       t->delta_depth = 0;
        return t;
 }
 
@@ -512,6 +518,7 @@ static struct tree_content* grow_tree_content(
 {
        struct tree_content *r = new_tree_content(t->entry_count + amt);
        r->entry_count = t->entry_count;
+       r->delta_depth = t->delta_depth;
        memcpy(r->entries,t->entries,t->entry_count*sizeof(t->entries[0]));
        release_tree_content(t);
        return r;
@@ -642,6 +649,7 @@ static int store_object(
        deflateInit(&s, zlib_compression_level);
 
        if (delta) {
+               delta_count_by_type[type]++;
                last->depth++;
                s.next_in = delta;
                s.avail_in = deltalen;
@@ -755,11 +763,14 @@ static void *unpack_non_delta_entry(unsigned long o, unsigned long sz)
        return result;
 }
 
-static void *unpack_entry(unsigned long offset, unsigned long *sizep);
+static void *unpack_entry(unsigned long offset,
+       unsigned long *sizep,
+       unsigned int *delta_depth);
 
 static void *unpack_delta_entry(unsigned long offset,
        unsigned long delta_size,
-       unsigned long *sizep)
+       unsigned long *sizep,
+       unsigned int *delta_depth)
 {
        struct object_entry *base_oe;
        unsigned char *base_sha1;
@@ -770,7 +781,7 @@ static void *unpack_delta_entry(unsigned long offset,
        base_oe = find_object(base_sha1);
        if (!base_oe)
                die("I'm broken; I can't find a base I know must be here.");
-       base = unpack_entry(base_oe->offset, &base_size);
+       base = unpack_entry(base_oe->offset, &base_size, delta_depth);
        delta_data = unpack_non_delta_entry(offset + 20, delta_size);
        result = patch_delta(base, base_size,
                             delta_data, delta_size,
@@ -780,10 +791,13 @@ static void *unpack_delta_entry(unsigned long offset,
        free(delta_data);
        free(base);
        *sizep = result_size;
+       (*delta_depth)++;
        return result;
 }
 
-static void *unpack_entry(unsigned long offset, unsigned long *sizep)
+static void *unpack_entry(unsigned long offset,
+       unsigned long *sizep,
+       unsigned int *delta_depth)
 {
        unsigned long size;
        enum object_type kind;
@@ -791,12 +805,13 @@ static void *unpack_entry(unsigned long offset, unsigned long *sizep)
        offset = unpack_object_header(offset, &kind, &size);
        switch (kind) {
        case OBJ_DELTA:
-               return unpack_delta_entry(offset, size, sizep);
+               return unpack_delta_entry(offset, size, sizep, delta_depth);
        case OBJ_COMMIT:
        case OBJ_TREE:
        case OBJ_BLOB:
        case OBJ_TAG:
                *sizep = size;
+               *delta_depth = 0;
                return unpack_non_delta_entry(offset, size);
        default:
                die("I created an object I can't read!");
@@ -819,6 +834,7 @@ static const char *get_mode(const char *str, unsigned int *modep)
 
 static void load_tree(struct tree_entry *root)
 {
+       unsigned char* sha1 = root->versions[1].sha1;
        struct object_entry *myoe;
        struct tree_content *t;
        unsigned long size;
@@ -826,19 +842,19 @@ static void load_tree(struct tree_entry *root)
        const char *c;
 
        root->tree = t = new_tree_content(8);
-       if (is_null_sha1(root->sha1))
+       if (is_null_sha1(sha1))
                return;
 
-       myoe = find_object(root->sha1);
+       myoe = find_object(sha1);
        if (myoe) {
                if (myoe->type != OBJ_TREE)
-                       die("Not a tree: %s", sha1_to_hex(root->sha1));
-               buf = unpack_entry(myoe->offset, &size);
+                       die("Not a tree: %s", sha1_to_hex(sha1));
+               buf = unpack_entry(myoe->offset, &size, &t->delta_depth);
        } else {
                char type[20];
-               buf = read_sha1_file(root->sha1, type, &size);
+               buf = read_sha1_file(sha1, type, &size);
                if (!buf || strcmp(type, tree_type))
-                       die("Can't load tree %s", sha1_to_hex(root->sha1));
+                       die("Can't load tree %s", sha1_to_hex(sha1));
        }
 
        c = buf;
@@ -850,56 +866,116 @@ static void load_tree(struct tree_entry *root)
                t->entries[t->entry_count++] = e;
 
                e->tree = NULL;
-               c = get_mode(c, &e->mode);
+               c = get_mode(c, &e->versions[1].mode);
                if (!c)
-                       die("Corrupt mode in %s", sha1_to_hex(root->sha1));
+                       die("Corrupt mode in %s", sha1_to_hex(sha1));
+               e->versions[0].mode = e->versions[1].mode;
                e->name = to_atom(c, strlen(c));
                c += e->name->str_len + 1;
-               hashcpy(e->sha1, c);
+               hashcpy(e->versions[0].sha1, (unsigned char*)c);
+               hashcpy(e->versions[1].sha1, (unsigned char*)c);
                c += 20;
        }
        free(buf);
 }
 
-static int tecmp (const void *_a, const void *_b)
+static int tecmp0 (const void *_a, const void *_b)
 {
        struct tree_entry *a = *((struct tree_entry**)_a);
        struct tree_entry *b = *((struct tree_entry**)_b);
        return base_name_compare(
-               a->name->str_dat, a->name->str_len, a->mode,
-               b->name->str_dat, b->name->str_len, b->mode);
+               a->name->str_dat, a->name->str_len, a->versions[0].mode,
+               b->name->str_dat, b->name->str_len, b->versions[0].mode);
 }
 
-static void store_tree(struct tree_entry *root)
+static int tecmp1 (const void *_a, const void *_b)
 {
-       struct tree_content *t = root->tree;
+       struct tree_entry *a = *((struct tree_entry**)_a);
+       struct tree_entry *b = *((struct tree_entry**)_b);
+       return base_name_compare(
+               a->name->str_dat, a->name->str_len, a->versions[1].mode,
+               b->name->str_dat, b->name->str_len, b->versions[1].mode);
+}
+
+static void* mktree(struct tree_content *t, int v, unsigned long *szp)
+{
+       size_t maxlen = 0;
        unsigned int i;
-       size_t maxlen;
        char *buf, *c;
 
-       if (!is_null_sha1(root->sha1))
-               return;
+       if (!v)
+               qsort(t->entries,t->entry_count,sizeof(t->entries[0]),tecmp0);
+       else
+               qsort(t->entries,t->entry_count,sizeof(t->entries[0]),tecmp1);
 
-       maxlen = 0;
        for (i = 0; i < t->entry_count; i++) {
-               maxlen += t->entries[i]->name->str_len + 34;
-               if (t->entries[i]->tree)
-                       store_tree(t->entries[i]);
+               if (t->entries[i]->versions[v].mode)
+                       maxlen += t->entries[i]->name->str_len + 34;
        }
 
-       qsort(t->entries, t->entry_count, sizeof(t->entries[0]), tecmp);
        buf = c = xmalloc(maxlen);
        for (i = 0; i < t->entry_count; i++) {
                struct tree_entry *e = t->entries[i];
-               c += sprintf(c, "%o", e->mode);
+               if (!e->versions[v].mode)
+                       continue;
+               c += sprintf(c, "%o", e->versions[v].mode);
                *c++ = ' ';
                strcpy(c, e->name->str_dat);
                c += e->name->str_len + 1;
-               hashcpy(c, e->sha1);
+               hashcpy((unsigned char*)c, e->versions[v].sha1);
                c += 20;
        }
-       store_object(OBJ_TREE, buf, c - buf, NULL, root->sha1, 0);
-       free(buf);
+
+       *szp = c - buf;
+       return buf;
+}
+
+static void store_tree(struct tree_entry *root)
+{
+       struct tree_content *t = root->tree;
+       unsigned int i, j, del;
+       unsigned long vers1len;
+       void **vers1dat;
+       struct last_object lo;
+
+       if (!is_null_sha1(root->versions[1].sha1))
+               return;
+
+       for (i = 0; i < t->entry_count; i++) {
+               if (t->entries[i]->tree)
+                       store_tree(t->entries[i]);
+       }
+
+       if (is_null_sha1(root->versions[0].sha1)
+                       || !find_object(root->versions[0].sha1)) {
+               lo.data = NULL;
+               lo.depth = 0;
+       } else {
+               lo.data = mktree(t, 0, &lo.len);
+               lo.depth = t->delta_depth;
+               hashcpy(lo.sha1, root->versions[0].sha1);
+       }
+       vers1dat = mktree(t, 1, &vers1len);
+
+       store_object(OBJ_TREE, vers1dat, vers1len,
+               &lo, root->versions[1].sha1, 0);
+       /* note: lo.data (if created) was freed by store_object */
+       free(vers1dat);
+
+       t->delta_depth = lo.depth;
+       hashcpy(root->versions[0].sha1, root->versions[1].sha1);
+       for (i = 0, j = 0, del = 0; i < t->entry_count; i++) {
+               struct tree_entry *e = t->entries[i];
+               if (e->versions[1].mode) {
+                       e->versions[0].mode = e->versions[1].mode;
+                       hashcpy(e->versions[0].sha1, e->versions[1].sha1);
+                       t->entries[j++] = e;
+               } else {
+                       release_tree_entry(e);
+                       del++;
+               }
+       }
+       t->entry_count -= del;
 }
 
 static int tree_content_set(
@@ -923,25 +999,26 @@ static int tree_content_set(
                e = t->entries[i];
                if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) {
                        if (!slash1) {
-                               if (e->mode == mode && !hashcmp(e->sha1, sha1))
+                               if (e->versions[1].mode == mode
+                                               && !hashcmp(e->versions[1].sha1, sha1))
                                        return 0;
-                               e->mode = mode;
-                               hashcpy(e->sha1, sha1);
+                               e->versions[1].mode = mode;
+                               hashcpy(e->versions[1].sha1, sha1);
                                if (e->tree) {
                                        release_tree_content_recursive(e->tree);
                                        e->tree = NULL;
                                }
-                               hashclr(root->sha1);
+                               hashclr(root->versions[1].sha1);
                                return 1;
                        }
-                       if (!S_ISDIR(e->mode)) {
+                       if (!S_ISDIR(e->versions[1].mode)) {
                                e->tree = new_tree_content(8);
-                               e->mode = S_IFDIR;
+                               e->versions[1].mode = S_IFDIR;
                        }
                        if (!e->tree)
                                load_tree(e);
                        if (tree_content_set(e, slash1 + 1, sha1, mode)) {
-                               hashclr(root->sha1);
+                               hashclr(root->versions[1].sha1);
                                return 1;
                        }
                        return 0;
@@ -952,17 +1029,19 @@ static int tree_content_set(
                root->tree = t = grow_tree_content(t, 8);
        e = new_tree_entry();
        e->name = to_atom(p, n);
+       e->versions[0].mode = 0;
+       hashclr(e->versions[0].sha1);
        t->entries[t->entry_count++] = e;
        if (slash1) {
                e->tree = new_tree_content(8);
-               e->mode = S_IFDIR;
+               e->versions[1].mode = S_IFDIR;
                tree_content_set(e, slash1 + 1, sha1, mode);
        } else {
                e->tree = NULL;
-               e->mode = mode;
-               hashcpy(e->sha1, sha1);
+               e->versions[1].mode = mode;
+               hashcpy(e->versions[1].sha1, sha1);
        }
-       hashclr(root->sha1);
+       hashclr(root->versions[1].sha1);
        return 1;
 }
 
@@ -982,14 +1061,14 @@ static int tree_content_remove(struct tree_entry *root, const char *p)
        for (i = 0; i < t->entry_count; i++) {
                e = t->entries[i];
                if (e->name->str_len == n && !strncmp(p, e->name->str_dat, n)) {
-                       if (!slash1 || !S_ISDIR(e->mode))
+                       if (!slash1 || !S_ISDIR(e->versions[1].mode))
                                goto del_entry;
                        if (!e->tree)
                                load_tree(e);
                        if (tree_content_remove(e, slash1 + 1)) {
                                if (!e->tree->entry_count)
                                        goto del_entry;
-                               hashclr(root->sha1);
+                               hashclr(root->versions[1].sha1);
                                return 1;
                        }
                        return 0;
@@ -998,11 +1077,13 @@ static int tree_content_remove(struct tree_entry *root, const char *p)
        return 0;
 
 del_entry:
-       for (i++; i < t->entry_count; i++)
-               t->entries[i-1] = t->entries[i];
-       t->entry_count--;
-       release_tree_entry(e);
-       hashclr(root->sha1);
+       if (e->tree) {
+               release_tree_content_recursive(e->tree);
+               e->tree = NULL;
+       }
+       e->versions[1].mode = 0;
+       hashclr(e->versions[1].sha1);
+       hashclr(root->versions[1].sha1);
        return 1;
 }
 
@@ -1359,27 +1440,33 @@ static void cmd_from(struct branch *b)
        if (b == s)
                die("Can't create a branch from itself: %s", b->name);
        else if (s) {
+               unsigned char *t = s->branch_tree.versions[1].sha1;
                hashcpy(b->sha1, s->sha1);
-               hashcpy(b->branch_tree.sha1, s->branch_tree.sha1);
+               hashcpy(b->branch_tree.versions[0].sha1, t);
+               hashcpy(b->branch_tree.versions[1].sha1, t);
        } else if (*from == ':') {
                unsigned long idnum = strtoul(from + 1, NULL, 10);
                struct object_entry *oe = find_mark(idnum);
                unsigned long size;
+               unsigned int depth;
                char *buf;
                if (oe->type != OBJ_COMMIT)
                        die("Mark :%lu not a commit", idnum);
                hashcpy(b->sha1, oe->sha1);
-               buf = unpack_entry(oe->offset, &size);
+               buf = unpack_entry(oe->offset, &size, &depth);
                if (!buf || size < 46)
                        die("Not a valid commit: %s", from);
                if (memcmp("tree ", buf, 5)
-                       || get_sha1_hex(buf + 5, b->branch_tree.sha1))
+                       || get_sha1_hex(buf + 5, b->branch_tree.versions[1].sha1))
                        die("The commit %s is corrupt", sha1_to_hex(b->sha1));
                free(buf);
+               hashcpy(b->branch_tree.versions[0].sha1,
+                       b->branch_tree.versions[1].sha1);
        } else if (!get_sha1(from, b->sha1)) {
-               if (is_null_sha1(b->sha1))
-                       hashclr(b->branch_tree.sha1);
-               else {
+               if (is_null_sha1(b->sha1)) {
+                       hashclr(b->branch_tree.versions[0].sha1);
+                       hashclr(b->branch_tree.versions[1].sha1);
+               } else {
                        unsigned long size;
                        char *buf;
 
@@ -1388,9 +1475,11 @@ static void cmd_from(struct branch *b)
                        if (!buf || size < 46)
                                die("Not a valid commit: %s", from);
                        if (memcmp("tree ", buf, 5)
-                               || get_sha1_hex(buf + 5, b->branch_tree.sha1))
+                               || get_sha1_hex(buf + 5, b->branch_tree.versions[1].sha1))
                                die("The commit %s is corrupt", sha1_to_hex(b->sha1));
                        free(buf);
+                       hashcpy(b->branch_tree.versions[0].sha1,
+                               b->branch_tree.versions[1].sha1);
                }
        } else
                die("Invalid ref name or SHA1 expression: %s", from);
@@ -1466,7 +1555,8 @@ static void cmd_new_commit()
                        ? strlen(author) + strlen(committer)
                        : 2 * strlen(committer)));
        sp = body;
-       sp += sprintf(sp, "tree %s\n", sha1_to_hex(b->branch_tree.sha1));
+       sp += sprintf(sp, "tree %s\n",
+               sha1_to_hex(b->branch_tree.versions[1].sha1));
        if (!is_null_sha1(b->sha1))
                sp += sprintf(sp, "parent %s\n", sha1_to_hex(b->sha1));
        if (author)
@@ -1722,10 +1812,10 @@ int main(int argc, const char **argv)
        fprintf(stderr, "---------------------------------------------------\n");
        fprintf(stderr, "Alloc'd objects: %10lu (%10lu overflow  )\n", alloc_count, alloc_count - est_obj_cnt);
        fprintf(stderr, "Total objects:   %10lu (%10lu duplicates)\n", object_count, duplicate_count);
-       fprintf(stderr, "      blobs  :   %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB]);
-       fprintf(stderr, "      trees  :   %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE]);
-       fprintf(stderr, "      commits:   %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT]);
-       fprintf(stderr, "      tags   :   %10lu (%10lu duplicates)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG]);
+       fprintf(stderr, "      blobs  :   %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_BLOB], duplicate_count_by_type[OBJ_BLOB], delta_count_by_type[OBJ_BLOB]);
+       fprintf(stderr, "      trees  :   %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TREE], duplicate_count_by_type[OBJ_TREE], delta_count_by_type[OBJ_TREE]);
+       fprintf(stderr, "      commits:   %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_COMMIT], duplicate_count_by_type[OBJ_COMMIT], delta_count_by_type[OBJ_COMMIT]);
+       fprintf(stderr, "      tags   :   %10lu (%10lu duplicates %10lu deltas)\n", object_count_by_type[OBJ_TAG], duplicate_count_by_type[OBJ_TAG], delta_count_by_type[OBJ_TAG]);
        fprintf(stderr, "Total branches:  %10lu (%10lu loads     )\n", branch_count, branch_load_count);
        fprintf(stderr, "      marks:     %10u (%10lu unique    )\n", (1 << marks->shift) * 1024, marks_set_count);
        fprintf(stderr, "      atoms:     %10u\n", atom_cnt);