diffcore-rename: somewhat optimized.
authorJunio C Hamano <junkio@cox.net>
Sun, 12 Mar 2006 11:22:10 +0000 (03:22 -0800)
committerJunio C Hamano <junkio@cox.net>
Sun, 12 Mar 2006 11:22:10 +0000 (03:22 -0800)
This changes diffcore-rename to reuse statistics information
gathered during similarity estimation, and updates the hashtable
implementation used to keep track of the statistics to be
denser.  This seems to give better performance.

Signed-off-by: Junio C Hamano <junkio@cox.net>
diff.c
diffcore-break.c
diffcore-delta.c
diffcore-rename.c
diffcore.h

diff --git a/diff.c b/diff.c
index c0548eed98df0fb6c11b04a775b6b64b3bb4d417..c73a40b9ab3abe3e4a1ddd51a23a19be36b07ea4 100644 (file)
--- a/diff.c
+++ b/diff.c
@@ -463,6 +463,8 @@ void diff_free_filespec_data(struct diff_filespec *s)
                munmap(s->data, s->size);
        s->should_free = s->should_munmap = 0;
        s->data = NULL;
+       free(s->cnt_data);
+       s->cnt_data = NULL;
 }
 
 static void prep_temp_blob(struct diff_tempfile *temp,
index 71ad58a25a05ea760fd67fc1583e3323dd53a8f6..ed0e14c6d8b1b8347f20c890854972be6c6a234c 100644 (file)
@@ -68,6 +68,7 @@ static int should_break(struct diff_filespec *src,
 
        if (diffcore_count_changes(src->data, src->size,
                                   dst->data, dst->size,
+                                  NULL, NULL,
                                   0,
                                   &src_copied, &literal_added))
                return 0;
index 70bacff83752a6f91b6aa3089374660840e8059a..471b98f05de2899f6aaa4818966aa99df7ec8f7e 100644 (file)
  * The length of the sequence is arbitrarily set to 8 for now.
  */
 
+/* Wild guess at the initial hash size */
+#define INITIAL_HASH_SIZE 10
 #define HASHBASE 65537 /* next_prime(2^16) */
 
-static void hash_chars(unsigned char *buf, unsigned long sz, int *count)
+struct spanhash {
+       unsigned long hashval;
+       unsigned long cnt;
+};
+struct spanhash_top {
+       int alloc_log2;
+       int free;
+       struct spanhash data[FLEX_ARRAY];
+};
+
+static struct spanhash *spanhash_find(struct spanhash_top *top, unsigned long hashval)
+{
+       int sz = 1 << top->alloc_log2;
+       int bucket = hashval & (sz - 1);
+       while (1) {
+               struct spanhash *h = &(top->data[bucket++]);
+               if (!h->cnt)
+                       return NULL;
+               if (h->hashval == hashval)
+                       return h;
+               if (sz <= bucket)
+                       bucket = 0;
+       }
+}
+
+static struct spanhash_top *spanhash_rehash(struct spanhash_top *orig)
+{
+       struct spanhash_top *new;
+       int i;
+       int osz = 1 << orig->alloc_log2;
+       int sz = osz << 1;
+
+       new = xmalloc(sizeof(*orig) + sizeof(struct spanhash) * sz);
+       new->alloc_log2 = orig->alloc_log2 + 1;
+       new->free = osz;
+       memset(new->data, 0, sizeof(struct spanhash) * sz);
+       for (i = 0; i < osz; i++) {
+               struct spanhash *o = &(orig->data[i]);
+               int bucket;
+               if (!o->cnt)
+                       continue;
+               bucket = o->hashval & (sz - 1);
+               while (1) {
+                       struct spanhash *h = &(new->data[bucket++]);
+                       if (!h->cnt) {
+                               h->hashval = o->hashval;
+                               h->cnt = o->cnt;
+                               new->free--;
+                               break;
+                       }
+                       if (sz <= bucket)
+                               bucket = 0;
+               }
+       }
+       free(orig);
+       return new;
+}
+
+static struct spanhash_top *add_spanhash(struct spanhash_top *top,
+                                        unsigned long hashval)
+{
+       int bucket, lim;
+       struct spanhash *h;
+
+       lim = (1 << top->alloc_log2);
+       bucket = hashval & (lim - 1);
+       while (1) {
+               h = &(top->data[bucket++]);
+               if (!h->cnt) {
+                       h->hashval = hashval;
+                       h->cnt = 1;
+                       top->free--;
+                       if (top->free < 0)
+                               return spanhash_rehash(top);
+                       return top;
+               }
+               if (h->hashval == hashval) {
+                       h->cnt++;
+                       return top;
+               }
+               if (lim <= bucket)
+                       bucket = 0;
+       }
+}
+
+static struct spanhash_top *hash_chars(unsigned char *buf, unsigned long sz)
 {
-       unsigned int accum1, accum2, i;
+       int i;
+       unsigned long accum1, accum2, hashval;
+       struct spanhash_top *hash;
+
+       i = INITIAL_HASH_SIZE;
+       hash = xmalloc(sizeof(*hash) + sizeof(struct spanhash) * (1<<i));
+       hash->alloc_log2 = i;
+       hash->free = (1<<i)/2;
+       memset(hash->data, 0, sizeof(struct spanhash) * (1<<i));
 
        /* an 8-byte shift register made of accum1 and accum2.  New
         * bytes come at LSB of accum2, and shifted up to accum1
@@ -40,44 +135,68 @@ static void hash_chars(unsigned char *buf, unsigned long sz, int *count)
        while (sz) {
                accum1 = (accum1 << 8) | (accum2 >> 24);
                accum2 = (accum2 << 8) | *buf++;
-               /* We want something that hashes permuted byte
-                * sequences nicely; simpler hash like (accum1 ^
-                * accum2) does not perform as well.
-                */
-               i = (accum1 + accum2 * 0x61) % HASHBASE;
-               count[i]++;
+               hashval = (accum1 + accum2 * 0x61) % HASHBASE;
+               hash = add_spanhash(hash, hashval);
                sz--;
        }
+       return hash;
 }
 
 int diffcore_count_changes(void *src, unsigned long src_size,
                           void *dst, unsigned long dst_size,
+                          void **src_count_p,
+                          void **dst_count_p,
                           unsigned long delta_limit,
                           unsigned long *src_copied,
                           unsigned long *literal_added)
 {
-       int *src_count, *dst_count, i;
+       int i, ssz;
+       struct spanhash_top *src_count, *dst_count;
        unsigned long sc, la;
 
        if (src_size < 8 || dst_size < 8)
                return -1;
 
-       src_count = xcalloc(HASHBASE * 2, sizeof(int));
-       dst_count = src_count + HASHBASE;
-       hash_chars(src, src_size, src_count);
-       hash_chars(dst, dst_size, dst_count);
-
+       src_count = dst_count = NULL;
+       if (src_count_p)
+               src_count = *src_count_p;
+       if (!src_count) {
+               src_count = hash_chars(src, src_size);
+               if (src_count_p)
+                       *src_count_p = src_count;
+       }
+       if (dst_count_p)
+               dst_count = *dst_count_p;
+       if (!dst_count) {
+               dst_count = hash_chars(dst, dst_size);
+               if (dst_count_p)
+                       *dst_count_p = dst_count;
+       }
        sc = la = 0;
-       for (i = 0; i < HASHBASE; i++) {
-               if (src_count[i] < dst_count[i]) {
-                       la += dst_count[i] - src_count[i];
-                       sc += src_count[i];
+
+       ssz = 1 << src_count->alloc_log2;
+       for (i = 0; i < ssz; i++) {
+               struct spanhash *s = &(src_count->data[i]);
+               struct spanhash *d;
+               unsigned dst_cnt, src_cnt;
+               if (!s->cnt)
+                       continue;
+               src_cnt = s->cnt;
+               d = spanhash_find(dst_count, s->hashval);
+               dst_cnt = d ? d->cnt : 0;
+               if (src_cnt < dst_cnt) {
+                       la += dst_cnt - src_cnt;
+                       sc += src_cnt;
                }
-               else /* i.e. if (dst_count[i] <= src_count[i]) */
-                       sc += dst_count[i];
+               else
+                       sc += dst_cnt;
        }
+
+       if (!src_count_p)
+               free(src_count);
+       if (!dst_count_p)
+               free(dst_count);
        *src_copied = sc;
        *literal_added = la;
-       free(src_count);
        return 0;
 }
index 625b589fb728e5ae94b3b13d419b43c30347508d..b80b4320ff6ea586d8d3c2800a7c42d68ccdd796 100644 (file)
@@ -166,6 +166,7 @@ static int estimate_similarity(struct diff_filespec *src,
        delta_limit = base_size * (MAX_SCORE-minimum_score) / MAX_SCORE;
        if (diffcore_count_changes(src->data, src->size,
                                   dst->data, dst->size,
+                                  &src->cnt_data, &dst->cnt_data,
                                   delta_limit,
                                   &src_copied, &literal_added))
                return 0;
@@ -306,6 +307,8 @@ void diffcore_rename(struct diff_options *options)
                        m->score = estimate_similarity(one, two,
                                                       minimum_score);
                }
+               free(two->cnt_data);
+               two->cnt_data = NULL;
                dst_cnt++;
        }
        /* cost matrix sorted by most to least similar pair */
index d31b3b476c0fc25a85b483b3d1d04b2d36bc6481..73c7842cc7fd907519016b936a96b6f08cd80b47 100644 (file)
@@ -26,6 +26,7 @@ struct diff_filespec {
        unsigned char sha1[20];
        char *path;
        void *data;
+       void *cnt_data;
        unsigned long size;
        int xfrm_flags;          /* for use by the xfrm */
        unsigned short mode;     /* file mode */
@@ -103,6 +104,8 @@ void diff_debug_queue(const char *, struct diff_queue_struct *);
 
 extern int diffcore_count_changes(void *src, unsigned long src_size,
                                  void *dst, unsigned long dst_size,
+                                 void **src_count_p,
+                                 void **dst_count_p,
                                  unsigned long delta_limit,
                                  unsigned long *src_copied,
                                  unsigned long *literal_added);