From: Junio C Hamano Date: Fri, 27 May 2005 22:56:38 +0000 (-0700) Subject: [PATCH] Optimize diff-tree -[CM] --stdin X-Git-Tag: v0.99~425 X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=f0c6b2a2fd98b51f1f2655ea69ace9763da28e79;p=git.git [PATCH] Optimize diff-tree -[CM] --stdin This attempts to optimize "diff-tree -[CM] --stdin", which compares successive tree pairs. This optimization does not make much sense for the other commands in the diff-* family. When reading from --stdin and using rename/copy detection, the patch makes diff-tree read the current index file first. This is done to reuse the optimization used by diff-cache in the non-cached case. The similarity estimator can avoid expanding a blob if the index says that the work tree already holds an exact, expanded copy of that blob. Another optimization the patch makes is to check only file sizes first, so that similarity estimation can terminate early. In order for this to work, it needs a way to tell the size of a blob without expanding it. Since the obvious way of doing that, which is to keep all previously used blobs in memory, is too costly, it instead keeps in memory only the file size of each object it has already seen. 
Signed-off-by: Junio C Hamano Signed-off-by: Linus Torvalds --- diff --git a/diff-tree.c b/diff-tree.c index c66c78708..8bdb1dba5 100644 --- a/diff-tree.c +++ b/diff-tree.c @@ -578,6 +578,9 @@ int main(int argc, const char **argv) if (!read_stdin) return 0; + if (detect_rename) + diff_setup_opt |= (DIFF_SETUP_USE_SIZE_CACHE | + DIFF_SETUP_USE_CACHE); while (fgets(line, sizeof(line), stdin)) diff_tree_stdin(line); diff --git a/diff.c b/diff.c index ebec71aee..357c4efce 100644 --- a/diff.c +++ b/diff.c @@ -12,6 +12,7 @@ static const char *diff_opts = "-pu"; static unsigned char null_sha1[20] = { 0, }; static int reverse_diff; +static int use_size_cache; static const char *external_diff(void) { @@ -222,12 +223,60 @@ static int work_tree_matches(const char *name, const unsigned char *sha1) return 1; } +static struct sha1_size_cache { + unsigned char sha1[20]; + unsigned long size; +} **sha1_size_cache; +static int sha1_size_cache_nr, sha1_size_cache_alloc; + +static struct sha1_size_cache *locate_size_cache(unsigned char *sha1, + unsigned long size) +{ + int first, last; + struct sha1_size_cache *e; + + first = 0; + last = sha1_size_cache_nr; + while (last > first) { + int next = (last + first) >> 1; + e = sha1_size_cache[next]; + int cmp = memcmp(e->sha1, sha1, 20); + if (!cmp) + return e; + if (cmp < 0) { + last = next; + continue; + } + first = next+1; + } + /* not found */ + if (size == UINT_MAX) + return NULL; + /* insert to make it at "first" */ + if (sha1_size_cache_alloc <= sha1_size_cache_nr) { + sha1_size_cache_alloc = alloc_nr(sha1_size_cache_alloc); + sha1_size_cache = xrealloc(sha1_size_cache, + sha1_size_cache_alloc * + sizeof(*sha1_size_cache)); + } + sha1_size_cache_nr++; + if (first < sha1_size_cache_nr) + memmove(sha1_size_cache + first + 1, sha1_size_cache + first, + (sha1_size_cache_nr - first - 1) * + sizeof(*sha1_size_cache)); + e = xmalloc(sizeof(struct sha1_size_cache)); + sha1_size_cache[first] = e; + memcpy(e->sha1, sha1, 20); + e->size = 
size; + return e; +} + /* * While doing rename detection and pickaxe operation, we may need to * grab the data for the blob (or file) for our own in-core comparison. * diff_filespec has data and size fields for this purpose. */ -int diff_populate_filespec(struct diff_filespec *s) +int diff_populate_filespec(struct diff_filespec *s, int size_only) { int err = 0; if (!DIFF_FILE_VALID(s)) @@ -235,6 +284,9 @@ int diff_populate_filespec(struct diff_filespec *s) if (S_ISDIR(s->mode)) return -1; + if (!use_size_cache) + size_only = 0; + if (s->data) return err; if (!s->sha1_valid || @@ -254,6 +306,8 @@ int diff_populate_filespec(struct diff_filespec *s) s->size = st.st_size; if (!s->size) goto empty; + if (size_only) + return 0; if (S_ISLNK(st.st_mode)) { int ret; s->data = xmalloc(s->size); @@ -273,9 +327,21 @@ int diff_populate_filespec(struct diff_filespec *s) close(fd); } else { + /* We cannot do size only for SHA1 blobs */ char type[20]; + struct sha1_size_cache *e; + + if (size_only) { + e = locate_size_cache(s->sha1, UINT_MAX); + if (e) { + s->size = e->size; + return 0; + } + } s->data = read_sha1_file(s->sha1, type, &s->size); s->should_free = 1; + if (s->data && size_only) + locate_size_cache(s->sha1, s->size); } return 0; } @@ -361,7 +427,7 @@ static void prepare_temp_file(const char *name, return; } else { - if (diff_populate_filespec(one)) + if (diff_populate_filespec(one, 0)) die("cannot read data blob for %s", one->path); prep_temp_blob(temp, one->data, one->size, one->sha1, one->mode); @@ -496,6 +562,19 @@ void diff_setup(int flags) { if (flags & DIFF_SETUP_REVERSE) reverse_diff = 1; + if (flags & DIFF_SETUP_USE_CACHE) { + if (!active_cache) + /* read-cache does not die even when it fails + * so it is safe for us to do this here. Also + * it does not smudge active_cache or active_nr + * when it fails, so we do not have to worry about + * cleaning it up oufselves either. 
+ */ + read_cache(); + } + if (flags & DIFF_SETUP_USE_SIZE_CACHE) + use_size_cache = 1; + } struct diff_queue_struct diff_queued_diff; diff --git a/diff.h b/diff.h index 40a6757dc..a07ee9f36 100644 --- a/diff.h +++ b/diff.h @@ -29,6 +29,8 @@ extern void diff_unmerge(const char *path); extern int diff_scoreopt_parse(const char *opt); #define DIFF_SETUP_REVERSE 1 +#define DIFF_SETUP_USE_CACHE 2 +#define DIFF_SETUP_USE_SIZE_CACHE 4 extern void diff_setup(int flags); #define DIFF_DETECT_RENAME 1 diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index 9cf3a5083..ef9c5c1d2 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -11,7 +11,7 @@ static int contains(struct diff_filespec *one, { unsigned long offset, sz; const char *data; - if (diff_populate_filespec(one)) + if (diff_populate_filespec(one, 0)) return 0; sz = one->size; data = one->data; diff --git a/diffcore-rename.c b/diffcore-rename.c index 6389dedbf..035d4ebb8 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -99,8 +99,11 @@ static int is_exact_match(struct diff_filespec *src, struct diff_filespec *dst) if (src->sha1_valid && dst->sha1_valid && !memcmp(src->sha1, dst->sha1, 20)) return 1; - if (diff_populate_filespec(src) || diff_populate_filespec(dst)) - /* this is an error but will be caught downstream */ + if (diff_populate_filespec(src, 1) || diff_populate_filespec(dst, 1)) + return 0; + if (src->size != dst->size) + return 0; + if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0)) return 0; if (src->size == dst->size && !memcmp(src->data, dst->data, src->size)) @@ -125,9 +128,11 @@ static int estimate_similarity(struct diff_filespec *src, * dst, and then some edit has been applied to dst. * * Compare them and return how similar they are, representing - * the score as an integer between 0 and 10000, except - * where they match exactly it is considered better than anything - * else. + * the score as an integer between 0 and MAX_SCORE. 
+ * + * When there is an exact match, it is considered a better + * match than anything else; the destination does not even + * call into this function in that case. */ void *delta; unsigned long delta_size, base_size; @@ -147,6 +152,7 @@ static int estimate_similarity(struct diff_filespec *src, /* We would not consider edits that change the file size so * drastically. delta_size must be smaller than * (MAX_SCORE-minimum_score)/MAX_SCORE * min(src->size, dst->size). + * * Note that base_size == 0 case is handled here already * and the final score computation below would not have a * divide-by-zero issue. @@ -154,6 +160,9 @@ static int estimate_similarity(struct diff_filespec *src, if (base_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE) return 0; + if (diff_populate_filespec(src, 0) || diff_populate_filespec(dst, 0)) + return 0; /* error but caught downstream */ + delta = diff_delta(src->data, src->size, dst->data, dst->size, &delta_size); diff --git a/diffcore.h b/diffcore.h index 462014b65..60ee7756e 100644 --- a/diffcore.h +++ b/diffcore.h @@ -33,7 +33,7 @@ extern struct diff_filespec *alloc_filespec(const char *); extern void fill_filespec(struct diff_filespec *, const unsigned char *, unsigned short); -extern int diff_populate_filespec(struct diff_filespec *); +extern int diff_populate_filespec(struct diff_filespec *, int); extern void diff_free_filespec_data(struct diff_filespec *); struct diff_filepair {