From f6c0e191020ad330c06438c144e0ea787ca964fd Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Sat, 21 Oct 2006 02:56:33 -0700 Subject: [PATCH] git-pickaxe: get rid of wasteful find_origin(). After finding out which path in the parent to scan to pass blames, using get_tree_entry() to extract the blob information again was quite wasteful, since diff-tree already gave us that information. Separate the function to create an origin out as get_origin(). You'll never know what is more efficient unless you try and/or think hard. I somehow thought that extracting one known path out of commit's tree is cheaper than running a diff-tree for the current path between the commit and its parent, but it is not the case. In real, non-toy projects, most commits do not touch the path you are interested in, and if the path is a few levels away from the toplevel, whole-subdirectory comparison logic diff-tree allows us to skip opening lower subdirectories. This commit rewrites find_origin() function to use a single-path diff-tree to see if the parent has the same blob as the current suspect, which is cheaper than extracting the blob information using get_tree_entry() and comparing it with what the current suspect has. This shaves about 6% overhead when annotating kernel/sched.c in the Linux kernel repository on my machine. The saving rises to 25% for arch/i386/kernel/Makefile. Signed-off-by: Junio C Hamano --- builtin-pickaxe.c | 138 +++++++++++++++++++++++++++++++++------------- 1 file changed, 101 insertions(+), 37 deletions(-) diff --git a/builtin-pickaxe.c b/builtin-pickaxe.c index 3ab87efac..cf474b0d1 100644 --- a/builtin-pickaxe.c +++ b/builtin-pickaxe.c @@ -140,48 +140,103 @@ static void coalesce(struct scoreboard *sb) } } -static void free_origin(struct origin *o) +static struct origin *get_origin(struct scoreboard *sb, + struct commit *commit, + const char *path) { - free(o); -} - -static struct origin *find_origin(struct scoreboard *sb, - struct commit *commit, - const char *path) -{ - struct blame_entry *ent; + struct blame_entry *e; struct origin *o; - unsigned mode; - char type[10]; - for (ent = sb->ent; ent; ent = ent->next) { - if (ent->suspect->commit == commit && - !strcmp(ent->suspect->path, path)) - return ent->suspect; + for (e = sb->ent; e; e = e->next) { + if (e->suspect->commit == commit && + !strcmp(e->suspect->path, path)) + return e->suspect; } - o = xcalloc(1, sizeof(*o) + strlen(path) + 1); o->commit = commit; strcpy(o->path, path); - if (get_tree_entry(commit->object.sha1, path, o->blob_sha1, &mode)) - goto err_out; - if (sha1_object_info(o->blob_sha1, type, NULL) || - strcmp(type, blob_type)) - goto err_out; return o; - err_out: - free_origin(o); - return NULL; } -static struct origin *find_rename(struct scoreboard *sb, +static int fill_blob_sha1(struct origin *origin) +{ + unsigned mode; + char type[10]; + + if (!is_null_sha1(origin->blob_sha1)) + return 0; + if (get_tree_entry(origin->commit->object.sha1, + origin->path, + origin->blob_sha1, &mode)) + goto error_out; + if (sha1_object_info(origin->blob_sha1, type, NULL) || + strcmp(type, blob_type)) + goto error_out; + return 0; + error_out: + hashclr(origin->blob_sha1); + return -1; +} + +static struct origin *find_origin(struct scoreboard *sb, struct commit *parent, struct origin *origin) { struct origin *porigin = NULL; struct diff_options diff_opts; int i; - const char *paths[1]; + const char *paths[2]; + + /* See if the origin->path is different between parent + * and origin first. Most of the time they are the + * same and diff-tree is fairly efficient about this. + */ + diff_setup(&diff_opts); + diff_opts.recursive = 1; + diff_opts.detect_rename = 0; + diff_opts.output_format = DIFF_FORMAT_NO_OUTPUT; + paths[0] = origin->path; + paths[1] = NULL; + + diff_tree_setup_paths(paths, &diff_opts); + if (diff_setup_done(&diff_opts) < 0) + die("diff-setup"); + diff_tree_sha1(parent->tree->object.sha1, + origin->commit->tree->object.sha1, + "", &diff_opts); + diffcore_std(&diff_opts); + + /* It is either one entry that says "modified", or "created", + * or nothing. + */ + if (!diff_queued_diff.nr) { + /* The path is the same as parent */ + porigin = get_origin(sb, parent, origin->path); + hashcpy(porigin->blob_sha1, origin->blob_sha1); + } + else if (diff_queued_diff.nr != 1) + die("internal error in pickaxe::find_origin"); + else { + struct diff_filepair *p = diff_queued_diff.queue[0]; + switch (p->status) { + default: + die("internal error in pickaxe::find_origin (%c)", + p->status); + case 'M': + porigin = get_origin(sb, parent, origin->path); + hashcpy(porigin->blob_sha1, p->one->sha1); + break; + case 'A': + case 'T': + /* Did not exist in parent, or type changed */ + break; + } + } + diff_flush(&diff_opts); + if (porigin) + return porigin; + + /* Otherwise we would look for a rename */ diff_setup(&diff_opts); diff_opts.recursive = 1; @@ -191,16 +246,17 @@ static struct origin *find_rename(struct scoreboard *sb, diff_tree_setup_paths(paths, &diff_opts); if (diff_setup_done(&diff_opts) < 0) die("diff-setup"); - diff_tree_sha1(origin->commit->tree->object.sha1, - parent->tree->object.sha1, + diff_tree_sha1(parent->tree->object.sha1, + origin->commit->tree->object.sha1, "", &diff_opts); diffcore_std(&diff_opts); for (i = 0; i < diff_queued_diff.nr; i++) { struct diff_filepair *p = diff_queued_diff.queue[i]; if ((p->status == 'R' || p->status == 'C') && - !strcmp(p->one->path, origin->path)) { - porigin = find_origin(sb, parent, p->two->path); + !strcmp(p->two->path, origin->path)) { + porigin = get_origin(sb, parent, p->one->path); + hashcpy(porigin->blob_sha1, p->one->sha1); break; } } @@ -705,6 +761,15 @@ static int find_copy_in_parent(struct scoreboard *sb, "", &diff_opts); diffcore_std(&diff_opts); + + /* + * NEEDSWORK: This loop is wasteful in that it opens the same + * blob in the parent number of times. We should swap the + * loop inside out, which would require keeping track of + * "best blame so far" for blame entries that the current + * "target" is being suspected. + */ + for (e = sb->ent; e; e = e->next) { struct blame_entry split[3]; if (e->guilty || cmp_suspect(e->suspect, target)) @@ -724,8 +789,8 @@ static int find_copy_in_parent(struct scoreboard *sb, if (porigin && !strcmp(p->one->path, porigin->path)) /* find_move already dealt with this path */ continue; - norigin = find_origin(sb, parent, p->one->path); - + norigin = get_origin(sb, parent, p->one->path); + hashcpy(norigin->blob_sha1, p->one->sha1); blob = read_sha1_file(norigin->blob_sha1, type, (unsigned long *) &file_p.size); file_p.ptr = blob; @@ -735,6 +800,7 @@ static int find_copy_in_parent(struct scoreboard *sb, } find_copy_in_blob(sb, e, norigin, this, &file_p); copy_split_if_better(sb, split, this); + free(blob); } if (split[1].suspect && blame_copy_score < ent_score(sb, &split[1])) @@ -762,9 +828,7 @@ static void pass_blame(struct scoreboard *sb, struct origin *origin, int opt) if (parse_commit(p)) continue; - porigin = find_origin(sb, parent->item, origin->path); - if (!porigin) - porigin = find_rename(sb, parent->item, origin); + porigin = find_origin(sb, parent->item, origin); if (!porigin) continue; if (!hashcmp(porigin->blob_sha1, origin->blob_sha1)) { @@ -1423,8 +1487,8 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix) */ prepare_revision_walk(&revs); - o = find_origin(&sb, sb.final, path); - if (!o) + o = get_origin(&sb, sb.final, path); + if (fill_blob_sha1(o)) die("no such path %s in %s", path, final_commit_name); sb.final_buf = read_sha1_file(o->blob_sha1, type, &sb.final_buf_size); -- 2.26.2