git-pickaxe: improve "best match" heuristics
author Junio C Hamano <junkio@cox.net>
Fri, 20 Oct 2006 21:51:12 +0000 (14:51 -0700)
committer Junio C Hamano <junkio@cox.net>
Sat, 21 Oct 2006 01:48:23 +0000 (18:48 -0700)
Instead of comparing number of lines matched, look at the
matched characters and count alnums, so that we do not pass
blame on not-so-interesting lines, such as an empty line and
a line that is indentation followed by a closing brace.

Add an option --score-debug to show the score of each
blame_entry while we cook this further on the "next" branch.

Signed-off-by: Junio C Hamano <junkio@cox.net>
builtin-pickaxe.c

index b595299bf8f3ec7361ec7afe5fe7515502d60730..52e32dcc718c182b25b9ef1ca1e62b65475c48dd 100644 (file)
@@ -34,8 +34,7 @@ static int longest_file;
 static int longest_author;
 static int max_orig_digits;
 static int max_digits;
-
-#define DEBUG 0
+static int max_score_digits;
 
 #define PICKAXE_BLAME_MOVE             01
 #define PICKAXE_BLAME_COPY             02
@@ -78,6 +77,11 @@ struct blame_entry {
         * suspect's file; internally all line numbers are 0 based.
         */
        int s_lno;
+
+       /* how significant this entry is -- cached to avoid
+        * scanning the lines over and over
+        */
+       unsigned score;
 };
 
 struct scoreboard {
@@ -215,9 +219,6 @@ static void process_u_diff(void *state_, char *line, unsigned long len)
        struct chunk *chunk;
        int off1, off2, len1, len2, num;
 
-       if (DEBUG)
-               fprintf(stderr, "%.*s", (int) len, line);
-
        num = state->ret->num;
        if (len < 4 || line[0] != '@' || line[1] != '@') {
                if (state->hunk_in_pre_context && line[0] == ' ')
@@ -295,10 +296,6 @@ static struct patch *get_patch(struct origin *parent, struct origin *origin)
        char *blob_p, *blob_o;
        struct patch *patch;
 
-       if (DEBUG) fprintf(stderr, "get patch %.8s %.8s\n",
-                          sha1_to_hex(parent->commit->object.sha1),
-                          sha1_to_hex(origin->commit->object.sha1));
-
        blob_p = read_sha1_file(parent->blob_sha1, type,
                                (unsigned long *) &file_p.size);
        blob_o = read_sha1_file(origin->blob_sha1, type,
@@ -352,6 +349,7 @@ static void dup_entry(struct blame_entry *dst, struct blame_entry *src)
        memcpy(dst, src, sizeof(*src));
        dst->prev = p;
        dst->next = n;
+       dst->score = 0;
 }
 
 static const char *nth_line(struct scoreboard *sb, int lno)
@@ -448,7 +446,7 @@ static void split_blame(struct scoreboard *sb,
                add_blame_entry(sb, new_entry);
        }
 
-       if (DEBUG) {
+       if (1) { /* sanity */
                struct blame_entry *ent;
                int lno = 0, corrupt = 0;
 
@@ -530,12 +528,6 @@ static int pass_blame_to_parent(struct scoreboard *sb,
        for (i = 0; i < patch->num; i++) {
                struct chunk *chunk = &patch->chunks[i];
 
-               if (DEBUG)
-                       fprintf(stderr,
-                               "plno = %d, tlno = %d, "
-                               "same as parent up to %d, resync %d and %d\n",
-                               plno, tlno,
-                               chunk->same, chunk->p_next, chunk->t_next);
                blame_chunk(sb, tlno, plno, chunk->same, target, parent);
                plno = chunk->p_next;
                tlno = chunk->t_next;
@@ -547,14 +539,37 @@ static int pass_blame_to_parent(struct scoreboard *sb,
        return 0;
 }
 
-static void copy_split_if_better(struct blame_entry best_so_far[3],
+static unsigned ent_score(struct scoreboard *sb, struct blame_entry *e)
+{
+       unsigned score;
+       const char *cp, *ep;
+
+       if (e->score)
+               return e->score;
+
+       score = 0;
+       cp = nth_line(sb, e->lno);
+       ep = nth_line(sb, e->lno + e->num_lines);
+       while (cp < ep) {
+               unsigned ch = *((unsigned char *)cp);
+               if (isalnum(ch))
+                       score++;
+               cp++;
+       }
+       e->score = score;
+       return score;
+}
+
+static void copy_split_if_better(struct scoreboard *sb,
+                                struct blame_entry best_so_far[3],
                                 struct blame_entry this[3])
 {
        if (!this[1].suspect)
                return;
-       if (best_so_far[1].suspect &&
-           (this[1].num_lines < best_so_far[1].num_lines))
-               return;
+       if (best_so_far[1].suspect) {
+               if (ent_score(sb, &this[1]) < ent_score(sb, &best_so_far[1]))
+                       return;
+       }
        memcpy(best_so_far, this, sizeof(struct blame_entry [3]));
 }
 
@@ -596,7 +611,7 @@ static void find_copy_in_blob(struct scoreboard *sb,
                                      tlno + ent->s_lno, plno,
                                      chunk->same + ent->s_lno,
                                      parent);
-                       copy_split_if_better(split, this);
+                       copy_split_if_better(sb, split, this);
                }
                plno = chunk->p_next;
                tlno = chunk->t_next;
@@ -699,7 +714,7 @@ static int find_copy_in_parent(struct scoreboard *sb,
                                continue;
                        }
                        find_copy_in_blob(sb, ent, norigin, this, &file_p);
-                       copy_split_if_better(split, this);
+                       copy_split_if_better(sb, split, this);
                }
                if (split[1].suspect)
                        split_blame(sb, split, ent);
@@ -944,6 +959,7 @@ static void get_commit_info(struct commit *commit,
 #define OUTPUT_PORCELAIN       010
 #define OUTPUT_SHOW_NAME       020
 #define OUTPUT_SHOW_NUMBER     040
+#define OUTPUT_SHOW_SCORE      0100
 
 static void emit_porcelain(struct scoreboard *sb, struct blame_entry *ent)
 {
@@ -1016,6 +1032,8 @@ static void emit_other(struct scoreboard *sb, struct blame_entry *ent, int opt)
                                           show_raw_time),
                               ent->lno + 1 + cnt);
                else {
+                       if (opt & OUTPUT_SHOW_SCORE)
+                               printf(" %*d", max_score_digits, ent->score);
                        if (opt & OUTPUT_SHOW_NAME)
                                printf(" %-*.*s", longest_file, longest_file,
                                       suspect->path);
@@ -1060,8 +1078,9 @@ static void output(struct scoreboard *sb, int option)
        for (ent = sb->ent; ent; ent = ent->next) {
                if (option & OUTPUT_PORCELAIN)
                        emit_porcelain(sb, ent);
-               else
+               else {
                        emit_other(sb, ent, option);
+               }
        }
 }
 
@@ -1121,6 +1140,7 @@ static void find_alignment(struct scoreboard *sb, int *option)
 {
        int longest_src_lines = 0;
        int longest_dst_lines = 0;
+       unsigned largest_score = 0;
        struct blame_entry *e;
 
        for (e = sb->ent; e; e = e->next) {
@@ -1146,9 +1166,12 @@ static void find_alignment(struct scoreboard *sb, int *option)
                num = e->lno + e->num_lines;
                if (longest_dst_lines < num)
                        longest_dst_lines = num;
+               if (largest_score < ent_score(sb, e))
+                       largest_score = ent_score(sb, e);
        }
        max_orig_digits = lineno_width(longest_src_lines);
        max_digits = lineno_width(longest_dst_lines);
+       max_score_digits = lineno_width(largest_score);
 }
 
 static int has_path_in_work_tree(const char *path)
@@ -1209,6 +1232,8 @@ int cmd_pickaxe(int argc, const char **argv, const char *prefix)
                                tmp = top; top = bottom; bottom = tmp;
                        }
                }
+               else if (!strcmp("--score-debug", arg))
+                       output_option |= OUTPUT_SHOW_SCORE;
                else if (!strcmp("-f", arg) ||
                         !strcmp("--show-name", arg))
                        output_option |= OUTPUT_SHOW_NAME;