Implement the patience diff algorithm
authorJohannes Schindelin <Johannes.Schindelin@gmx.de>
Wed, 7 Jan 2009 17:04:09 +0000 (18:04 +0100)
committerJunio C Hamano <gitster@pobox.com>
Wed, 7 Jan 2009 21:35:44 +0000 (13:35 -0800)
The patience diff algorithm produces slightly more intuitive output
than the classic Myers algorithm, as it does not try to minimize the
number of +/- lines first, but tries to preserve the lines that are
unique.

To this end, it first determines lines that are unique in both files,
then the maximal sequence which preserves the order (relative to both
files) is extracted.

Starting from this initial set of common lines, the rest of the lines
is handled recursively, with Myers' algorithm as a fallback when
the patience algorithm fails (due to no common unique lines).

This patch includes memory leak fixes by Pierre Habouzit.

Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
xdiff/xdiff.h
xdiff/xdiffi.c
xdiff/xdiffi.h
xdiff/xpatience.c [new file with mode: 0644]
xdiff/xprepare.c

index 84fff583e2a6e3411da4260b9af98043fade1e49..d238c8263eebbbcb83d0d9f551560d86ec4866bb 100644 (file)
@@ -32,6 +32,7 @@ extern "C" {
 #define XDF_IGNORE_WHITESPACE (1 << 2)
 #define XDF_IGNORE_WHITESPACE_CHANGE (1 << 3)
 #define XDF_IGNORE_WHITESPACE_AT_EOL (1 << 4)
+#define XDF_PATIENCE_DIFF (1 << 5)
 #define XDF_WHITESPACE_FLAGS (XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE | XDF_IGNORE_WHITESPACE_AT_EOL)
 
 #define XDL_PATCH_NORMAL '-'
index 9d0324a38c2a1974648e67161fa0ed1b0f811233..3e97462bdd2ed72b4ec60a1eb3869e516609867b 100644 (file)
@@ -329,6 +329,9 @@ int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
        xdalgoenv_t xenv;
        diffdata_t dd1, dd2;
 
+       if (xpp->flags & XDF_PATIENCE_DIFF)
+               return xdl_do_patience_diff(mf1, mf2, xpp, xe);
+
        if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {
 
                return -1;
index 3e099dc445d6130f6a0ce2c6270a3b06d6ee119f..ad033a8e6a79600b6a3ba0cc16244ede0e9437ea 100644 (file)
@@ -55,5 +55,7 @@ int xdl_build_script(xdfenv_t *xe, xdchange_t **xscr);
 void xdl_free_script(xdchange_t *xscr);
 int xdl_emit_diff(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb,
                  xdemitconf_t const *xecfg);
+int xdl_do_patience_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
+               xdfenv_t *env);
 
 #endif /* #if !defined(XDIFFI_H) */
diff --git a/xdiff/xpatience.c b/xdiff/xpatience.c
new file mode 100644 (file)
index 0000000..e42c16a
--- /dev/null
@@ -0,0 +1,381 @@
+/*
+ *  LibXDiff by Davide Libenzi ( File Differential Library )
+ *  Copyright (C) 2003-2009 Davide Libenzi, Johannes E. Schindelin
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Lesser General Public
+ *  License along with this library; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ *  Davide Libenzi <davidel@xmailserver.org>
+ *
+ */
+#include "xinclude.h"
+#include "xtypes.h"
+#include "xdiff.h"
+
+/*
+ * The basic idea of patience diff is to find lines that are unique in
+ * both files.  These are intuitively the ones that we want to see as
+ * common lines.
+ *
+ * The maximal ordered sequence of such line pairs (where ordered means
+ * that the order in the sequence agrees with the order of the lines in
+ * both files) naturally defines an initial set of common lines.
+ *
+ * Now, the algorithm tries to extend the set of common lines by growing
+ * the line ranges where the files have identical lines.
+ *
+ * Between those common lines, the patience diff algorithm is applied
+ * recursively, until no unique line pairs can be found; these line ranges
+ * are handled by the well-known Myers algorithm.
+ */
+
+#define NON_UNIQUE ULONG_MAX
+
+/*
+ * This is a hash mapping from line hash to line numbers in the first and
+ * second file.
+ */
+struct hashmap {
+       int nr, alloc;
+       struct entry {
+               unsigned long hash;
+               /*
+                * 0 = unused entry, 1 = first line, 2 = second, etc.
+                * line2 is NON_UNIQUE if the line is not unique
+                * in either the first or the second file.
+                */
+               unsigned long line1, line2;
+               /*
+                * "next" & "previous" are used for the longest common
+                * sequence;
+                * initially, "next" reflects only the order in file1.
+                */
+               struct entry *next, *previous;
+       } *entries, *first, *last;
+       /* were common records found? */
+       unsigned long has_matches;
+       mmfile_t *file1, *file2;
+       xdfenv_t *env;
+       xpparam_t const *xpp;
+};
+
+/* The argument "pass" is 1 for the first file, 2 for the second. */
+static void insert_record(int line, struct hashmap *map, int pass)
+{
+       xrecord_t **records = pass == 1 ?
+               map->env->xdf1.recs : map->env->xdf2.recs;
+       xrecord_t *record = records[line - 1], *other;
+       /*
+        * After xdl_prepare_env() (or more precisely, due to
+        * xdl_classify_record()), the "ha" member of the records (AKA lines)
+        * is _not_ the hash anymore, but a linearized version of it.  In
+        * other words, the "ha" member is guaranteed to start with 0 and
+        * the second record's ha can only be 0 or 1, etc.
+        *
+        * So we multiply ha by 2 in the hope that the hashing was
+        * "unique enough".
+        */
+       int index = (int)((record->ha << 1) % map->alloc);
+
+       while (map->entries[index].line1) {
+               other = map->env->xdf1.recs[map->entries[index].line1 - 1];
+               if (map->entries[index].hash != record->ha ||
+                               !xdl_recmatch(record->ptr, record->size,
+                                       other->ptr, other->size,
+                                       map->xpp->flags)) {
+                       if (++index >= map->alloc)
+                               index = 0;
+                       continue;
+               }
+               if (pass == 2)
+                       map->has_matches = 1;
+               if (pass == 1 || map->entries[index].line2)
+                       map->entries[index].line2 = NON_UNIQUE;
+               else
+                       map->entries[index].line2 = line;
+               return;
+       }
+       if (pass == 2)
+               return;
+       map->entries[index].line1 = line;
+       map->entries[index].hash = record->ha;
+       if (!map->first)
+               map->first = map->entries + index;
+       if (map->last) {
+               map->last->next = map->entries + index;
+               map->entries[index].previous = map->last;
+       }
+       map->last = map->entries + index;
+       map->nr++;
+}
+
+/*
+ * This function has to be called for each recursion into the inter-hunk
+ * parts, as previously non-unique lines can become unique when being
+ * restricted to a smaller part of the files.
+ *
+ * It is assumed that env has been prepared using xdl_prepare().
+ */
+static int fill_hashmap(mmfile_t *file1, mmfile_t *file2,
+               xpparam_t const *xpp, xdfenv_t *env,
+               struct hashmap *result,
+               int line1, int count1, int line2, int count2)
+{
+       result->file1 = file1;
+       result->file2 = file2;
+       result->xpp = xpp;
+       result->env = env;
+
+       /* We know exactly how large we want the hash map */
+       result->alloc = count1 * 2;
+       result->entries = (struct entry *)
+               xdl_malloc(result->alloc * sizeof(struct entry));
+       if (!result->entries)
+               return -1;
+       memset(result->entries, 0, result->alloc * sizeof(struct entry));
+
+       /* First, fill with entries from the first file */
+       while (count1--)
+               insert_record(line1++, result, 1);
+
+       /* Then search for matches in the second file */
+       while (count2--)
+               insert_record(line2++, result, 2);
+
+       return 0;
+}
+
+/*
+ * Find the longest sequence with a smaller last element (meaning a smaller
+ * line2, as we construct the sequence with entries ordered by line1).
+ */
+static int binary_search(struct entry **sequence, int longest,
+               struct entry *entry)
+{
+       int left = -1, right = longest;
+
+       while (left + 1 < right) {
+               int middle = (left + right) / 2;
+               /* by construction, no two entries can be equal */
+               if (sequence[middle]->line2 > entry->line2)
+                       right = middle;
+               else
+                       left = middle;
+       }
+       /* return the index in "sequence", _not_ the sequence length */
+       return left;
+}
+
+/*
+ * The idea is to start with the list of common unique lines sorted by
+ * the order in file1.  For each of these pairs, the longest (partial)
+ * sequence whose last element's line2 is smaller is determined.
+ *
+ * For efficiency, the sequences are kept in a list containing exactly one
+ * item per sequence length: the sequence with the smallest last
+ * element (in terms of line2).
+ */
+static struct entry *find_longest_common_sequence(struct hashmap *map)
+{
+       struct entry **sequence = xdl_malloc(map->nr * sizeof(struct entry *));
+       int longest = 0, i;
+       struct entry *entry;
+
+       for (entry = map->first; entry; entry = entry->next) {
+               if (!entry->line2 || entry->line2 == NON_UNIQUE)
+                       continue;
+               i = binary_search(sequence, longest, entry);
+               entry->previous = i < 0 ? NULL : sequence[i];
+               sequence[++i] = entry;
+               if (i == longest)
+                       longest++;
+       }
+
+       /* No common unique lines were found */
+       if (!longest) {
+               xdl_free(sequence);
+               return NULL;
+       }
+
+       /* Iterate starting at the last element, adjusting the "next" members */
+       entry = sequence[longest - 1];
+       entry->next = NULL;
+       while (entry->previous) {
+               entry->previous->next = entry;
+               entry = entry->previous;
+       }
+       xdl_free(sequence);
+       return entry;
+}
+
+static int match(struct hashmap *map, int line1, int line2)
+{
+       xrecord_t *record1 = map->env->xdf1.recs[line1 - 1];
+       xrecord_t *record2 = map->env->xdf2.recs[line2 - 1];
+       return xdl_recmatch(record1->ptr, record1->size,
+               record2->ptr, record2->size, map->xpp->flags);
+}
+
+static int patience_diff(mmfile_t *file1, mmfile_t *file2,
+               xpparam_t const *xpp, xdfenv_t *env,
+               int line1, int count1, int line2, int count2);
+
+static int walk_common_sequence(struct hashmap *map, struct entry *first,
+               int line1, int count1, int line2, int count2)
+{
+       int end1 = line1 + count1, end2 = line2 + count2;
+       int next1, next2;
+
+       for (;;) {
+               /* Try to grow the line ranges of common lines */
+               if (first) {
+                       next1 = first->line1;
+                       next2 = first->line2;
+                       while (next1 > line1 && next2 > line2 &&
+                                       match(map, next1 - 1, next2 - 1)) {
+                               next1--;
+                               next2--;
+                       }
+               } else {
+                       next1 = end1;
+                       next2 = end2;
+               }
+               while (line1 < next1 && line2 < next2 &&
+                               match(map, line1, line2)) {
+                       line1++;
+                       line2++;
+               }
+
+               /* Recurse */
+               if (next1 > line1 || next2 > line2) {
+                       struct hashmap submap;
+
+                       memset(&submap, 0, sizeof(submap));
+                       if (patience_diff(map->file1, map->file2,
+                                       map->xpp, map->env,
+                                       line1, next1 - line1,
+                                       line2, next2 - line2))
+                               return -1;
+               }
+
+               if (!first)
+                       return 0;
+
+               while (first->next &&
+                               first->next->line1 == first->line1 + 1 &&
+                               first->next->line2 == first->line2 + 1)
+                       first = first->next;
+
+               line1 = first->line1 + 1;
+               line2 = first->line2 + 1;
+
+               first = first->next;
+       }
+}
+
+static int fall_back_to_classic_diff(struct hashmap *map,
+               int line1, int count1, int line2, int count2)
+{
+       /*
+        * This probably does not work outside Git, since
+        * we have a very simple mmfile structure.
+        *
+        * Note: ideally, we would reuse the prepared environment, but
+        * the libxdiff interface does not (yet) allow for diffing only
+        * ranges of lines instead of the whole files.
+        */
+       mmfile_t subfile1, subfile2;
+       xpparam_t xpp;
+       xdfenv_t env;
+
+       subfile1.ptr = (char *)map->env->xdf1.recs[line1 - 1]->ptr;
+       subfile1.size = map->env->xdf1.recs[line1 + count1 - 2]->ptr +
+               map->env->xdf1.recs[line1 + count1 - 2]->size - subfile1.ptr;
+       subfile2.ptr = (char *)map->env->xdf2.recs[line2 - 1]->ptr;
+       subfile2.size = map->env->xdf2.recs[line2 + count2 - 2]->ptr +
+               map->env->xdf2.recs[line2 + count2 - 2]->size - subfile2.ptr;
+       xpp.flags = map->xpp->flags & ~XDF_PATIENCE_DIFF;
+       if (xdl_do_diff(&subfile1, &subfile2, &xpp, &env) < 0)
+               return -1;
+
+       memcpy(map->env->xdf1.rchg + line1 - 1, env.xdf1.rchg, count1);
+       memcpy(map->env->xdf2.rchg + line2 - 1, env.xdf2.rchg, count2);
+
+       xdl_free_env(&env);
+
+       return 0;
+}
+
+/*
+ * Recursively find the longest common sequence of unique lines,
+ * and if none was found, ask xdl_do_diff() to do the job.
+ *
+ * This function assumes that env was prepared with xdl_prepare_env().
+ */
+static int patience_diff(mmfile_t *file1, mmfile_t *file2,
+               xpparam_t const *xpp, xdfenv_t *env,
+               int line1, int count1, int line2, int count2)
+{
+       struct hashmap map;
+       struct entry *first;
+       int result = 0;
+
+       /* trivial case: one side is empty */
+       if (!count1) {
+               while(count2--)
+                       env->xdf2.rchg[line2++ - 1] = 1;
+               return 0;
+       } else if (!count2) {
+               while(count1--)
+                       env->xdf1.rchg[line1++ - 1] = 1;
+               return 0;
+       }
+
+       memset(&map, 0, sizeof(map));
+       if (fill_hashmap(file1, file2, xpp, env, &map,
+                       line1, count1, line2, count2))
+               return -1;
+
+       /* are there any matching lines at all? */
+       if (!map.has_matches) {
+               while(count1--)
+                       env->xdf1.rchg[line1++ - 1] = 1;
+               while(count2--)
+                       env->xdf2.rchg[line2++ - 1] = 1;
+               xdl_free(map.entries);
+               return 0;
+       }
+
+       first = find_longest_common_sequence(&map);
+       if (first)
+               result = walk_common_sequence(&map, first,
+                       line1, count1, line2, count2);
+       else
+               result = fall_back_to_classic_diff(&map,
+                       line1, count1, line2, count2);
+
+       xdl_free(map.entries);
+       return result;
+}
+
+int xdl_do_patience_diff(mmfile_t *file1, mmfile_t *file2,
+               xpparam_t const *xpp, xdfenv_t *env)
+{
+       if (xdl_prepare_env(file1, file2, xpp, env) < 0)
+               return -1;
+
+       /* environment is cleaned up in xdl_diff() */
+       return patience_diff(file1, file2, xpp, env,
+                       1, env->xdf1.nrec, 1, env->xdf2.nrec);
+}
index a43aa72cd088af07b13a6bc8de304ecc178047e5..16890852350cb62bb9f9aec5e52eea8ba46f1192 100644 (file)
@@ -290,7 +290,8 @@ int xdl_prepare_env(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
 
        xdl_free_classifier(&cf);
 
-       if (xdl_optimize_ctxs(&xe->xdf1, &xe->xdf2) < 0) {
+       if (!(xpp->flags & XDF_PATIENCE_DIFF) &&
+                       xdl_optimize_ctxs(&xe->xdf1, &xe->xdf2) < 0) {
 
                xdl_free_ctx(&xe->xdf2);
                xdl_free_ctx(&xe->xdf1);