binary diff: further updates.
author Junio C Hamano <junkio@cox.net>
Fri, 5 May 2006 09:41:53 +0000 (02:41 -0700)
committer Junio C Hamano <junkio@cox.net>
Fri, 5 May 2006 22:24:32 +0000 (15:24 -0700)
This updates the user interface and generated diff data format.

 * "diff --binary" is used to signal that we want an e-mailable
   binary patch.  It implies --full-index and -p.

 * "apply --allow-binary-replacement" acquired a short synonym
   "apply --binary".

 * After the "GIT binary patch\n" header line there is a token
   to record which binary patch mechanism was used, so that we
   can extend it later.  Currently there are two mechanisms
   defined: "literal" and "delta".  The former records the
   deflated postimage and the latter records the deflated delta
   from the preimage to postimage.

   Purely for implementation convenience, I added the length of
   the data before deflation after these "literal"/"delta" tokens
   (otherwise the decoding side needs to guess and reallocate the
   buffer while inflating).  Improvement patches are very welcome.

Signed-off-by: Junio C Hamano <junkio@cox.net>
apply.c
base85.c [new file with mode: 0644]
cache.h
diff.c
diff.h

diff --git a/apply.c b/apply.c
index e37c4ebf524672fc738a74537b7e9fef6fa508b1..1b93aab8af1a5498e69fe3cffe7cdfb527fa4317 100644 (file)
--- a/apply.c
+++ b/apply.c
@@ -114,6 +114,9 @@ struct patch {
        char *new_name, *old_name, *def_name;
        unsigned int old_mode, new_mode;
        int is_rename, is_copy, is_new, is_delete, is_binary;
+#define BINARY_DELTA_DEFLATED 1
+#define BINARY_LITERAL_DEFLATED 2
+       unsigned long deflate_origlen;
        int lines_added, lines_deleted;
        int score;
        struct fragment *fragments;
@@ -969,9 +972,11 @@ static inline int metadata_changes(struct patch *patch)
 
 static int parse_binary(char *buffer, unsigned long size, struct patch *patch)
 {
-       /* We have read "GIT binary patch\n"; what follows is a
-        * sequence of 'length-byte' followed by base-85 encoded
-        * delta data.
+       /* We have read "GIT binary patch\n"; what follows is a line
+        * that says the patch method (currently, either "deflated
+        * literal" or "deflated delta") and the length of data before
+        * deflating; a sequence of 'length-byte' followed by base-85
+        * encoded data follows.
         *
         * Each 5-byte sequence of base-85 encodes up to 4 bytes,
         * and we would limit the patch line to 66 characters,
@@ -982,11 +987,27 @@ static int parse_binary(char *buffer, unsigned long size, struct patch *patch)
         */
        int llen, used;
        struct fragment *fragment;
-       char *delta = NULL;
+       char *data = NULL;
 
-       patch->is_binary = 1;
        patch->fragments = fragment = xcalloc(1, sizeof(*fragment));
-       used = 0;
+
+       /* Grab the type of patch */
+       llen = linelen(buffer, size);
+       used = llen;
+       linenr++;
+
+       if (!strncmp(buffer, "delta ", 6)) {
+               patch->is_binary = BINARY_DELTA_DEFLATED;
+               patch->deflate_origlen = strtoul(buffer + 6, NULL, 10);
+       }
+       else if (!strncmp(buffer, "literal ", 8)) {
+               patch->is_binary = BINARY_LITERAL_DEFLATED;
+               patch->deflate_origlen = strtoul(buffer + 8, NULL, 10);
+       }
+       else
+               return error("unrecognized binary patch at line %d: %.*s",
+                            linenr-1, llen-1, buffer);
+       buffer += llen;
        while (1) {
                int byte_length, max_byte_length, newsize;
                llen = linelen(buffer, size);
@@ -1015,8 +1036,8 @@ static int parse_binary(char *buffer, unsigned long size, struct patch *patch)
                    byte_length <= max_byte_length - 4)
                        goto corrupt;
                newsize = fragment->size + byte_length;
-               delta = xrealloc(delta, newsize);
-               if (decode_85(delta + fragment->size,
+               data = xrealloc(data, newsize);
+               if (decode_85(data + fragment->size,
                              buffer + 1,
                              byte_length))
                        goto corrupt;
@@ -1024,7 +1045,7 @@ static int parse_binary(char *buffer, unsigned long size, struct patch *patch)
                buffer += llen;
                size -= llen;
        }
-       fragment->patch = delta;
+       fragment->patch = data;
        return used;
  corrupt:
        return error("corrupt binary patch at line %d: %.*s",
@@ -1425,6 +1446,61 @@ static int apply_one_fragment(struct buffer_desc *desc, struct fragment *frag)
        return offset;
 }
 
+static char *inflate_it(const void *data, unsigned long size,
+                       unsigned long inflated_size)
+{
+       z_stream stream;
+       void *out;
+       int st;
+
+       memset(&stream, 0, sizeof(stream));
+
+       stream.next_in = (unsigned char *)data;
+       stream.avail_in = size;
+       stream.next_out = out = xmalloc(inflated_size);
+       stream.avail_out = inflated_size;
+       inflateInit(&stream);
+       st = inflate(&stream, Z_FINISH);
+       if ((st != Z_STREAM_END) || stream.total_out != inflated_size) {
+               free(out);
+               return NULL;
+       }
+       return out;
+}
+
+static int apply_binary_fragment(struct buffer_desc *desc, struct patch *patch)
+{
+       unsigned long dst_size;
+       struct fragment *fragment = patch->fragments;
+       void *data;
+       void *result;
+
+       data = inflate_it(fragment->patch, fragment->size,
+                         patch->deflate_origlen);
+       if (!data)
+               return error("corrupt patch data");
+       switch (patch->is_binary) {
+       case BINARY_DELTA_DEFLATED:
+               result = patch_delta(desc->buffer, desc->size,
+                                    data,
+                                    patch->deflate_origlen,
+                                    &dst_size);
+               free(desc->buffer);
+               desc->buffer = result;
+               free(data);
+               break;
+       case BINARY_LITERAL_DEFLATED:
+               free(desc->buffer);
+               desc->buffer = data;
+               dst_size = patch->deflate_origlen;
+               break;
+       }
+       if (!desc->buffer)
+               return -1;
+       desc->size = desc->alloc = dst_size;
+       return 0;
+}
+
 static int apply_binary(struct buffer_desc *desc, struct patch *patch)
 {
        const char *name = patch->old_name ? patch->old_name : patch->new_name;
@@ -1466,18 +1542,20 @@ static int apply_binary(struct buffer_desc *desc, struct patch *patch)
                                     "'%s' but it is not empty", name);
        }
 
-       if (desc->buffer) {
+       get_sha1_hex(patch->new_sha1_prefix, sha1);
+       if (!memcmp(sha1, null_sha1, 20)) {
                free(desc->buffer);
                desc->alloc = desc->size = 0;
-       }
-       get_sha1_hex(patch->new_sha1_prefix, sha1);
-       if (!memcmp(sha1, null_sha1, 20))
+               desc->buffer = NULL;
                return 0; /* deletion patch */
+       }
 
        if (has_sha1_file(sha1)) {
+               /* We already have the postimage */
                char type[10];
                unsigned long size;
 
+               free(desc->buffer);
                desc->buffer = read_sha1_file(sha1, type, &size);
                if (!desc->buffer)
                        return error("the necessary postimage %s for "
@@ -1486,28 +1564,13 @@ static int apply_binary(struct buffer_desc *desc, struct patch *patch)
                desc->alloc = desc->size = size;
        }
        else {
-               char type[10];
-               unsigned long src_size, dst_size;
-               void *src;
-
-               get_sha1_hex(patch->old_sha1_prefix, sha1);
-               src = read_sha1_file(sha1, type, &src_size);
-               if (!src)
-                       return error("the necessary preimage %s for "
-                                    "'%s' cannot be read",
-                                    patch->old_sha1_prefix, name);
-
-               /* patch->fragment->patch has the delta data and
-                * we should apply it to the preimage.
+               /* We have verified desc matches the preimage;
+                * apply the patch data to it, which is stored
+                * in the patch->fragments->{patch,size}.
                 */
-               desc->buffer = patch_delta(src, src_size,
-                                          (void*) patch->fragments->patch,
-                                          patch->fragments->size,
-                                          &dst_size);
-               if (!desc->buffer)
+               if (apply_binary_fragment(desc, patch))
                        return error("binary patch does not apply to '%s'",
                                     name);
-               desc->size = desc->alloc = dst_size;
 
                /* verify that the result matches */
                write_sha1_file_prepare(desc->buffer, desc->size, blob_type,
@@ -2102,7 +2165,8 @@ int main(int argc, char **argv)
                        diffstat = 1;
                        continue;
                }
-               if (!strcmp(arg, "--allow-binary-replacement")) {
+               if (!strcmp(arg, "--allow-binary-replacement") ||
+                   !strcmp(arg, "--binary")) {
                        allow_binary_replacement = 1;
                        continue;
                }
diff --git a/base85.c b/base85.c
new file mode 100644 (file)
index 0000000..b97f7f9
--- /dev/null
+++ b/base85.c
@@ -0,0 +1,134 @@
+#include "cache.h"
+
+#undef DEBUG_85
+
+#ifdef DEBUG_85
+#define say(a) fprintf(stderr, a)
+#define say1(a,b) fprintf(stderr, a, b)
+#define say2(a,b,c) fprintf(stderr, a, b, c)
+#else
+#define say(a) do {} while(0)
+#define say1(a,b) do {} while(0)
+#define say2(a,b,c) do {} while(0)
+#endif
+
+static const char en85[] = {
+       '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
+       'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
+       'U', 'V', 'W', 'X', 'Y', 'Z',
+       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j',
+       'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
+       'u', 'v', 'w', 'x', 'y', 'z',
+       '!', '#', '$', '%', '&', '(', ')', '*', '+', '-',
+       ';', '<', '=', '>', '?', '@', '^', '_', '`', '{',
+       '|', '}', '~'
+};
+
+static char de85[256];
+static void prep_base85(void)
+{
+       int i;
+       if (de85['Z'])
+               return;
+       for (i = 0; i < ARRAY_SIZE(en85); i++) {
+               int ch = en85[i];
+               de85[ch] = i + 1;
+       }
+}
+
+int decode_85(char *dst, char *buffer, int len)
+{
+       prep_base85();
+
+       say2("decode 85 <%.*s>", len/4*5, buffer);
+       while (len) {
+               unsigned acc = 0;
+               int cnt;
+               for (cnt = 0; cnt < 5; cnt++, buffer++) {
+                       int ch = *((unsigned char *)buffer);
+                       int de = de85[ch];
+                       if (!de)
+                               return error("invalid base85 alphabet %c", ch);
+                       de--;
+                       if (cnt == 4) {
+                               /*
+                                * Detect overflow.  The largest
+                                * 5-letter possible is "|NsC0" to
+                                * encode 0xffffffff, and "|NsC" gives
+                                * 0x03030303 at this point (i.e.
+                                * 0xffffffff = 0x03030303 * 85).
+                                */
+                               if (0x03030303 < acc ||
+                                   (0x03030303 == acc && de))
+                                       error("invalid base85 sequence %.5s",
+                                             buffer-3);
+                       }
+                       acc = acc * 85 + de;
+                       say1(" <%08x>", acc);
+               }
+               say1(" %08x", acc);
+               for (cnt = 0; cnt < 4 && len; cnt++, len--) {
+                       *dst++ = (acc >> 24) & 0xff;
+                       acc = acc << 8;
+               }
+       }
+       say("\n");
+
+       return 0;
+}
+
+void encode_85(char *buf, unsigned char *data, int bytes)
+{
+       prep_base85();
+
+       say("encode 85");
+       while (bytes) {
+               unsigned acc = 0;
+               int cnt;
+               for (cnt = 0; cnt < 4 && bytes; cnt++, bytes--) {
+                       int ch = *data++;
+                       acc |= ch << ((3-cnt)*8);
+               }
+               say1(" %08x", acc);
+               for (cnt = 0; cnt < 5; cnt++) {
+                       int val = acc % 85;
+                       acc /= 85;
+                       buf[4-cnt] = en85[val];
+               }
+               buf += 5;
+       }
+       say("\n");
+
+       *buf = 0;
+}
+
+#ifdef DEBUG_85
+int main(int ac, char **av)
+{
+       char buf[1024];
+
+       if (!strcmp(av[1], "-e")) {
+               int len = strlen(av[2]);
+               encode_85(buf, av[2], len);
+               if (len <= 26) len = len + 'A' - 1;
+               else len = len + 'a' - 26 + 1;
+               printf("encoded: %c%s\n", len, buf);
+               return 0;
+       }
+       if (!strcmp(av[1], "-d")) {
+               int len = *av[2];
+               if ('A' <= len && len <= 'Z') len = len - 'A' + 1;
+               else len = len - 'a' + 26 + 1;
+               decode_85(buf, av[2]+1, len);
+               printf("decoded: %.*s\n", len, buf);
+               return 0;
+       }
+       if (!strcmp(av[1], "-t")) {
+               char t[4] = { -1,-1,-1,-1 };
+               encode_85(buf, t, 4);
+               printf("encoded: D%s\n", buf);
+               return 0;
+       }
+}
+#endif
diff --git a/cache.h b/cache.h
index 2f32f3d62a16a4aa823d88b20a43907267ffab44..4b7a4392531dd4064694cd346fef902a2b1930e9 100644 (file)
--- a/cache.h
+++ b/cache.h
@@ -365,5 +365,6 @@ extern void setup_pager(void);
 
 /* base85 */
 int decode_85(char *dst, char *line, int linelen);
+void encode_85(char *buf, unsigned char *data, int bytes);
 
 #endif /* CACHE_H */
diff --git a/diff.c b/diff.c
index b14d897f196ddab42d56285ffe14f7f616b01f71..bfe54c3e093439d8e9df55fa319715ca20090f99 100644 (file)
--- a/diff.c
+++ b/diff.c
@@ -392,78 +392,78 @@ static void show_stats(struct diffstat_t* data)
                        total_files, adds, dels);
 }
 
-static void *encode_delta_size(void *data, unsigned long size)
+static unsigned char *deflate_it(char *data,
+                                unsigned long size,
+                                unsigned long *result_size)
 {
-       unsigned char *cp = data;
-       *cp++ = size;
-       size >>= 7;
-       while (size) {
-               cp[-1] |= 0x80;
-               *cp++ = size;
-               size >>= 7;
-       }
-       return cp;
+       int bound;
+       unsigned char *deflated;
+       z_stream stream;
+
+       memset(&stream, 0, sizeof(stream));
+       deflateInit(&stream, Z_BEST_COMPRESSION);
+       bound = deflateBound(&stream, size);
+       deflated = xmalloc(bound);
+       stream.next_out = deflated;
+       stream.avail_out = bound;
+
+       stream.next_in = (unsigned char *)data;
+       stream.avail_in = size;
+       while (deflate(&stream, Z_FINISH) == Z_OK)
+               ; /* nothing */
+       deflateEnd(&stream);
+       *result_size = stream.total_out;
+       return deflated;
 }
 
-static void *safe_diff_delta(const unsigned char *src, unsigned long src_size,
-                            const unsigned char *dst, unsigned long dst_size,
-                            unsigned long *delta_size)
+static void emit_binary_diff(mmfile_t *one, mmfile_t *two)
 {
-       unsigned long bufsize;
-       unsigned char *data;
-       unsigned char *cp;
-
-       if (src_size && dst_size)
-               return diff_delta(src, src_size, dst, dst_size, delta_size, 0);
+       void *cp;
+       void *delta;
+       void *deflated;
+       void *data;
+       unsigned long orig_size;
+       unsigned long delta_size;
+       unsigned long deflate_size;
+       unsigned long data_size;
 
-       /* diff-delta does not like to do delta with empty, so
-        * we do that by hand here.  Sigh...
+       printf("GIT binary patch\n");
+       /* We could do deflated delta, or we could do just deflated two,
+        * whichever is smaller.
         */
-
-       if (!src_size)
-               /* literal copy can be done only 127-byte at a time.
-                */
-               bufsize = dst_size + (dst_size / 127) + 40;
-       else
-               bufsize = 40;
-       data = xmalloc(bufsize);
-       cp = encode_delta_size(data, src_size);
-       cp = encode_delta_size(cp, dst_size);
-
-       if (dst_size) {
-               /* copy out literally */
-               while (dst_size) {
-                       int sz = (127 < dst_size) ? 127 : dst_size;
-                       *cp++ = sz;
-                       dst_size -= sz;
-                       while (sz) {
-                               *cp++ = *dst++;
-                               sz--;
-                       }
+       delta = NULL;
+       deflated = deflate_it(two->ptr, two->size, &deflate_size);
+       if (one->size && two->size) {
+               delta = diff_delta(one->ptr, one->size,
+                                  two->ptr, two->size,
+                                  &delta_size, deflate_size);
+               if (delta) {
+                       void *to_free = delta;
+                       orig_size = delta_size;
+                       delta = deflate_it(delta, delta_size, &delta_size);
+                       free(to_free);
                }
        }
-       *delta_size = (cp - data);
-       return data;
-}
 
-static void emit_binary_diff(mmfile_t *one, mmfile_t *two)
-{
-       void *delta, *cp;
-       unsigned long delta_size;
+       if (delta && delta_size < deflate_size) {
+               printf("delta %lu\n", orig_size);
+               free(deflated);
+               data = delta;
+               data_size = delta_size;
+       }
+       else {
+               printf("literal %lu\n", two->size);
+               free(delta);
+               data = deflated;
+               data_size = deflate_size;
+       }
 
-       printf("GIT binary patch\n");
-       delta = safe_diff_delta(one->ptr, one->size,
-                               two->ptr, two->size,
-                               &delta_size);
-       if (!delta)
-               die("unable to generate binary diff");
-
-       /* emit delta encoded in base85 */
-       cp = delta;
-       while (delta_size) {
-               int bytes = (52 < delta_size) ? 52 : delta_size;
+       /* emit data encoded in base85 */
+       cp = data;
+       while (data_size) {
+               int bytes = (52 < data_size) ? 52 : data_size;
                char line[70];
-               delta_size -= bytes;
+               data_size -= bytes;
                if (bytes <= 26)
                        line[0] = bytes + 'A' - 1;
                else
@@ -473,7 +473,7 @@ static void emit_binary_diff(mmfile_t *one, mmfile_t *two)
                puts(line);
        }
        printf("\n");
-       free(delta);
+       free(data);
 }
 
 #define FIRST_FEW_BYTES 8000
@@ -538,7 +538,11 @@ static void builtin_diff(const char *name_a,
                die("unable to read files to diff");
 
        if (mmfile_is_binary(&mf1) || mmfile_is_binary(&mf2)) {
-               if (o->full_index)
+               /* Quite common confusing case */
+               if (mf1.size == mf2.size &&
+                   !memcmp(mf1.ptr, mf2.ptr, mf1.size))
+                       goto free_ab_and_return;
+               if (o->binary)
                        emit_binary_diff(&mf1, &mf2);
                else
                        printf("Binary files %s and %s differ\n",
@@ -1239,6 +1243,10 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac)
                options->rename_limit = strtoul(arg+2, NULL, 10);
        else if (!strcmp(arg, "--full-index"))
                options->full_index = 1;
+       else if (!strcmp(arg, "--binary")) {
+               options->output_format = DIFF_FORMAT_PATCH;
+               options->full_index = options->binary = 1;
+       }
        else if (!strcmp(arg, "--name-only"))
                options->output_format = DIFF_FORMAT_NAME;
        else if (!strcmp(arg, "--name-status"))
diff --git a/diff.h b/diff.h
index b3b2c4dd28c4f1a8f14948e5d1262dd71f1e96cf..d052608404314880652561e4296e05e9da3f96ca 100644 (file)
--- a/diff.h
+++ b/diff.h
@@ -28,6 +28,7 @@ struct diff_options {
                 with_raw:1,
                 with_stat:1,
                 tree_in_recursive:1,
+                binary:1,
                 full_index:1,
                 silent_on_remove:1,
                 find_copies_harder:1;