From: Jani Nikula Date: Thu, 3 Sep 2015 19:40:02 +0000 (+0300) Subject: [PATCH v2 6/9] cli: change the data structure for notmuch address deduplication X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=812500a7b656fe5dbb9c1ce24a6dd5fa7bb9d73e;p=notmuch-archives.git [PATCH v2 6/9] cli: change the data structure for notmuch address deduplication --- diff --git a/aa/29f233b2a9e93070c08f44842adfa4cde55464 b/aa/29f233b2a9e93070c08f44842adfa4cde55464 new file mode 100644 index 000000000..d269a7c01 --- /dev/null +++ b/aa/29f233b2a9e93070c08f44842adfa4cde55464 @@ -0,0 +1,232 @@ +Return-Path: +X-Original-To: notmuch@notmuchmail.org +Delivered-To: notmuch@notmuchmail.org +Received: from localhost (localhost [127.0.0.1]) + by arlo.cworth.org (Postfix) with ESMTP id 4571A6DE15D4 + for ; Thu, 3 Sep 2015 12:40:36 -0700 (PDT) +X-Virus-Scanned: Debian amavisd-new at cworth.org +X-Amavis-Alert: BAD HEADER SECTION, Duplicate header field: "References" +X-Spam-Flag: NO +X-Spam-Score: -0.425 +X-Spam-Level: +X-Spam-Status: No, score=-0.425 tagged_above=-999 required=5 tests=[AWL=0.295, + RCVD_IN_DNSWL_LOW=-0.7, RCVD_IN_MSPIKE_H3=-0.01, RCVD_IN_MSPIKE_WL=-0.01] + autolearn=disabled +Received: from arlo.cworth.org ([127.0.0.1]) + by localhost (arlo.cworth.org [127.0.0.1]) (amavisd-new, port 10024) + with ESMTP id 4NCdwu0Yw4pW for ; + Thu, 3 Sep 2015 12:40:34 -0700 (PDT) +Received: from mail-wi0-f171.google.com (mail-wi0-f171.google.com + [209.85.212.171]) + by arlo.cworth.org (Postfix) with ESMTPS id DA4D76DE15E7 + for ; Thu, 3 Sep 2015 12:40:26 -0700 (PDT) +Received: by wicge5 with SMTP id ge5so84705833wic.0 + for ; Thu, 03 Sep 2015 12:40:25 -0700 (PDT) +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20130820; + h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to + :references:in-reply-to:references; + bh=NlXTuY6QGPqQJwbKZYJuxHESc8MPqbilrd/29Rp56KY=; + b=ecazczo3DDgja9izKe6j5vSEjzimdT4lIChtohGtJzltidQ/N7gVdhdpsctKCvfn0L + 
o8BjfRVm0lpEWr9UaBEAWvDLNIeThvxKQpedMj2kexADMKuz14RVKgA4eGGfrFIrp5R4 + XkWTi2m6QMtb70WM61MxwpcrcAlCtBgU9CLg3C8t/OFEBMh3Gto6krijoMHafKTTZnNw + dBQxQ1NqKurOCju0PPJTQq0+gv/iHbraoYAtGYttxqKy9wn3sjOsgIHVnxbT6E78Oxqw + vXpYphUWecIZHWj3+sFoFfD51HIhMRk+AuX9Rz//qvnp9SsYxKG0wKQyThSiKXy9xVA2 + dkhw== +X-Gm-Message-State: + ALoCoQmLKjgeYYFJb4z06SoBNJroKC0WFBfEf1bmpF0SZFar09Xx49ipH/WxwFH5thPRF0fQ2Ort +X-Received: by 10.194.82.167 with SMTP id j7mr54475903wjy.123.1441309225376; + Thu, 03 Sep 2015 12:40:25 -0700 (PDT) +Received: from localhost (mobile-access-bcee4f-131.dhcp.inet.fi. + [188.238.79.131]) + by smtp.gmail.com with ESMTPSA id i7sm625501wib.15.2015.09.03.12.40.24 + (version=TLSv1/SSLv3 cipher=OTHER); + Thu, 03 Sep 2015 12:40:24 -0700 (PDT) +From: Jani Nikula +To: notmuch@notmuchmail.org +Subject: [PATCH v2 6/9] cli: change the data structure for notmuch address + deduplication +Date: Thu, 3 Sep 2015 22:40:02 +0300 +Message-Id: + +X-Mailer: git-send-email 2.1.4 +In-Reply-To: +References: +In-Reply-To: +References: +X-BeenThere: notmuch@notmuchmail.org +X-Mailman-Version: 2.1.18 +Precedence: list +List-Id: "Use and development of the notmuch mail system." + +List-Unsubscribe: , + +List-Archive: +List-Post: +List-Help: +List-Subscribe: , + +X-List-Received-Date: Thu, 03 Sep 2015 19:40:36 -0000 + +Currently we key the address hash table with the case sensitive "name +
". Switch to case insensitive keying with just address, and +store the case sensitive name and address in linked lists. This will +be helpful in adding support for different deduplication schemes in +the future. + +There will be a slight performance penalty for the current full case +sensitive name + address deduplication, but this is simpler as a whole +when other deduplication schemes are added, and I expect the schemes +to be added to become more popular than the current default. + +Apart from the possible performance penalty, the only user visible +change should be the change in the output ordering for +--output=count. The order is not guaranteed (and is based on hash +table traversal) currently anyway, so this should be of no +consequence. +--- + notmuch-client.h | 1 + + notmuch-search.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++---------- + 2 files changed, 70 insertions(+), 15 deletions(-) + +diff --git a/notmuch-client.h b/notmuch-client.h +index 882aa30563df..97d68d1158ac 100644 +--- a/notmuch-client.h ++++ b/notmuch-client.h +@@ -48,6 +48,7 @@ typedef GMimeCryptoContext notmuch_crypto_context_t; + #include + #include + #include ++#include + + #include "talloc-extra.h" + +diff --git a/notmuch-search.c b/notmuch-search.c +index 66404b561679..7c51d5df6bd4 100644 +--- a/notmuch-search.c ++++ b/notmuch-search.c +@@ -265,30 +265,74 @@ static mailbox_t *new_mailbox (void *ctx, const char *name, const char *addr) + return mailbox; + } + ++static int mailbox_compare (const void *v1, const void *v2) ++{ ++ const mailbox_t *m1 = v1, *m2 = v2; ++ int v; ++ ++ if (m1->name && m2->name) ++ v = strcmp (m1->name, m2->name); ++ else ++ v = !!m1->name - !!m2->name; ++ ++ if (! v) ++ v = strcmp (m1->addr, m2->addr); ++ ++ return v; ++} ++ + /* Returns TRUE iff name and addr is duplicate. If not, stores the + * name/addr pair in order to detect subsequent duplicates. 
*/ + static notmuch_bool_t + is_duplicate (const search_context_t *ctx, const char *name, const char *addr) + { + char *key; ++ GList *list, *l; + mailbox_t *mailbox; + +- key = talloc_asprintf (ctx->format, "%s <%s>", name, addr); +- if (! key) +- return FALSE; ++ list = g_hash_table_lookup (ctx->addresses, addr); ++ if (list) { ++ mailbox_t find = { ++ .name = name, ++ .addr = addr, ++ }; ++ ++ l = g_list_find_custom (list, &find, mailbox_compare); ++ if (l) { ++ mailbox = l->data; ++ mailbox->count++; ++ return TRUE; ++ } + +- mailbox = g_hash_table_lookup (ctx->addresses, key); +- if (mailbox) { +- mailbox->count++; +- talloc_free (key); +- return TRUE; ++ mailbox = new_mailbox (ctx->format, name, addr); ++ if (! mailbox) ++ return FALSE; ++ ++ /* ++ * XXX: It would be more efficient to prepend to the list, but ++ * then we'd have to store the changed list head back to the ++ * hash table. This check is here just to avoid the compiler ++ * warning for unused result. ++ */ ++ if (list != g_list_append (list, mailbox)) ++ INTERNAL_ERROR ("appending to list changed list head\n"); ++ ++ return FALSE; + } + ++ key = talloc_strdup (ctx->format, addr); ++ if (! key) ++ return FALSE; ++ + mailbox = new_mailbox (ctx->format, name, addr); + if (! mailbox) + return FALSE; + +- g_hash_table_insert (ctx->addresses, key, mailbox); ++ list = g_list_append (NULL, mailbox); ++ if (! 
list) ++ return FALSE; ++ ++ g_hash_table_insert (ctx->addresses, key, list); + + return FALSE; + } +@@ -401,12 +445,21 @@ _talloc_free_for_g_hash (void *ptr) + } + + static void +-print_hash_value (unused (gpointer key), gpointer value, gpointer user_data) ++_list_free_for_g_hash (void *ptr) ++{ ++ g_list_free_full (ptr, _talloc_free_for_g_hash); ++} ++ ++static void ++print_list_value (void *mailbox, void *context) + { +- const mailbox_t *mailbox = value; +- search_context_t *ctx = user_data; ++ print_mailbox (context, mailbox); ++} + +- print_mailbox (ctx, mailbox); ++static void ++print_hash_value (unused (void *key), void *list, void *context) ++{ ++ g_list_foreach (list, print_list_value, context); + } + + static int +@@ -792,8 +845,9 @@ notmuch_address_command (notmuch_config_t *config, int argc, char *argv[]) + argc - opt_index, argv + opt_index)) + return EXIT_FAILURE; + +- ctx->addresses = g_hash_table_new_full (g_str_hash, g_str_equal, +- _talloc_free_for_g_hash, _talloc_free_for_g_hash); ++ ctx->addresses = g_hash_table_new_full (strcase_hash, strcase_equal, ++ _talloc_free_for_g_hash, ++ _list_free_for_g_hash); + + ret = do_search_messages (ctx); + +-- +2.1.4 +