From 2c18d1395aa550d76604541cbbcde61ab29abd4c Mon Sep 17 00:00:00 2001 From: Michal Sojka Date: Thu, 30 Oct 2014 22:34:49 +0100 Subject: [PATCH] Re: [PATCH v4 5/6] cli: search: Add configurable way to filter out duplicate addresses --- ac/fd88e49a1ec216cff236c14c7ee84b6680403a | 228 ++++++++++++++++++++++ 1 file changed, 228 insertions(+) create mode 100644 ac/fd88e49a1ec216cff236c14c7ee84b6680403a diff --git a/ac/fd88e49a1ec216cff236c14c7ee84b6680403a b/ac/fd88e49a1ec216cff236c14c7ee84b6680403a new file mode 100644 index 000000000..15b960958 --- /dev/null +++ b/ac/fd88e49a1ec216cff236c14c7ee84b6680403a @@ -0,0 +1,228 @@ +Return-Path: +X-Original-To: notmuch@notmuchmail.org +Delivered-To: notmuch@notmuchmail.org +Received: from localhost (localhost [127.0.0.1]) + by olra.theworths.org (Postfix) with ESMTP id 22336431FCB + for ; Thu, 30 Oct 2014 14:35:08 -0700 (PDT) +X-Virus-Scanned: Debian amavisd-new at olra.theworths.org +X-Spam-Flag: NO +X-Spam-Score: -2.3 +X-Spam-Level: +X-Spam-Status: No, score=-2.3 tagged_above=-999 required=5 + tests=[RCVD_IN_DNSWL_MED=-2.3] autolearn=disabled +Received: from olra.theworths.org ([127.0.0.1]) + by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024) + with ESMTP id alERxAgzj4jm for ; + Thu, 30 Oct 2014 14:35:00 -0700 (PDT) +Received: from max.feld.cvut.cz (max.feld.cvut.cz [147.32.192.36]) + by olra.theworths.org (Postfix) with ESMTP id 275C0431FC2 + for ; Thu, 30 Oct 2014 14:35:00 -0700 (PDT) +Received: from localhost (unknown [192.168.200.7]) + by max.feld.cvut.cz (Postfix) with ESMTP id E0B475CCFD0; + Thu, 30 Oct 2014 22:34:58 +0100 (CET) +X-Virus-Scanned: IMAP STYX AMAVIS +Received: from max.feld.cvut.cz ([192.168.200.1]) + by localhost (styx.feld.cvut.cz [192.168.200.7]) (amavisd-new, + port 10044) + with ESMTP id yqLkr-oU8A41; Thu, 30 Oct 2014 22:34:55 +0100 (CET) +Received: from imap.feld.cvut.cz (imap.feld.cvut.cz [147.32.192.34]) + by max.feld.cvut.cz (Postfix) with ESMTP id 0F2B95CCFCB; + Thu, 30 Oct 
2014 22:34:55 +0100 (CET) +Received: from wsh by steelpick.2x.cz with local (Exim 4.84) + (envelope-from ) + id 1XjxMn-0005tO-5G; Thu, 30 Oct 2014 22:34:49 +0100 +From: Michal Sojka +To: Mark Walters , notmuch@notmuchmail.org +Subject: Re: [PATCH v4 5/6] cli: search: Add configurable way to filter + out duplicate addresses +In-Reply-To: <87egtqug4t.fsf@qmul.ac.uk> +References: <1414421455-3037-1-git-send-email-sojkam1@fel.cvut.cz> + <1414421455-3037-6-git-send-email-sojkam1@fel.cvut.cz> + <87egtqug4t.fsf@qmul.ac.uk> +User-Agent: Notmuch/0.18.2+157~ga00d359 (http://notmuchmail.org) Emacs/24.3.1 + (x86_64-pc-linux-gnu) +Date: Thu, 30 Oct 2014 22:34:49 +0100 +Message-ID: <874mulckcm.fsf@steelpick.2x.cz> +MIME-Version: 1.0 +Content-Type: text/plain +X-BeenThere: notmuch@notmuchmail.org +X-Mailman-Version: 2.1.13 +Precedence: list +List-Id: "Use and development of the notmuch mail system." + +List-Unsubscribe: , + +List-Archive: +List-Post: +List-Help: +List-Subscribe: , + +X-List-Received-Date: Thu, 30 Oct 2014 21:35:08 -0000 + +On Thu, Oct 30 2014, Mark Walters wrote: +> On Mon, 27 Oct 2014, Michal Sojka wrote: +>> This adds an algorithm to filter out duplicate addresses from address +>> outputs (sender, receivers). The algorithm can be configured with +>> --filter-by command line option. +>> +>> The code here is an extended version of a patch from Jani Nikula. +> +> Hi +> +> As this is getting into the more controversial bike shedding region I +> wonder if it would be worth splitting this into 2 patches: the first +> could do the default dedupe based on name/address and the second could +> do add the filter-by options. +> +> I think the default deduping is obviously worth doing but I am not sure +> about the rest. In any case I think the default deduping could go in +> pre-freeze but I would recommend the rest is left until after. + +Yes, this makes sense. I'll send v5 in a while. 
+ +> +>> --- +>> completion/notmuch-completion.bash | 6 ++- +>> completion/notmuch-completion.zsh | 3 +- +>> doc/man1/notmuch-search.rst | 38 +++++++++++++++ +>> notmuch-search.c | 98 +++++++++++++++++++++++++++++++++++--- +>> test/T090-search-output.sh | 87 +++++++++++++++++++++++++++++++++ +>> test/T095-search-filter-by.sh | 64 +++++++++++++++++++++++++ +>> 6 files changed, 288 insertions(+), 8 deletions(-) +>> create mode 100755 test/T095-search-filter-by.sh +>> +>> diff --git a/completion/notmuch-completion.bash b/completion/notmuch-completion.bash +>> index cfbd389..6b6d43a 100644 +>> --- a/completion/notmuch-completion.bash +>> +++ b/completion/notmuch-completion.bash +>> @@ -305,12 +305,16 @@ _notmuch_search() +>> COMPREPLY=( $( compgen -W "true false flag all" -- "${cur}" ) ) +>> return +>> ;; +>> + --filter-by) +>> + COMPREPLY=( $( compgen -W "nameaddr name addr addrfold nameaddrfold" -- "${cur}" ) ) +>> + return +>> + ;; +>> esac +>> +>> ! $split && +>> case "${cur}" in +>> -*) +>> - local options="--format= --output= --sort= --offset= --limit= --exclude= --duplicate=" +>> + local options="--format= --output= --sort= --offset= --limit= --exclude= --duplicate= --filter-by=" +>> compopt -o nospace +>> COMPREPLY=( $(compgen -W "$options" -- ${cur}) ) +>> ;; +>> diff --git a/completion/notmuch-completion.zsh b/completion/notmuch-completion.zsh +>> index 3e52a00..3e535df 100644 +>> --- a/completion/notmuch-completion.zsh +>> +++ b/completion/notmuch-completion.zsh +>> @@ -53,7 +53,8 @@ _notmuch_search() +>> '--max-threads=[display only the first x threads from the search results]:number of threads to show: ' \ +>> '--first=[omit the first x threads from the search results]:number of threads to omit: ' \ +>> '--sort=[sort results]:sorting:((newest-first\:"reverse chronological order" oldest-first\:"chronological order"))' \ +>> - '--output=[select what to output]:output:((summary threads messages files tags sender recipients))' +>> + '--output=[select what to 
output]:output:((summary threads messages files tags sender recipients))' \ +>> + '--filter-by=[filter out duplicate addresses]:filter-by:((nameaddr\:"both name and address part" name\:"name part" addr\:"address part" addrfold\:"case-insensitive address part" nameaddrfold\:"name and case-insensitive address part"))' +>> } +>> +>> _notmuch() +>> diff --git a/doc/man1/notmuch-search.rst b/doc/man1/notmuch-search.rst +>> index b6607c9..84af2da 100644 +>> --- a/doc/man1/notmuch-search.rst +>> +++ b/doc/man1/notmuch-search.rst +>> @@ -85,6 +85,9 @@ Supported options for **search** include +>> (--format=text0), as a JSON array (--format=json), or as +>> an S-Expression list (--format=sexp). +>> +>> + Duplicate addresses are filtered out. Filtering can be +>> + configured with the --filter-by option. +>> + +>> Note: Searching for **sender** should be much faster than +>> searching for **recipients**, because sender addresses are +>> cached directly in the database whereas other addresses +>> @@ -151,6 +154,41 @@ Supported options for **search** include +>> prefix. The prefix matches messages based on filenames. This +>> option filters filenames of the matching messages. +>> +>> + ``--filter-by=``\ (**nameaddr**\ \|\ **name** \|\ **addr**\ \|\ **addrfold**\ \|\ **nameaddrfold**\) +>> + +>> + Can be used with ``--output=sender`` or +>> + ``--output=recipients`` to filter out duplicate addresses. The +>> + filtering algorithm receives a sequence of email addresses and +>> + outputs the same sequence without the addresses that are +>> + considered a duplicate of a previously output address. What is +>> + considered a duplicate depends on how the two addresses are +>> + compared and this can be controlled with the following flags: +>> + +>> + **nameaddr** means that both name and address parts are +>> + compared in case-sensitive manner. Therefore, all same looking +>> + address strings are considered duplicate. This is the +>> + default. 
+>> + +>> + **name** means that only the name part is compared (in +>> + case-sensitive manner). For example, the addresses "John Doe +>> + " and "John Doe " will be +>> + considered duplicate. +>> + +>> + **addr** means that only the address part is compared (in +>> + case-sensitive manner). For example, the addresses "John Doe +>> + " and "Dr. John Doe " will +>> + be considered duplicate. +>> + +>> + **addrfold** is like **addr**, but comparison is done in +>> + case-insensitive manner. For example, the addresses "John Doe +>> + " and "Dr. John Doe " will +>> + be considered duplicate. +>> + +>> + **nameaddrfold** is like **nameaddr**, but address comparison +>> + is done in case-insensitive manner. For example, the +>> + addresses "John Doe " and "John Doe +>> + " will be considered duplicate. +>> + +>> EXIT STATUS +>> =========== +>> +>> diff --git a/notmuch-search.c b/notmuch-search.c +>> index ce3bfb2..47aa979 100644 +>> --- a/notmuch-search.c +>> +++ b/notmuch-search.c +>> @@ -34,6 +34,14 @@ typedef enum { +>> +>> #define OUTPUT_ADDRESS_FLAGS (OUTPUT_SENDER | OUTPUT_RECIPIENTS) +>> +>> +typedef enum { +>> + FILTER_BY_NAMEADDR = 0, +>> + FILTER_BY_NAME, +>> + FILTER_BY_ADDR, +>> + FILTER_BY_ADDRFOLD, +>> + FILTER_BY_NAMEADDRFOLD, +>> +} filter_by_t; +>> + +>> typedef struct { +>> sprinter_t *format; +>> notmuch_query_t *query; +>> @@ -42,6 +50,7 @@ typedef struct { +>> int offset; +>> int limit; +>> int dupe; +>> + filter_by_t filter_by; +>> } search_options_t; +>> +>> typedef struct { +>> @@ -229,6 +238,52 @@ do_search_threads (search_options_t *opt) +>> return 0; +>> } +>> +>> +/* Returns TRUE iff name and/or addr is considered duplicite. */ +> +> A triviality; duplicite should be duplicate +> +>> +static notmuch_bool_t +>> +check_duplicite (const search_options_t *opt, GHashTable *addrs, const char *name, const char *addr) +> +> I am not sure on style but maybe is_duplicate would be clearer? + +OK + +-Michal -- 2.26.2