ac/fd88e49a1ec216cff236c14c7ee84b6680403a

   1 Return-Path: <sojkam1@fel.cvut.cz>\r
   2 X-Original-To: notmuch@notmuchmail.org\r
   3 Delivered-To: notmuch@notmuchmail.org\r
   4 Received: from localhost (localhost [127.0.0.1])\r
   5         by olra.theworths.org (Postfix) with ESMTP id 22336431FCB\r
   6         for <notmuch@notmuchmail.org>; Thu, 30 Oct 2014 14:35:08 -0700 (PDT)\r
   7 X-Virus-Scanned: Debian amavisd-new at olra.theworths.org\r
   8 X-Spam-Flag: NO\r
   9 X-Spam-Score: -2.3\r
  10 X-Spam-Level: \r
  11 X-Spam-Status: No, score=-2.3 tagged_above=-999 required=5\r
  12         tests=[RCVD_IN_DNSWL_MED=-2.3] autolearn=disabled\r
  13 Received: from olra.theworths.org ([127.0.0.1])\r
  14         by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024)\r
  15         with ESMTP id alERxAgzj4jm for <notmuch@notmuchmail.org>;\r
  16         Thu, 30 Oct 2014 14:35:00 -0700 (PDT)\r
  17 Received: from max.feld.cvut.cz (max.feld.cvut.cz [147.32.192.36])\r
  18         by olra.theworths.org (Postfix) with ESMTP id 275C0431FC2\r
  19         for <notmuch@notmuchmail.org>; Thu, 30 Oct 2014 14:35:00 -0700 (PDT)\r
  20 Received: from localhost (unknown [192.168.200.7])\r
  21         by max.feld.cvut.cz (Postfix) with ESMTP id E0B475CCFD0;\r
  22         Thu, 30 Oct 2014 22:34:58 +0100 (CET)\r
  23 X-Virus-Scanned: IMAP STYX AMAVIS\r
  24 Received: from max.feld.cvut.cz ([192.168.200.1])\r
  25         by localhost (styx.feld.cvut.cz [192.168.200.7]) (amavisd-new,\r
  26         port 10044)\r
  27         with ESMTP id yqLkr-oU8A41; Thu, 30 Oct 2014 22:34:55 +0100 (CET)\r
  28 Received: from imap.feld.cvut.cz (imap.feld.cvut.cz [147.32.192.34])\r
  29         by max.feld.cvut.cz (Postfix) with ESMTP id 0F2B95CCFCB;\r
  30         Thu, 30 Oct 2014 22:34:55 +0100 (CET)\r
  31 Received: from wsh by steelpick.2x.cz with local (Exim 4.84)\r
  32         (envelope-from <sojkam1@fel.cvut.cz>)\r
  33         id 1XjxMn-0005tO-5G; Thu, 30 Oct 2014 22:34:49 +0100\r
  34 From: Michal Sojka <sojkam1@fel.cvut.cz>\r
  35 To: Mark Walters <markwalters1009@gmail.com>, notmuch@notmuchmail.org\r
  36 Subject: Re: [PATCH v4 5/6] cli: search: Add configurable way to filter\r
  37         out     duplicate addresses\r
  38 In-Reply-To: <87egtqug4t.fsf@qmul.ac.uk>\r
  39 References: <1414421455-3037-1-git-send-email-sojkam1@fel.cvut.cz>\r
  40         <1414421455-3037-6-git-send-email-sojkam1@fel.cvut.cz>\r
  41         <87egtqug4t.fsf@qmul.ac.uk>\r
  42 User-Agent: Notmuch/0.18.2+157~ga00d359 (http://notmuchmail.org) Emacs/24.3.1\r
  43         (x86_64-pc-linux-gnu)\r
  44 Date: Thu, 30 Oct 2014 22:34:49 +0100\r
  45 Message-ID: <874mulckcm.fsf@steelpick.2x.cz>\r
  46 MIME-Version: 1.0\r
  47 Content-Type: text/plain\r
  48 X-BeenThere: notmuch@notmuchmail.org\r
  49 X-Mailman-Version: 2.1.13\r
  50 Precedence: list\r
  51 List-Id: "Use and development of the notmuch mail system."\r
  52         <notmuch.notmuchmail.org>\r
  53 List-Unsubscribe: <http://notmuchmail.org/mailman/options/notmuch>,\r
  54         <mailto:notmuch-request@notmuchmail.org?subject=unsubscribe>\r
  55 List-Archive: <http://notmuchmail.org/pipermail/notmuch>\r
  56 List-Post: <mailto:notmuch@notmuchmail.org>\r
  57 List-Help: <mailto:notmuch-request@notmuchmail.org?subject=help>\r
  58 List-Subscribe: <http://notmuchmail.org/mailman/listinfo/notmuch>,\r
  59         <mailto:notmuch-request@notmuchmail.org?subject=subscribe>\r
  60 X-List-Received-Date: Thu, 30 Oct 2014 21:35:08 -0000\r
  61 \r
  62 On Thu, Oct 30 2014, Mark Walters wrote:\r
  63 > On Mon, 27 Oct 2014, Michal Sojka <sojkam1@fel.cvut.cz> wrote:\r
  64 >> This adds an algorithm to filter out duplicate addresses from address\r
  65 >> outputs (sender, receivers). The algorithm can be configured with\r
  66 >> --filter-by command line option.\r
  67 >>\r
  68 >> The code here is an extended version of a patch from Jani Nikula.\r
  69 >\r
  70 > Hi\r
  71 >\r
  72 > As this is getting into the more controversial bike shedding region I\r
  73 > wonder if it would be worth splitting this into 2 patches: the first\r
  74 > could do the default dedupe based on name/address and the second could\r
  75 > add the filter-by options. \r
  76 >\r
  77 > I think the default deduping is obviously worth doing but I am not sure\r
  78 > about the rest. In any case I think the default deduping could go in\r
  79 > pre-freeze but I would recommend the rest is left until after.\r
  80 \r
  81 Yes, this makes sense. I'll send v5 in a while.\r
  82 \r
  83 >\r
  84 >> ---\r
  85 >>  completion/notmuch-completion.bash |  6 ++-\r
  86 >>  completion/notmuch-completion.zsh  |  3 +-\r
  87 >>  doc/man1/notmuch-search.rst        | 38 +++++++++++++++\r
  88 >>  notmuch-search.c                   | 98 +++++++++++++++++++++++++++++++++++---\r
  89 >>  test/T090-search-output.sh         | 87 +++++++++++++++++++++++++++++++++\r
  90 >>  test/T095-search-filter-by.sh      | 64 +++++++++++++++++++++++++\r
  91 >>  6 files changed, 288 insertions(+), 8 deletions(-)\r
  92 >>  create mode 100755 test/T095-search-filter-by.sh\r
  93 >>\r
  94 >> diff --git a/completion/notmuch-completion.bash b/completion/notmuch-completion.bash\r
  95 >> index cfbd389..6b6d43a 100644\r
  96 >> --- a/completion/notmuch-completion.bash\r
  97 >> +++ b/completion/notmuch-completion.bash\r
  98 >> @@ -305,12 +305,16 @@ _notmuch_search()\r
  99 >>          COMPREPLY=( $( compgen -W "true false flag all" -- "${cur}" ) )\r
 100 >>          return\r
 101 >>          ;;\r
 102 >> +    --filter-by)\r
 103 >> +        COMPREPLY=( $( compgen -W "nameaddr name addr addrfold nameaddrfold" -- "${cur}" ) )\r
 104 >> +        return\r
 105 >> +        ;;\r
 106 >>      esac\r
 107 >>  \r
 108 >>      ! $split &&\r
 109 >>      case "${cur}" in\r
 110 >>      -*)\r
 111 >> -        local options="--format= --output= --sort= --offset= --limit= --exclude= --duplicate="\r
 112 >> +        local options="--format= --output= --sort= --offset= --limit= --exclude= --duplicate= --filter-by="\r
 113 >>          compopt -o nospace\r
 114 >>          COMPREPLY=( $(compgen -W "$options" -- ${cur}) )\r
 115 >>          ;;\r
 116 >> diff --git a/completion/notmuch-completion.zsh b/completion/notmuch-completion.zsh\r
 117 >> index 3e52a00..3e535df 100644\r
 118 >> --- a/completion/notmuch-completion.zsh\r
 119 >> +++ b/completion/notmuch-completion.zsh\r
 120 >> @@ -53,7 +53,8 @@ _notmuch_search()\r
 121 >>      '--max-threads=[display only the first x threads from the search results]:number of threads to show: ' \\r
 122 >>      '--first=[omit the first x threads from the search results]:number of threads to omit: ' \\r
 123 >>      '--sort=[sort results]:sorting:((newest-first\:"reverse chronological order" oldest-first\:"chronological order"))' \\r
 124 >> -    '--output=[select what to output]:output:((summary threads messages files tags sender recipients))'\r
 125 >> +    '--output=[select what to output]:output:((summary threads messages files tags sender recipients))' \\r
 126 >> +    '--filter-by=[filter out duplicate addresses]:filter-by:((nameaddr\:"both name and address part" name\:"name part" addr\:"address part" addrfold\:"case-insensitive address part" nameaddrfold\:"name and case-insensitive address part"))'\r
 127 >>  }\r
 128 >>  \r
 129 >>  _notmuch()\r
 130 >> diff --git a/doc/man1/notmuch-search.rst b/doc/man1/notmuch-search.rst\r
 131 >> index b6607c9..84af2da 100644\r
 132 >> --- a/doc/man1/notmuch-search.rst\r
 133 >> +++ b/doc/man1/notmuch-search.rst\r
 134 >> @@ -85,6 +85,9 @@ Supported options for **search** include\r
 135 >>              (--format=text0), as a JSON array (--format=json), or as\r
 136 >>              an S-Expression list (--format=sexp).\r
 137 >>  \r
 138 >> +            Duplicate addresses are filtered out. Filtering can be\r
 139 >> +            configured with the --filter-by option.\r
 140 >> +\r
 141 >>          Note: Searching for **sender** should be much faster than\r
 142 >>          searching for **recipients**, because sender addresses are\r
 143 >>          cached directly in the database whereas other addresses\r
 144 >> @@ -151,6 +154,41 @@ Supported options for **search** include\r
 145 >>          prefix. The prefix matches messages based on filenames. This\r
 146 >>          option filters filenames of the matching messages.\r
 147 >>  \r
 148 >> +    ``--filter-by=``\ (**nameaddr**\ \|\ **name** \|\ **addr**\ \|\ **addrfold**\ \|\ **nameaddrfold**\)\r
 149 >> +\r
 150 >> +    Can be used with ``--output=sender`` or\r
 151 >> +    ``--output=recipients`` to filter out duplicate addresses. The\r
 152 >> +    filtering algorithm receives a sequence of email addresses and\r
 153 >> +    outputs the same sequence without the addresses that are\r
 154 >> +    considered a duplicate of a previously output address. What is\r
 155 >> +    considered a duplicate depends on how the two addresses are\r
 156 >> +    compared and this can be controlled with the following flags:\r
 157 >> +\r
 158 >> +    **nameaddr** means that both name and address parts are\r
 159 >> +    compared in case-sensitive manner. Therefore, all identical-looking\r
 160 >> +    address strings are considered duplicate. This is the\r
 161 >> +    default.\r
 162 >> +\r
 163 >> +    **name** means that only the name part is compared (in\r
 164 >> +    case-sensitive manner). For example, the addresses "John Doe\r
 165 >> +    <me@example.com>" and "John Doe <john@doe.name>" will be\r
 166 >> +    considered duplicate.\r
 167 >> +\r
 168 >> +    **addr** means that only the address part is compared (in\r
 169 >> +    case-sensitive manner). For example, the addresses "John Doe\r
 170 >> +    <john@example.com>" and "Dr. John Doe <john@example.com>" will\r
 171 >> +    be considered duplicate.\r
 172 >> +\r
 173 >> +    **addrfold** is like **addr**, but comparison is done in\r
 174 >> +    case-insensitive manner. For example, the addresses "John Doe\r
 175 >> +    <john@example.com>" and "Dr. John Doe <JOHN@EXAMPLE.COM>" will\r
 176 >> +    be considered duplicate.\r
 177 >> +\r
 178 >> +    **nameaddrfold** is like **nameaddr**, but address comparison\r
 179 >> +    is done in case-insensitive manner. For example, the\r
 180 >> +    addresses "John Doe <john@example.com>" and "John Doe\r
 181 >> +    <JOHN@EXAMPLE.COM>" will be considered duplicate.\r
 182 >> +\r
 183 >>  EXIT STATUS\r
 184 >>  ===========\r
 185 >>  \r
 186 >> diff --git a/notmuch-search.c b/notmuch-search.c\r
 187 >> index ce3bfb2..47aa979 100644\r
 188 >> --- a/notmuch-search.c\r
 189 >> +++ b/notmuch-search.c\r
 190 >> @@ -34,6 +34,14 @@ typedef enum {\r
 191 >>  \r
 192 >>  #define OUTPUT_ADDRESS_FLAGS (OUTPUT_SENDER | OUTPUT_RECIPIENTS)\r
 193 >>  \r
 194 >> +typedef enum {\r
 195 >> +    FILTER_BY_NAMEADDR = 0,\r
 196 >> +    FILTER_BY_NAME,\r
 197 >> +    FILTER_BY_ADDR,\r
 198 >> +    FILTER_BY_ADDRFOLD,\r
 199 >> +    FILTER_BY_NAMEADDRFOLD,\r
 200 >> +} filter_by_t;\r
 201 >> +\r
 202 >>  typedef struct {\r
 203 >>      sprinter_t *format;\r
 204 >>      notmuch_query_t *query;\r
 205 >> @@ -42,6 +50,7 @@ typedef struct {\r
 206 >>      int offset;\r
 207 >>      int limit;\r
 208 >>      int dupe;\r
 209 >> +    filter_by_t filter_by;\r
 210 >>  } search_options_t;\r
 211 >>  \r
 212 >>  typedef struct {\r
 213 >> @@ -229,6 +238,52 @@ do_search_threads (search_options_t *opt)\r
 214 >>      return 0;\r
 215 >>  }\r
 216 >>  \r
 217 >> +/* Returns TRUE iff name and/or addr is considered duplicite. */\r
 218 >\r
 219 > A triviality; duplicite should be duplicate\r
 220 >\r
 221 >> +static notmuch_bool_t\r
 222 >> +check_duplicite (const search_options_t *opt, GHashTable *addrs, const char *name, const char *addr)\r
 223 >\r
 224 > I am not sure on style but maybe is_duplicate would be clearer?\r
 225 \r
 226 OK\r
 227 \r
 228 -Michal\r