From 72ae118f165f32a92599c26f899e9cb2fae46625 Mon Sep 17 00:00:00 2001 From: David Bremner Date: Sun, 14 Aug 2016 21:43:18 +0900 Subject: [PATCH] [PATCH] WIP: remove all non-prefixed-terms (and stemmed versions) --- 93/06ffe85afcb0b296b5e729df3e45120beed24c | 223 ++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 93/06ffe85afcb0b296b5e729df3e45120beed24c diff --git a/93/06ffe85afcb0b296b5e729df3e45120beed24c b/93/06ffe85afcb0b296b5e729df3e45120beed24c new file mode 100644 index 000000000..32f97fb5a --- /dev/null +++ b/93/06ffe85afcb0b296b5e729df3e45120beed24c @@ -0,0 +1,223 @@ +Return-Path: +X-Original-To: notmuch@notmuchmail.org +Delivered-To: notmuch@notmuchmail.org +Received: from localhost (localhost [127.0.0.1]) + by arlo.cworth.org (Postfix) with ESMTP id CCD876DE81BA + for ; Sun, 14 Aug 2016 16:41:53 -0700 (PDT) +X-Virus-Scanned: Debian amavisd-new at cworth.org +X-Spam-Flag: NO +X-Spam-Score: -0.001 +X-Spam-Level: +X-Spam-Status: No, score=-0.001 tagged_above=-999 required=5 + tests=[AWL=-0.002, HEADER_FROM_DIFFERENT_DOMAINS=0.001] + autolearn=disabled +Received: from arlo.cworth.org ([127.0.0.1]) + by localhost (arlo.cworth.org [127.0.0.1]) (amavisd-new, port 10024) + with ESMTP id sAEvMdRXwP9Q for ; + Sun, 14 Aug 2016 16:41:45 -0700 (PDT) +Received: from fethera.tethera.net (fethera.tethera.net [198.245.60.197]) + by arlo.cworth.org (Postfix) with ESMTPS id F08A56DEAEF3 + for ; Sun, 14 Aug 2016 16:24:10 -0700 (PDT) +Received: from remotemail by fethera.tethera.net with local (Exim 4.84_2) + (envelope-from ) + id 1bZ4lN-0005Jc-Ol; Sun, 14 Aug 2016 19:24:17 -0400 +Received: (nullmailer pid 9958 invoked by uid 1000); + Sun, 14 Aug 2016 12:43:42 -0000 +From: David Bremner +To: Daniel Kahn Gillmor , + Notmuch Mail +Subject: [PATCH] WIP: remove all non-prefixed-terms (and stemmed versions) +Date: Sun, 14 Aug 2016 21:43:18 +0900 +Message-Id: <1471178598-9639-1-git-send-email-david@tethera.net> +X-Mailer: git-send-email 2.8.1 +In-Reply-To: <1467970047-8013-16-git-send-email-dkg@fifthhorseman.net> +References: <1467970047-8013-16-git-send-email-dkg@fifthhorseman.net> +X-BeenThere: notmuch@notmuchmail.org +X-Mailman-Version: 2.1.20 +Precedence: list +List-Id: "Use and development of the notmuch mail system." + +List-Unsubscribe: , + +List-Archive: +List-Post: +List-Help: +List-Subscribe: , + +X-List-Received-Date: Sun, 14 Aug 2016 23:41:54 -0000 + +The testing here is not really suitable for production, since we export +a function just for testing. It would be possible to modify the test +framework to test functions in notmuch-private.h, but this was the quick +and dirty solution. +--- + +dkg wrote: + +> I could find no way to distinguish terms which were added during +> indexing of the message body from other terms associated with the +> document. + +I think this does the trick. If it makes sense, I can polish it +up. I'd appreciate any ideas about the right way to manage the +testing. We could either modify the test framework to test internal +functions, or continue on testing only exported functions and the CLI. + + lib/message.cc | 33 ++++++++++++++++++++++ + lib/notmuch-private.h | 2 ++ + lib/notmuch.h | 4 +++ + test/T650-message-terms.sh | 70 ++++++++++++++++++++++++++++++++++++++++++++++ + 4 files changed, 109 insertions(+) + create mode 100755 test/T650-message-terms.sh + +diff --git a/lib/message.cc b/lib/message.cc +index 9d3e807..9a9845a 100644 +--- a/lib/message.cc ++++ b/lib/message.cc +@@ -577,6 +577,39 @@ _notmuch_message_remove_terms (notmuch_message_t *message, const char *prefix) + } + } + ++void notmuch_test_clear_terms(notmuch_message_t *message) { ++ _notmuch_message_remove_unprefixed_terms (message); ++ _notmuch_message_sync (message); ++} ++void ++_notmuch_message_remove_unprefixed_terms (notmuch_message_t *message) ++{ ++ Xapian::TermIterator i; ++ ++ for (i = message->doc.termlist_begin (); ++ i != message->doc.termlist_end () && ++ ((*i).c_str ()[0] < 'A'); ++ i++) { ++ try { ++ message->doc.remove_term ((*i)); ++ message->modified = TRUE; ++ } catch (const Xapian::InvalidArgumentError) { ++ /* Ignore failure to remove non-existent term. */ ++ } ++ } ++ ++ /* We want to remove stemmed terms, but only those not from a ++ prefixed term */ ++ for (i.skip_to ("Z["); i != message->doc.termlist_end (); i++) { ++ try { ++ message->doc.remove_term ((*i)); ++ message->modified = TRUE; ++ } catch (const Xapian::InvalidArgumentError) { ++ /* Ignore failure to remove non-existent term. */ ++ } ++ } ++} ++ + /* Return true if p points at "new" or "cur". */ + static bool is_maildir (const char *p) + { +diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h +index 65f7ead..646fc78 100644 +--- a/lib/notmuch-private.h ++++ b/lib/notmuch-private.h +@@ -502,6 +502,8 @@ _notmuch_message_add_reply (notmuch_message_t *message, + notmuch_database_t * + _notmuch_message_database (notmuch_message_t *message); + ++void ++_notmuch_message_remove_unprefixed_terms (notmuch_message_t *message); + /* sha1.c */ + + char * +diff --git a/lib/notmuch.h b/lib/notmuch.h +index e03a05d..e964b1a 100644 +--- a/lib/notmuch.h ++++ b/lib/notmuch.h +@@ -1658,6 +1658,10 @@ notmuch_message_thaw (notmuch_message_t *message); + void + notmuch_message_destroy (notmuch_message_t *message); + ++/* for testing */ ++ ++void ++notmuch_test_clear_terms(notmuch_message_t *message); + /** + * @name Message Properties + * +diff --git a/test/T650-message-terms.sh b/test/T650-message-terms.sh +new file mode 100755 +index 0000000..553e95b +--- /dev/null ++++ b/test/T650-message-terms.sh +@@ -0,0 +1,70 @@ ++#!/usr/bin/env bash ++test_description="message API" ++ ++. ./test-lib.sh || exit 1 ++ ++add_email_corpus ++ ++cat < c_head ++#include ++#include ++#include ++#include ++#include ++ ++int main (int argc, char** argv) ++{ ++ notmuch_database_t *db; ++ notmuch_message_t *message = NULL; ++ const char *val; ++ notmuch_status_t stat; ++ ++ EXPECT0(notmuch_database_open (argv[1], NOTMUCH_DATABASE_MODE_READ_WRITE, &db)); ++ EXPECT0(notmuch_database_find_message(db, "4EFC743A.3060609@april.org", &message)); ++ if (message == NULL) { ++ fprintf (stderr, "unable to find message"); ++ exit (1); ++ } ++EOF ++ ++cat < c_tail ++ EXPECT0(notmuch_database_destroy(db)); ++} ++EOF ++ ++add_email_corpus ++ ++test_begin_subtest "check unique term" ++byid=$(notmuch count id:4EFC743A.3060609@april.org) ++byterm=$(notmuch count Boulogne) ++test_expect_equal "$byid" "$byterm" ++ ++xapian-delve -1 -a ${MAIL_DIR}/.notmuch/xapian > BEFORE ++ ++test_begin_subtest "clear non-prefixed terms from message" ++cat c_head - c_tail <<'EOF' | test_C ${MAIL_DIR} ++{ ++notmuch_test_clear_terms(message); ++} ++EOF ++byterm=$(notmuch count Boulogne) ++test_expect_equal 0 "$byterm" ++ ++test_begin_subtest "check removed terms" ++xapian-delve -1 -a ${MAIL_DIR}/.notmuch/xapian > AFTER ++comm -2 -3 BEFORE AFTER | egrep '^Z?a' > REMOVED ++cat < EXPECTED ++Zallan ++Zarch ++Zarch_packaging_standard ++Zarchlinux ++Zaur ++allan ++arch ++arch_packaging_standards ++archlinux ++aur ++EOF ++test_expect_equal_file EXPECTED REMOVED ++ ++test_done +-- +2.8.1 + -- 2.26.2