From 83166d2f55de594a91942100dc283c4a717ca2ae Mon Sep 17 00:00:00 2001 From: Daniel Kahn Gillmor Date: Sat, 9 Apr 2016 22:02:29 +2100 Subject: [PATCH] [PATCH v3 2/7] test thread breakage when messages are removed and re-added --- 5c/9854f08b602f0a1d1cbe94b45cffa4d75358b9 | 274 ++++++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 5c/9854f08b602f0a1d1cbe94b45cffa4d75358b9 diff --git a/5c/9854f08b602f0a1d1cbe94b45cffa4d75358b9 b/5c/9854f08b602f0a1d1cbe94b45cffa4d75358b9 new file mode 100644 index 000000000..e00c4330c --- /dev/null +++ b/5c/9854f08b602f0a1d1cbe94b45cffa4d75358b9 @@ -0,0 +1,274 @@ +Return-Path: +X-Original-To: notmuch@notmuchmail.org +Delivered-To: notmuch@notmuchmail.org +Received: from localhost (localhost [127.0.0.1]) + by arlo.cworth.org (Postfix) with ESMTP id E0F3F6DE02DD + for ; Fri, 8 Apr 2016 18:02:46 -0700 (PDT) +X-Virus-Scanned: Debian amavisd-new at cworth.org +X-Spam-Flag: NO +X-Spam-Score: 0 +X-Spam-Level: +X-Spam-Status: No, score=0 tagged_above=-999 required=5 tests=[none] + autolearn=disabled +Received: from arlo.cworth.org ([127.0.0.1]) + by localhost (arlo.cworth.org [127.0.0.1]) (amavisd-new, port 10024) + with ESMTP id 6tE6w4f95HQZ for ; + Fri, 8 Apr 2016 18:02:39 -0700 (PDT) +Received: from che.mayfirst.org (che.mayfirst.org [209.234.253.108]) + by arlo.cworth.org (Postfix) with ESMTP id C349A6DE028C + for ; Fri, 8 Apr 2016 18:02:38 -0700 (PDT) +Received: from fifthhorseman.net (unknown [201.140.212.132]) + by che.mayfirst.org (Postfix) with ESMTPSA id 969C110083 + for ; Fri, 8 Apr 2016 21:02:36 -0400 (EDT) +Received: by fifthhorseman.net (Postfix, from userid 1000) + id F13151FE7C; Fri, 8 Apr 2016 22:02:34 -0300 (ART) +From: Daniel Kahn Gillmor +To: Notmuch Mail +Subject: [PATCH v3 2/7] test thread breakage when messages are removed and + re-added +Date: Fri, 8 Apr 2016 22:02:29 -0300 +Message-Id: <1460163754-22994-2-git-send-email-dkg@fifthhorseman.net> +X-Mailer: git-send-email 2.8.0.rc3 +In-Reply-To: <1460163754-22994-1-git-send-email-dkg@fifthhorseman.net> +References: <1459445693-3900-1-git-send-email-dkg@fifthhorseman.net> + <1460163754-22994-1-git-send-email-dkg@fifthhorseman.net> +X-BeenThere: notmuch@notmuchmail.org +X-Mailman-Version: 2.1.20 +Precedence: list +List-Id: "Use and development of the notmuch mail system." + +List-Unsubscribe: , + +List-Archive: +List-Post: +List-Help: +List-Subscribe: , + +X-List-Received-Date: Sat, 09 Apr 2016 01:02:47 -0000 + +This test (T590-thread-breakage.sh) has known-broken subtests. + +If you have a two-message thread where message "B" is in-reply-to "A", +notmuch rightly sees this as a single thread. + +But if you: + + * remove "A" from the message store + * run "notmuch new" + * add "A" back into the message store + * re-run "notmuch new" + +Then notmuch sees the messages as distinct threads. + +This happens because if you insert "B" initially (before anything is +known about "A"), then a "ghost message" gets added to the database in +reference to "A" that is in the same thread, which "A" takes over when +it appears. + +But if "A" is subsequently removed, no ghost message is retained, so +when "A" appears, it is treated as a new thread. + +I see a few options to fix this: + +ghost-on-removal +---------------- + +We could unilaterally add a ghost upon message removal. This has a +few disadvantages: the message index would leak information about what +messages the user has ever been exposed to, and we also create a +perpetually-growing dataset -- the ghosts can never be removed. + +ghost-on-removal-when-shared-thread-exists +------------------------------------------ + +We could add a ghost upon message removal iff there are other +non-ghost messages with the same thread ID. + +We'd also need to remove all ghost messages that share a thread when +the last non-ghost message in that thread is removed. + +This still has a bit of information leakage, though: the message index +would reveal that i've seen a newer message in a thread, even if i had +deleted it from my message store + +track-dependencies +------------------ + +rather than a simple "ghost-message" we could store all the (A,B) +message-reference pairs internally, showing which messages A reference +which other messages B. + +Then removal of message X would require deleting all message-reference +pairs (X,B), and only deleting a ghost message if no (A,X) reference +pair exists. + +This requires modifying the database by adding a new and fairly weird +table that would need to be indexed by both columns. I don't know +whether xapian has nice ways to do that. + +scan-dependencies +----------------- + +Without modifying the database, we could do something less efficient. + +Upon removal of message X, we could scan the headers of all non-ghost +messages that share a thread with X. If any of those messages refers +to X, we would add a ghost message. If none of them do, then we would +just drop X entirely from the table. + +--------------------- + +One risk of attempted fixes to this problem is that we could fail to +remove the search term indexes entirely. This test contains +additional subtests to guard against that. + +This test also ensures that the right number of ghost messages exist +in each situation; this will help us ensure we don't accumulate ghosts +indefinitely or leak too much information about what messages we've +seen or not seen, while still making it easy to reassemble threads +when messages come in out-of-order. +--- + test/T590-thread-breakage.sh | 131 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 131 insertions(+) + create mode 100755 test/T590-thread-breakage.sh + +diff --git a/test/T590-thread-breakage.sh b/test/T590-thread-breakage.sh +new file mode 100755 +index 0000000..24ffb42 +--- /dev/null ++++ b/test/T590-thread-breakage.sh +@@ -0,0 +1,131 @@ ++#!/usr/bin/env bash ++# ++# Copyright (c) 2016 Daniel Kahn Gillmor ++# ++ ++test_description='thread breakage during reindexing ++ ++notmuch uses ghost documents to track messages we have seen references ++to but have never seen. Regardless of the order of delivery, message ++deletion, and reindexing, the list of ghost messages for a given ++stored corpus should not vary, so that threads can be reassmebled ++cleanly. ++ ++In practice, we accept a small amount of variation (and therefore ++traffic pattern metadata leakage to be stored in the index) for the ++sake of efficiency. ++ ++This test also embeds some subtests to ensure that indexing actually ++works properly and attempted fixes to threading issues do not break ++the expected contents of the index.' ++ ++. ./test-lib.sh || exit 1 ++ ++message_a() { ++ mkdir -p ${MAIL_DIR}/cur ++ cat > ${MAIL_DIR}/cur/a < ++From: Alice ++To: Bob ++Date: Thu, 31 Mar 2016 20:10:00 -0400 ++ ++This is the first message in the thread. ++Apple ++EOF ++} ++ ++message_b() { ++ mkdir -p ${MAIL_DIR}/cur ++ cat > ${MAIL_DIR}/cur/b < ++In-Reply-To: ++References: ++From: Bob ++To: Alice ++Date: Thu, 31 Mar 2016 20:15:00 -0400 ++ ++This is the second message in the thread. ++Banana ++EOF ++} ++ ++ ++test_subject_count() { ++ test_begin_subtest "${3:-looking for $2 instance of '$1'}" ++ count=$(notmuch count --output=threads "$1") ++ test_expect_equal "$count" "$2" ++} ++ ++test_thread_count() { ++ test_begin_subtest "${2:-Expecting $1 thread(s)}" ++ count=$(notmuch count --output=threads) ++ test_expect_equal "$count" "$1" ++} ++ ++test_ghost_count() { ++ test_begin_subtest "${2:-Expecting $1 ghosts(s)}" ++ ghosts=$(../ghost-report ${MAIL_DIR}/.notmuch/xapian) ++ test_expect_equal "$ghosts" "$1" ++} ++ ++notmuch new >/dev/null ++pwd ++ls -la ${MAIL_DIR}/.notmuch/xapian ++ ++test_thread_count 0 'There should be no threads initially' ++test_ghost_count 0 'There should be no ghosts initially' ++ ++message_a ++notmuch new >/dev/null ++test_thread_count 1 'One message in: one thread' ++test_subject_count apple 1 ++test_subject_count banana 0 ++test_ghost_count 0 ++ ++message_b ++notmuch new >/dev/null ++test_thread_count 1 'Second message in the same thread: one thread' ++test_subject_count apple 1 ++test_subject_count banana 1 ++test_ghost_count 0 ++ ++rm -f ${MAIL_DIR}/cur/a ++notmuch new >/dev/null ++test_thread_count 1 'First message removed: still only one thread' ++test_subject_count apple 0 ++test_subject_count banana 1 ++test_begin_subtest 'should be one ghost after first message removed' ++test_subtest_known_broken ++ghosts=$(../ghost-report ${MAIL_DIR}/.notmuch/xapian) ++test_expect_equal "$ghosts" "1" ++ ++message_a ++notmuch new >/dev/null ++# this is known to fail (it shows 2 threads) because no "ghost ++# message" was created for message A when it was removed from the ++# index, despite message B still pointing to it. ++test_begin_subtest 'First message reappears: should return to the same thread' ++test_subtest_known_broken ++count=$(notmuch count --output=threads) ++test_expect_equal "$count" "1" ++test_subject_count apple 1 ++test_subject_count banana 1 ++test_ghost_count 0 ++ ++rm -f ${MAIL_DIR}/cur/b ++notmuch new >/dev/null ++test_thread_count 1 'Removing second message: still only one thread' ++test_subject_count apple 1 ++test_subject_count banana 0 ++test_ghost_count 0 'No ghosts should remain after deletion of second message' ++ ++rm -f ${MAIL_DIR}/cur/a ++notmuch new >/dev/null ++test_thread_count 0 'All messages gone: no threads' ++test_subject_count apple 0 ++test_subject_count banana 0 ++test_ghost_count 0 ++ ++test_done +-- +2.8.0.rc3 + -- 2.26.2