From: Austin Clements Date: Wed, 17 Nov 2010 19:28:26 +0000 (-0500) Subject: Optimize thread search using matched docid sets. X-Git-Tag: debian/0.6_254~223 X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=b3caef1f;p=notmuch.git Optimize thread search using matched docid sets. This reduces thread search's 1+2t Xapian queries (where t is the number of matched threads) to 1+t queries and constructs exactly one notmuch_message_t for each message instead of 2 to 3. notmuch_query_search_threads eagerly fetches the docids of all messages matching the user query instead of lazily constructing message objects and fetching thread ID's from term lists. _notmuch_thread_create takes a seed docid and the set of all matched docids and uses a single Xapian query to expand this docid to its containing thread, using the matched docid set to determine which messages in the thread match the user query instead of using a second Xapian query. This reduces the amount of time required to load my inbox from 4.523 seconds to 3.025 seconds (1.5X faster). --- diff --git a/lib/message.cc b/lib/message.cc index 225b7e91..adcd07d6 100644 --- a/lib/message.cc +++ b/lib/message.cc @@ -254,6 +254,12 @@ _notmuch_message_create_for_message_id (notmuch_database_t *notmuch, return message; } +unsigned int +_notmuch_message_get_doc_id (notmuch_message_t *message) +{ + return message->doc_id; +} + const char * notmuch_message_get_message_id (notmuch_message_t *message) { diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h index 592cfb2b..303aeb30 100644 --- a/lib/notmuch-private.h +++ b/lib/notmuch-private.h @@ -156,6 +156,8 @@ typedef enum _notmuch_private_status { : \ (notmuch_status_t) private_status) +typedef struct _notmuch_doc_id_set notmuch_doc_id_set_t; + /* database.cc */ /* Lookup a prefix value by name. @@ -222,8 +224,8 @@ _notmuch_directory_get_document_id (notmuch_directory_t *directory); notmuch_thread_t * _notmuch_thread_create (void *ctx, notmuch_database_t *notmuch, - const char *thread_id, - const char *query_string, + unsigned int seed_doc_id, + notmuch_doc_id_set_t *match_set, notmuch_sort_t sort); /* message.cc */ @@ -239,6 +241,9 @@ _notmuch_message_create_for_message_id (notmuch_database_t *notmuch, const char *message_id, notmuch_private_status_t *status); +unsigned int +_notmuch_message_get_doc_id (notmuch_message_t *message); + const char * _notmuch_message_get_in_reply_to (notmuch_message_t *message); @@ -426,6 +431,14 @@ _notmuch_mset_messages_get (notmuch_messages_t *messages); void _notmuch_mset_messages_move_to_next (notmuch_messages_t *messages); +notmuch_bool_t +_notmuch_doc_id_set_contains (notmuch_doc_id_set_t *doc_ids, + unsigned int doc_id); + +void +_notmuch_doc_id_set_remove (notmuch_doc_id_set_t *doc_ids, + unsigned int doc_id); + /* message.cc */ void diff --git a/lib/query.cc b/lib/query.cc index 7916421e..c7ae4ee7 100644 --- a/lib/query.cc +++ b/lib/query.cc @@ -36,13 +36,21 @@ typedef struct _notmuch_mset_messages { Xapian::MSetIterator iterator_end; } notmuch_mset_messages_t; +struct _notmuch_doc_id_set { + unsigned int *bitmap; + unsigned int bound; +}; + struct _notmuch_threads { notmuch_query_t *query; - GHashTable *threads; - notmuch_messages_t *messages; - /* This thread ID is our iterator state. */ - const char *thread_id; + /* The ordered list of doc ids matched by the query. */ + GArray *doc_ids; + /* Our iterator's current position in doc_ids. */ + unsigned int doc_id_pos; + /* The set of matched docid's that have not been assigned to a + * thread. Initially, this contains every docid in doc_ids. */ + notmuch_doc_id_set_t match_set; }; notmuch_query_t * @@ -195,6 +203,19 @@ _notmuch_mset_messages_valid (notmuch_messages_t *messages) return (mset_messages->iterator != mset_messages->iterator_end); } +static Xapian::docid +_notmuch_mset_messages_get_doc_id (notmuch_messages_t *messages) +{ + notmuch_mset_messages_t *mset_messages; + + mset_messages = (notmuch_mset_messages_t *) messages; + + if (! _notmuch_mset_messages_valid (&mset_messages->base)) + return 0; + + return *mset_messages->iterator; +} + notmuch_message_t * _notmuch_mset_messages_get (notmuch_messages_t *messages) { @@ -233,6 +254,49 @@ _notmuch_mset_messages_move_to_next (notmuch_messages_t *messages) mset_messages->iterator++; } +static notmuch_bool_t +_notmuch_doc_id_set_init (void *ctx, + notmuch_doc_id_set_t *doc_ids, + GArray *arr, unsigned int bound) +{ + size_t count = (bound + sizeof (doc_ids->bitmap[0]) - 1) / + sizeof (doc_ids->bitmap[0]); + unsigned int *bitmap = talloc_zero_array (ctx, unsigned int, count); + + if (bitmap == NULL) + return FALSE; + + doc_ids->bitmap = bitmap; + doc_ids->bound = bound; + + for (unsigned int i = 0; i < arr->len; i++) { + unsigned int doc_id = g_array_index(arr, unsigned int, i); + bitmap[doc_id / sizeof (bitmap[0])] |= + 1 << (doc_id % sizeof (bitmap[0])); + } + + return TRUE; +} + +notmuch_bool_t +_notmuch_doc_id_set_contains (notmuch_doc_id_set_t *doc_ids, + unsigned int doc_id) +{ + if (doc_id >= doc_ids->bound) + return FALSE; + return (doc_ids->bitmap[doc_id / sizeof (doc_ids->bitmap[0])] & + (1 << (doc_id % sizeof (doc_ids->bitmap[0])))) != 0; +} + +void +_notmuch_doc_id_set_remove (notmuch_doc_id_set_t *doc_ids, + unsigned int doc_id) +{ + if (doc_id < doc_ids->bound) + doc_ids->bitmap[doc_id / sizeof (doc_ids->bitmap[0])] &= + ~(1 << (doc_id % sizeof (doc_ids->bitmap[0]))); +} + /* Glib objects force use to use a talloc destructor as well, (but not * nearly as ugly as the for messages due to C++ objects). At * this point, I'd really like to have some talloc-friendly @@ -240,7 +304,8 @@ _notmuch_mset_messages_move_to_next (notmuch_messages_t *messages) static int _notmuch_threads_destructor (notmuch_threads_t *threads) { - g_hash_table_unref (threads->threads); + if (threads->doc_ids) + g_array_unref (threads->doc_ids); return 0; } @@ -249,24 +314,39 @@ notmuch_threads_t * notmuch_query_search_threads (notmuch_query_t *query) { notmuch_threads_t *threads; + notmuch_messages_t *messages; + Xapian::docid max_doc_id = 0; threads = talloc (query, notmuch_threads_t); if (threads == NULL) return NULL; + threads->doc_ids = NULL; + talloc_set_destructor (threads, _notmuch_threads_destructor); threads->query = query; - threads->threads = g_hash_table_new_full (g_str_hash, g_str_equal, - free, NULL); - threads->messages = notmuch_query_search_messages (query); - if (threads->messages == NULL) { + messages = notmuch_query_search_messages (query); + if (messages == NULL) { talloc_free (threads); return NULL; } - threads->thread_id = NULL; + threads->doc_ids = g_array_new (FALSE, FALSE, sizeof (unsigned int)); + while (notmuch_messages_valid (messages)) { + unsigned int doc_id = _notmuch_mset_messages_get_doc_id (messages); + g_array_append_val (threads->doc_ids, doc_id); + max_doc_id = MAX (max_doc_id, doc_id); + notmuch_messages_move_to_next (messages); + } + threads->doc_id_pos = 0; - talloc_set_destructor (threads, _notmuch_threads_destructor); + talloc_free (messages); + + if (! _notmuch_doc_id_set_init (threads, &threads->match_set, + threads->doc_ids, max_doc_id + 1)) { + talloc_free (threads); + return NULL; + } return threads; } @@ -280,51 +360,41 @@ notmuch_query_destroy (notmuch_query_t *query) notmuch_bool_t notmuch_threads_valid (notmuch_threads_t *threads) { - notmuch_message_t *message; - - if (threads->thread_id) - return TRUE; - - while (notmuch_messages_valid (threads->messages)) - { - message = notmuch_messages_get (threads->messages); + unsigned int doc_id; - threads->thread_id = notmuch_message_get_thread_id (message); - - if (! g_hash_table_lookup_extended (threads->threads, - threads->thread_id, - NULL, NULL)) - { - g_hash_table_insert (threads->threads, - xstrdup (threads->thread_id), NULL); - notmuch_messages_move_to_next (threads->messages); - return TRUE; - } + while (threads->doc_id_pos < threads->doc_ids->len) { + doc_id = g_array_index (threads->doc_ids, unsigned int, + threads->doc_id_pos); + if (_notmuch_doc_id_set_contains (&threads->match_set, doc_id)) + break; - notmuch_messages_move_to_next (threads->messages); + threads->doc_id_pos++; } - threads->thread_id = NULL; - return FALSE; + return threads->doc_id_pos < threads->doc_ids->len; } notmuch_thread_t * notmuch_threads_get (notmuch_threads_t *threads) { + unsigned int doc_id; + if (! notmuch_threads_valid (threads)) return NULL; + doc_id = g_array_index (threads->doc_ids, unsigned int, + threads->doc_id_pos); return _notmuch_thread_create (threads->query, threads->query->notmuch, - threads->thread_id, - threads->query->query_string, + doc_id, + &threads->match_set, threads->query->sort); } void notmuch_threads_move_to_next (notmuch_threads_t *threads) { - threads->thread_id = NULL; + threads->doc_id_pos++; } void diff --git a/lib/thread.cc b/lib/thread.cc index 31c655b6..b29f2c92 100644 --- a/lib/thread.cc +++ b/lib/thread.cc @@ -371,16 +371,17 @@ _resolve_thread_relationships (unused (notmuch_thread_t *thread)) */ } -/* Create a new notmuch_thread_t object for the given thread ID, - * treating any messages matching 'query_string' as "matched". +/* Create a new notmuch_thread_t object by finding the thread + * containing the message with the given doc ID, treating any messages + * contained in match_set as "matched". Remove all messages in the + * thread from match_set. * - * Creating the thread will trigger two database searches. The first - * is for all messages belonging to the thread, (to get the first - * subject line, the total count of messages, and all authors). The - * second search is for all messages that are in the thread and that - * also match the given query_string. This is to allow for a separate - * count of matched messages, and to allow a viewer to display these - * messages differently. + * Creating the thread will perform a database search to get all + * messages belonging to the thread and will get the first subject + * line, the total count of messages, and all authors in the thread. + * Each message in the thread is checked against match_set to allow + * for a separate count of matched messages, and to allow a viewer to + * display these messages differently. * * Here, 'ctx' is talloc context for the resulting thread object. * @@ -389,53 +390,28 @@ _resolve_thread_relationships (unused (notmuch_thread_t *thread)) notmuch_thread_t * _notmuch_thread_create (void *ctx, notmuch_database_t *notmuch, - const char *thread_id, - const char *query_string, + unsigned int seed_doc_id, + notmuch_doc_id_set_t *match_set, notmuch_sort_t sort) { notmuch_thread_t *thread; + notmuch_message_t *seed_message; + const char *thread_id; const char *thread_id_query_string; notmuch_query_t *thread_id_query; notmuch_messages_t *messages; notmuch_message_t *message; - notmuch_bool_t matched_is_subset_of_thread; + seed_message = _notmuch_message_create (ctx, notmuch, seed_doc_id, NULL); + if (! seed_message) + INTERNAL_ERROR ("Thread seed message %u does not exist", seed_doc_id); + + thread_id = notmuch_message_get_thread_id (seed_message); thread_id_query_string = talloc_asprintf (ctx, "thread:%s", thread_id); - if (unlikely (query_string == NULL)) + if (unlikely (thread_id_query_string == NULL)) return NULL; - /* Under normal circumstances we need to do two database - * queries. One is for the thread itself (thread_id_query_string) - * and the second is to determine which messages in that thread - * match the original query (matched_query_string). - * - * But under two circumstances, we use only the - * thread_id_query_string: - * - * 1. If the original query_string *is* just the thread - * specification. - * - * 2. If the original query_string matches all messages ("" or - * "*"). - * - * In either of these cases, we can be more efficient by running - * just the thread_id query (since we know all messages in the - * thread will match the query_string). - * - * Beyond the performance advantage, in the second case, it's - * important to not try to create a concatenated query because our - * parser handles "" and "*" as special cases and will not do the - * right thing with a query string of "* and thread:". - **/ - matched_is_subset_of_thread = 1; - if (strcmp (query_string, thread_id_query_string) == 0 || - strcmp (query_string, "") == 0 || - strcmp (query_string, "*") == 0) - { - matched_is_subset_of_thread = 0; - } - thread_id_query = notmuch_query_create (notmuch, thread_id_query_string); if (unlikely (thread_id_query == NULL)) return NULL; @@ -482,48 +458,25 @@ _notmuch_thread_create (void *ctx, notmuch_messages_valid (messages); notmuch_messages_move_to_next (messages)) { + unsigned int doc_id; + message = notmuch_messages_get (messages); + doc_id = _notmuch_message_get_doc_id (message); + if (doc_id == seed_doc_id) + message = seed_message; _thread_add_message (thread, message); - if (! matched_is_subset_of_thread) + if ( _notmuch_doc_id_set_contains (match_set, doc_id)) { + _notmuch_doc_id_set_remove (match_set, doc_id); _thread_add_matched_message (thread, message, sort); + } _notmuch_message_close (message); } notmuch_query_destroy (thread_id_query); - if (matched_is_subset_of_thread) - { - const char *matched_query_string; - notmuch_query_t *matched_query; - - matched_query_string = talloc_asprintf (ctx, "%s AND (%s)", - thread_id_query_string, - query_string); - if (unlikely (matched_query_string == NULL)) - return NULL; - - matched_query = notmuch_query_create (notmuch, matched_query_string); - if (unlikely (matched_query == NULL)) - return NULL; - - /* As above, we use oldest-first order unconditionally here. */ - notmuch_query_set_sort (matched_query, NOTMUCH_SORT_OLDEST_FIRST); - - for (messages = notmuch_query_search_messages (matched_query); - notmuch_messages_valid (messages); - notmuch_messages_move_to_next (messages)) - { - message = notmuch_messages_get (messages); - _thread_add_matched_message (thread, message, sort); - _notmuch_message_close (message); - } - - notmuch_query_destroy (matched_query); - } - _resolve_thread_authors_string (thread); _resolve_thread_relationships (thread);