From: Jani Nikula Date: Sat, 30 Nov 2013 15:33:54 +0000 (+0200) Subject: [PATCH v2 5/7] lib: replace the header parser with gmime X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=a88da18327b69204f55fe351653c68f4ebd67c65;p=notmuch-archives.git [PATCH v2 5/7] lib: replace the header parser with gmime --- diff --git a/1e/73643df18b586c994b6b63ca37303c363f97c0 b/1e/73643df18b586c994b6b63ca37303c363f97c0 new file mode 100644 index 000000000..9154136f8 --- /dev/null +++ b/1e/73643df18b586c994b6b63ca37303c363f97c0 @@ -0,0 +1,600 @@ +Return-Path: +X-Original-To: notmuch@notmuchmail.org +Delivered-To: notmuch@notmuchmail.org +Received: from localhost (localhost [127.0.0.1]) + by olra.theworths.org (Postfix) with ESMTP id 8F40B431FBC + for ; Sat, 30 Nov 2013 07:34:34 -0800 (PST) +X-Virus-Scanned: Debian amavisd-new at olra.theworths.org +X-Amavis-Alert: BAD HEADER SECTION, Duplicate header field: "References" +X-Spam-Flag: NO +X-Spam-Score: -0.7 +X-Spam-Level: +X-Spam-Status: No, score=-0.7 tagged_above=-999 required=5 + tests=[RCVD_IN_DNSWL_LOW=-0.7] autolearn=disabled +Received: from olra.theworths.org ([127.0.0.1]) + by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024) + with ESMTP id DXn2XKqJ8EbZ for ; + Sat, 30 Nov 2013 07:34:27 -0800 (PST) +Received: from mail-ea0-f173.google.com (mail-ea0-f173.google.com + [209.85.215.173]) (using TLSv1 with cipher RC4-SHA (128/128 bits)) + (No client certificate requested) + by olra.theworths.org (Postfix) with ESMTPS id 60134431FD7 + for ; Sat, 30 Nov 2013 07:34:13 -0800 (PST) +Received: by mail-ea0-f173.google.com with SMTP id g15so7632776eak.18 + for ; Sat, 30 Nov 2013 07:34:12 -0800 (PST) +X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; + d=1e100.net; s=20130820; + h=x-gm-message-state:from:to:cc:subject:date:message-id:in-reply-to + :references:in-reply-to:references; + bh=5TLswB6ZYLSpzgYHTBKMQuqK7ULcyjfn5xOnEwr4aEQ=; + b=JIcp083Z2JbwhZia8TcawqQpOmw32kPgR0/RCG4FmH7qRXGdnazwdVqcREyQ7a4fZb + O1qQ6EHrsPM4cJJG/OvRFtM+JVk2ptr55HgQ6rYheIq/4CaQRT8xIWZw8u5eybVRDZio + 9eByhahqb0aWShW4Tjt4pXjQfrAR5uEA6TE7l5yvi0E43h5aUzmQqspOw7BkTTj+r0pn + hYLEFbQYgQdx5kK4dxZgVg8mWtbkk4KxHBb4tAhZVpASutfOR6qnslZJ2LITppdcScb7 + JxZtfj8a+IuN1jev7mQj2mlD9z9tw2jPqhdS5oC2v9YqzUjyp4mWpDvbcHBfuk3ZYRw5 + VcTg== +X-Gm-Message-State: + ALoCoQlPg7PhcFekdins5GGuToPdOX1ab8mQDAfAM0vETCqDtB5c0IG9zRpxLJ7nb7vCgBt1M2Y+ +X-Received: by 10.14.182.199 with SMTP id o47mr57789251eem.7.1385825652280; + Sat, 30 Nov 2013 07:34:12 -0800 (PST) +Received: from localhost (dsl-hkibrasgw2-58c36f-91.dhcp.inet.fi. + [88.195.111.91]) + by mx.google.com with ESMTPSA id v7sm34804952eel.2.2013.11.30.07.34.10 + for + (version=TLSv1.2 cipher=RC4-SHA bits=128/128); + Sat, 30 Nov 2013 07:34:11 -0800 (PST) +From: Jani Nikula +To: notmuch@notmuchmail.org +Subject: [PATCH v2 5/7] lib: replace the header parser with gmime +Date: Sat, 30 Nov 2013 17:33:54 +0200 +Message-Id: + <336c7d56a56685e82e06275dc4971c76f18f3d9b.1385825425.git.jani@nikula.org> +X-Mailer: git-send-email 1.8.4.2 +In-Reply-To: +References: +In-Reply-To: +References: +X-BeenThere: notmuch@notmuchmail.org +X-Mailman-Version: 2.1.13 +Precedence: list +List-Id: "Use and development of the notmuch mail system." + +List-Unsubscribe: , + +List-Archive: +List-Post: +List-Help: +List-Subscribe: , + +X-List-Received-Date: Sat, 30 Nov 2013 15:34:34 -0000 + +The notmuch library includes a full blown message header parser. Yet +the same message headers are parsed by gmime during indexing. Switch +to gmime parsing completely. + +These are the main differences between the parsers: + +* Gmime stops header parsing at the first invalid header, and presumes + the message body starts from there. The current parser is quite + liberal in accepting broken headers. The change means we will be + much pickier about accepting invalid messages. For example the + performance test suite contains plenty of email that does not pass + as valid email. (Note that the mail body would not have been indexed + before the change anyway, as we use gmime for that.) + +* The current parser converts tabs used in header folding to + spaces. Gmime preserve the tabs. Due to a broken python library used + in mailman, there are plenty of mailing lists that produce headers + with tabs in header folding, we'll see plenty of tabs. + +At this step, we only switch the header parsing to gmime. +--- + lib/database.cc | 4 + + lib/index.cc | 11 -- + lib/message-file.c | 346 ++++++++++++++++---------------------------------- + lib/message.cc | 6 + + lib/notmuch-private.h | 4 + + 5 files changed, 122 insertions(+), 249 deletions(-) + +diff --git a/lib/database.cc b/lib/database.cc +index f395061..d1bea88 100644 +--- a/lib/database.cc ++++ b/lib/database.cc +@@ -1940,6 +1940,10 @@ notmuch_database_add_message (notmuch_database_t *notmuch, + "to", + (char *) NULL); + ++ ret = notmuch_message_file_parse (message_file); ++ if (ret) ++ goto DONE; ++ + try { + /* Before we do any real work, (especially before doing a + * potential SHA-1 computation on the entire file's contents), +diff --git a/lib/index.cc b/lib/index.cc +index 78c18cf..976e49f 100644 +--- a/lib/index.cc ++++ b/lib/index.cc +@@ -437,7 +437,6 @@ _notmuch_message_index_file (notmuch_message_t *message, + static int initialized = 0; + char from_buf[5]; + bool is_mbox = false; +- static bool mbox_warning = false; + + if (! initialized) { + g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS); +@@ -471,16 +470,6 @@ _notmuch_message_index_file (notmuch_message_t *message, + ret = NOTMUCH_STATUS_FILE_NOT_EMAIL; + goto DONE; + } +- /* For historical reasons, we support single-message mboxes, +- * but this behavior is likely to change in the future, so +- * warn. */ +- if (!mbox_warning) { +- mbox_warning = true; +- fprintf (stderr, "\ +-Warning: %s is an mbox containing a single message,\n\ +-likely caused by misconfigured mail delivery. Support for single-message\n\ +-mboxes is deprecated and may be removed in the future.\n", filename); +- } + } + + from = g_mime_message_get_sender (mime_message); +diff --git a/lib/message-file.c b/lib/message-file.c +index a2850c2..3c653d1 100644 +--- a/lib/message-file.c ++++ b/lib/message-file.c +@@ -26,30 +26,19 @@ + + #include /* GHashTable */ + +-typedef struct { +- char *str; +- size_t size; +- size_t len; +-} header_value_closure_t; +- + struct _notmuch_message_file { + /* File object */ + FILE *file; ++ char *filename; + + /* Header storage */ + int restrict_headers; + GHashTable *headers; +- int broken_headers; +- int good_headers; +- size_t header_size; /* Length of full message header in bytes. */ +- +- /* Parsing state */ +- char *line; +- size_t line_size; +- header_value_closure_t value; + +- int parsing_started; +- int parsing_finished; ++ GMimeStream *stream; ++ GMimeParser *parser; ++ GMimeMessage *message; ++ notmuch_bool_t parsed; + }; + + static int +@@ -76,16 +65,18 @@ strcase_hash (const void *ptr) + static int + _notmuch_message_file_destructor (notmuch_message_file_t *message) + { +- if (message->line) +- free (message->line); +- +- if (message->value.size) +- free (message->value.str); +- + if (message->headers) + g_hash_table_destroy (message->headers); + +- if (message->file) ++ if (message->message) ++ g_object_unref (message->message); ++ ++ if (message->parser) ++ g_object_unref (message->parser); ++ ++ if (message->stream) ++ g_object_unref (message->stream); ++ else if (message->file) /* GMimeStream takes over the FILE* */ + fclose (message->file); + + return 0; +@@ -102,19 +93,21 @@ _notmuch_message_file_open_ctx (void *ctx, const char *filename) + if (unlikely (message == NULL)) + return NULL; + ++ /* only needed for error messages during parsing */ ++ message->filename = talloc_strdup (message, filename); ++ if (message->filename == NULL) ++ goto FAIL; ++ + talloc_set_destructor (message, _notmuch_message_file_destructor); + + message->file = fopen (filename, "r"); + if (message->file == NULL) + goto FAIL; + +- message->headers = g_hash_table_new_full (strcase_hash, +- strcase_equal, +- free, +- g_free); +- +- message->parsing_started = 0; +- message->parsing_finished = 0; ++ message->headers = g_hash_table_new_full (strcase_hash, strcase_equal, ++ free, g_free); ++ if (message->headers == NULL) ++ goto FAIL; + + return message; + +@@ -143,7 +136,7 @@ notmuch_message_file_restrict_headersv (notmuch_message_file_t *message, + { + char *header; + +- if (message->parsing_started) ++ if (message->parsed) + INTERNAL_ERROR ("notmuch_message_file_restrict_headers called after parsing has started"); + + while (1) { +@@ -167,234 +160,111 @@ notmuch_message_file_restrict_headers (notmuch_message_file_t *message, ...) + notmuch_message_file_restrict_headersv (message, va_headers); + } + +-static void +-copy_header_unfolding (header_value_closure_t *value, +- const char *chunk) ++/* ++ * gmime does not provide access to all headers, so we need to use the ++ * parser header callback to gather them into a hash table. ++ */ ++static void header_cb (unused(GMimeParser *parser), const char *header, ++ const char *value, unused(gint64 offset), ++ gpointer user_data) + { +- char *last; ++ notmuch_message_file_t *message = (notmuch_message_file_t *) user_data; ++ char *existing = NULL; ++ notmuch_bool_t found; + +- if (chunk == NULL) ++ found = g_hash_table_lookup_extended (message->headers, header, ++ NULL, (gpointer *) &existing); ++ if (! found && message->restrict_headers) + return; + +- while (*chunk == ' ' || *chunk == '\t') +- chunk++; +- +- if (value->len + 1 + strlen (chunk) + 1 > value->size) { +- unsigned int new_size = value->size; +- if (value->size == 0) +- new_size = strlen (chunk) + 1; +- else +- while (value->len + 1 + strlen (chunk) + 1 > new_size) +- new_size *= 2; +- value->str = xrealloc (value->str, new_size); +- value->size = new_size; +- } +- +- last = value->str + value->len; +- if (value->len) { +- *last = ' '; +- last++; +- value->len++; +- } +- +- strcpy (last, chunk); +- value->len += strlen (chunk); +- +- last = value->str + value->len - 1; +- if (*last == '\n') { +- *last = '\0'; +- value->len--; ++ if (existing == NULL) { ++ /* No value, add one */ ++ char *decoded = g_mime_utils_header_decode_text (value); ++ g_hash_table_insert (message->headers, xstrdup (header), decoded); ++ } else if (strcasecmp (header, "received") == 0) { ++ /* Concat all of the Received: headers we encounter. */ ++ char *combined, *decoded; ++ size_t combined_size; ++ ++ decoded = g_mime_utils_header_decode_text (value); ++ ++ combined_size = strlen(existing) + strlen(decoded) + 2; ++ combined = g_malloc (combined_size); ++ snprintf (combined, combined_size, "%s %s", existing, decoded); ++ g_free (decoded); ++ g_hash_table_insert (message->headers, xstrdup (header), combined); + } + } + +-/* As a special-case, a value of NULL for header_desired will force +- * the entire header to be parsed if it is not parsed already. This is +- * used by the _notmuch_message_file_get_headers_end function. +- * Another special case is the Received: header. For this header we +- * want to concatenate all instances of the header instead of just +- * hashing the first instance as we use this when analyzing the path +- * the mail has taken from sender to recipient. +- */ +-const char * +-notmuch_message_file_get_header (notmuch_message_file_t *message, +- const char *header_desired) ++notmuch_status_t ++notmuch_message_file_parse (notmuch_message_file_t *message) + { +- int contains; +- char *header, *decoded_value, *header_sofar, *combined_header; +- const char *s, *colon; +- int match, newhdr, hdrsofar, is_received; + static int initialized = 0; +- +- is_received = (strcmp(header_desired,"received") == 0); ++ char from_buf[5]; ++ notmuch_bool_t is_mbox = FALSE; ++ static notmuch_bool_t mbox_warning = FALSE; + + if (! initialized) { + g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS); + initialized = 1; + } + +- message->parsing_started = 1; +- +- if (header_desired == NULL) +- contains = 0; +- else +- contains = g_hash_table_lookup_extended (message->headers, +- header_desired, NULL, +- (gpointer *) &decoded_value); +- +- if (contains && decoded_value) +- return decoded_value; +- +- if (message->parsing_finished) +- return ""; +- +-#define NEXT_HEADER_LINE(closure) \ +- while (1) { \ +- ssize_t bytes_read = getline (&message->line, \ +- &message->line_size, \ +- message->file); \ +- if (bytes_read == -1) { \ +- message->parsing_finished = 1; \ +- break; \ +- } \ +- if (*message->line == '\n') { \ +- message->parsing_finished = 1; \ +- break; \ +- } \ +- if (closure && \ +- (*message->line == ' ' || *message->line == '\t')) \ +- { \ +- copy_header_unfolding ((closure), message->line); \ +- } \ +- if (*message->line == ' ' || *message->line == '\t') \ +- message->header_size += strlen (message->line); \ +- else \ +- break; \ +- } +- +- if (message->line == NULL) +- NEXT_HEADER_LINE (NULL); +- +- while (1) { +- +- if (message->parsing_finished) +- break; +- +- colon = strchr (message->line, ':'); +- +- if (colon == NULL) { +- message->broken_headers++; +- /* A simple heuristic for giving up on things that just +- * don't look like mail messages. */ +- if (message->broken_headers >= 10 && +- message->good_headers < 5) +- { +- message->parsing_finished = 1; +- break; +- } +- NEXT_HEADER_LINE (NULL); +- continue; ++ /* Is this mbox? */ ++ if (fread (from_buf, sizeof (from_buf), 1, message->file) == 1 && ++ strncmp (from_buf, "From ", 5) == 0) ++ is_mbox = TRUE; ++ rewind (message->file); ++ ++ message->stream = g_mime_stream_file_new (message->file); ++ message->parser = g_mime_parser_new_with_stream (message->stream); ++ g_mime_parser_set_scan_from (message->parser, is_mbox); ++ g_mime_parser_set_header_regex (message->parser, ".*", header_cb, ++ (gpointer) message); ++ ++ message->message = g_mime_parser_construct_message (message->parser); ++ if (! message->message) ++ return NOTMUCH_STATUS_FILE_NOT_EMAIL; ++ ++ if (is_mbox) { ++ if (! g_mime_parser_eos (message->parser)) { ++ /* This is a multi-message mbox. */ ++ return NOTMUCH_STATUS_FILE_NOT_EMAIL; + } +- +- message->header_size += strlen (message->line); +- +- message->good_headers++; +- +- header = xstrndup (message->line, colon - message->line); +- +- if (message->restrict_headers && +- ! g_hash_table_lookup_extended (message->headers, +- header, NULL, NULL)) +- { +- free (header); +- NEXT_HEADER_LINE (NULL); +- continue; +- } +- +- s = colon + 1; +- while (*s == ' ' || *s == '\t') +- s++; +- +- message->value.len = 0; +- copy_header_unfolding (&message->value, s); +- +- NEXT_HEADER_LINE (&message->value); +- +- if (header_desired == NULL) +- match = 0; +- else +- match = (strcasecmp (header, header_desired) == 0); +- +- decoded_value = g_mime_utils_header_decode_text (message->value.str); +- header_sofar = (char *)g_hash_table_lookup (message->headers, header); +- /* we treat the Received: header special - we want to concat ALL of +- * the Received: headers we encounter. +- * for everything else we return the first instance of a header */ +- if (strcasecmp(header, "received") == 0) { +- if (header_sofar == NULL) { +- /* first Received: header we encountered; just add it */ +- g_hash_table_insert (message->headers, header, decoded_value); +- } else { +- /* we need to add the header to those we already collected */ +- newhdr = strlen(decoded_value); +- hdrsofar = strlen(header_sofar); +- combined_header = g_malloc(hdrsofar + newhdr + 2); +- strncpy(combined_header,header_sofar,hdrsofar); +- *(combined_header+hdrsofar) = ' '; +- strncpy(combined_header+hdrsofar+1,decoded_value,newhdr+1); +- g_free (decoded_value); +- g_hash_table_insert (message->headers, header, combined_header); +- } +- } else { +- if (header_sofar == NULL) { +- /* Only insert if we don't have a value for this header, yet. */ +- g_hash_table_insert (message->headers, header, decoded_value); +- } else { +- free (header); +- g_free (decoded_value); +- decoded_value = header_sofar; +- } ++ /* For historical reasons, we support single-message mboxes, ++ * but this behavior is likely to change in the future, so ++ * warn. */ ++ if (! mbox_warning) { ++ mbox_warning = TRUE; ++ fprintf (stderr, "\ ++Warning: %s is an mbox containing a single message,\n\ ++likely caused by misconfigured mail delivery. Support for single-message\n\ ++mboxes is deprecated and may be removed in the future.\n", message->filename); + } +- /* if we found a match we can bail - unless of course we are +- * collecting all the Received: headers */ +- if (match && !is_received) +- return decoded_value; + } + +- if (message->parsing_finished) { +- fclose (message->file); +- message->file = NULL; +- } ++ message->parsed = TRUE; + +- if (message->line) +- free (message->line); +- message->line = NULL; ++ return NOTMUCH_STATUS_SUCCESS; ++} + +- if (message->value.size) { +- free (message->value.str); +- message->value.str = NULL; +- message->value.size = 0; +- message->value.len = 0; +- } ++/* return NULL on errors, empty string for non-existing headers */ ++const char * ++notmuch_message_file_get_header (notmuch_message_file_t *message, ++ const char *header) ++{ ++ const char *value = NULL; ++ notmuch_bool_t found; + +- /* For the Received: header we actually might end up here even +- * though we found the header (as we force continued parsing +- * in that case). So let's check if that's the header we were +- * looking for and return the value that we found (if any) +- */ +- if (is_received) +- return (char *)g_hash_table_lookup (message->headers, "received"); +- +- /* We've parsed all headers and never found the one we're looking +- * for. It's probably just not there, but let's check that we +- * didn't make a mistake preventing us from seeing it. */ +- if (message->restrict_headers && header_desired && +- ! g_hash_table_lookup_extended (message->headers, +- header_desired, NULL, NULL)) +- { +- INTERNAL_ERROR ("Attempt to get header \"%s\" which was not\n" +- "included in call to notmuch_message_file_restrict_headers\n", +- header_desired); +- } ++ /* the caller shouldn't ask for headers before parsing */ ++ if (! message->parsed) ++ return NULL; ++ ++ found = g_hash_table_lookup_extended (message->headers, header, ++ NULL, (gpointer *) &value); ++ ++ /* the caller shouldn't ask for non-restricted headers */ ++ if (! found && message->restrict_headers) ++ return NULL; + +- return ""; ++ return value ? value : ""; + } +diff --git a/lib/message.cc b/lib/message.cc +index 1b46379..1c7b91f 100644 +--- a/lib/message.cc ++++ b/lib/message.cc +@@ -407,6 +407,12 @@ _notmuch_message_ensure_message_file (notmuch_message_t *message) + return; + + message->message_file = _notmuch_message_file_open_ctx (message, filename); ++ ++ /* XXX: better return value handling */ ++ if (notmuch_message_file_parse (message->message_file)) { ++ notmuch_message_file_close (message->message_file); ++ message->message_file = NULL; ++ } + } + + const char * +diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h +index af185c7..7277df1 100644 +--- a/lib/notmuch-private.h ++++ b/lib/notmuch-private.h +@@ -345,6 +345,10 @@ notmuch_message_file_open (const char *filename); + notmuch_message_file_t * + _notmuch_message_file_open_ctx (void *ctx, const char *filename); + ++/* Parse the message */ ++notmuch_status_t ++notmuch_message_file_parse (notmuch_message_file_t *message); ++ + /* Close a notmuch message previously opened with notmuch_message_open. */ + void + notmuch_message_file_close (notmuch_message_file_t *message); +-- +1.8.4.2 +