lib: replace the header parser with gmime
authorJani Nikula <jani@nikula.org>
Sun, 30 Mar 2014 21:21:49 +0000 (00:21 +0300)
committerDavid Bremner <david@tethera.net>
Sat, 5 Apr 2014 15:53:04 +0000 (12:53 -0300)
The notmuch library includes a full blown message header parser. Yet
the same message headers are parsed by gmime during indexing. Switch
to gmime parsing completely.

These are the main changes:

* Gmime stops header parsing at the first invalid header, and presumes
  the message body starts from there. The current parser is quite
  liberal in accepting broken headers. The change means we will be
  much pickier about accepting invalid messages.

* The current parser converts tabs used in header folding to
  spaces. Gmime preserve the tabs. Due to a broken python library used
  in mailman, there are plenty of mailing lists that produce headers
  with tabs in header folding, and we'll see plenty of tabs. (This
  change has been mitigated in preparatory patches.)

* For pure header parsing, the current parser is likely faster than
  gmime, which parses the whole message rather than just the
  headers. Since we parse the message and its headers using gmime for
  indexing anyway, this avoids and extra header parsing round when
  adding new messages. In case of duplicate messages, we'll end up
  parsing the full message although just headers would be
  sufficient. All in all this should still speed up 'notmuch new'.

* Calls to notmuch_message_get_header() may be slightly slower than
  previously for headers that are not indexed in the database, due to
  parsing of the whole message. Within the notmuch code base, notmuch
  reply is the only such user.

lib/database.cc
lib/index.cc
lib/message-file.c
lib/notmuch-private.h

index aef748f75b5d61ca6f2f34c35a2af93d0bd81f70..1efb14d4a0bdc2327a366870069867ba392ad1aa 100644 (file)
@@ -1971,15 +1971,10 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
     if (ret)
        goto DONE;
 
-    notmuch_message_file_restrict_headers (message_file,
-                                          "date",
-                                          "from",
-                                          "in-reply-to",
-                                          "message-id",
-                                          "references",
-                                          "subject",
-                                          "to",
-                                          (char *) NULL);
+    /* Parse message up front to get better error status. */
+    ret = _notmuch_message_file_parse (message_file);
+    if (ret)
+       goto DONE;
 
     try {
        /* Before we do any real work, (especially before doing a
@@ -2066,7 +2061,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch,
            date = notmuch_message_file_get_header (message_file, "date");
            _notmuch_message_set_header_values (message, date, from, subject);
 
-           ret = _notmuch_message_index_file (message, filename);
+           ret = _notmuch_message_index_file (message, message_file);
            if (ret)
                goto DONE;
        } else {
index 2100cce047803f6acc059553afd32a794bb9257c..e1e2a3828f024988d59b8d28d9524a4a0ba99f28 100644 (file)
@@ -425,45 +425,17 @@ _index_mime_part (notmuch_message_t *message,
 
 notmuch_status_t
 _notmuch_message_index_file (notmuch_message_t *message,
-                            const char *filename)
+                            notmuch_message_file_t *message_file)
 {
-    GMimeStream *stream = NULL;
-    GMimeParser *parser = NULL;
-    GMimeMessage *mime_message = NULL;
+    GMimeMessage *mime_message;
     InternetAddressList *addresses;
-    FILE *file = NULL;
     const char *from, *subject;
-    notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
-    static int initialized = 0;
-    char from_buf[5];
+    notmuch_status_t status;
 
-    if (! initialized) {
-       g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
-       initialized = 1;
-    }
-
-    file = fopen (filename, "r");
-    if (! file) {
-       fprintf (stderr, "Error opening %s: %s\n", filename, strerror (errno));
-       ret = NOTMUCH_STATUS_FILE_ERROR;
-       goto DONE;
-    }
-
-    /* Is this mbox? */
-    if (fread (from_buf, sizeof (from_buf), 1, file) == 1 &&
-       strncmp (from_buf, "From ", 5) == 0) {
-       ret = NOTMUCH_STATUS_FILE_NOT_EMAIL;
-       goto DONE;
-    }
-    rewind (file);
-
-    /* Evil GMime steals my FILE* here so I won't fclose it. */
-    stream = g_mime_stream_file_new (file);
-
-    parser = g_mime_parser_new_with_stream (stream);
-    g_mime_parser_set_scan_from (parser, FALSE);
-
-    mime_message = g_mime_parser_construct_message (parser);
+    status = _notmuch_message_file_get_mime_message (message_file,
+                                                    &mime_message);
+    if (status)
+       return status;
 
     from = g_mime_message_get_sender (mime_message);
 
@@ -484,15 +456,5 @@ _notmuch_message_index_file (notmuch_message_t *message,
 
     _index_mime_part (message, g_mime_message_get_mime_part (mime_message));
 
-  DONE:
-    if (mime_message)
-       g_object_unref (mime_message);
-
-    if (parser)
-       g_object_unref (parser);
-
-    if (stream)
-       g_object_unref (stream);
-
-    return ret;
+    return NOTMUCH_STATUS_SUCCESS;
 }
index a2850c278b5a10ba9a5041a6662aab4ed09e3861..67828827e61d5a065b16e527013024c41b2abea4 100644 (file)
 
 #include <glib.h> /* GHashTable */
 
-typedef struct {
-    char *str;
-    size_t size;
-    size_t len;
-} header_value_closure_t;
-
 struct _notmuch_message_file {
     /* File object */
     FILE *file;
+    char *filename;
 
-    /* Header storage */
-    int restrict_headers;
+    /* Cache for decoded headers */
     GHashTable *headers;
-    int broken_headers;
-    int good_headers;
-    size_t header_size; /* Length of full message header in bytes. */
-
-    /* Parsing state */
-    char *line;
-    size_t line_size;
-    header_value_closure_t value;
 
-    int parsing_started;
-    int parsing_finished;
+    GMimeMessage *message;
 };
 
 static int
@@ -76,15 +61,12 @@ strcase_hash (const void *ptr)
 static int
 _notmuch_message_file_destructor (notmuch_message_file_t *message)
 {
-    if (message->line)
-       free (message->line);
-
-    if (message->value.size)
-       free (message->value.str);
-
     if (message->headers)
        g_hash_table_destroy (message->headers);
 
+    if (message->message)
+       g_object_unref (message->message);
+
     if (message->file)
        fclose (message->file);
 
@@ -102,20 +84,17 @@ _notmuch_message_file_open_ctx (void *ctx, const char *filename)
     if (unlikely (message == NULL))
        return NULL;
 
+    /* Only needed for error messages during parsing. */
+    message->filename = talloc_strdup (message, filename);
+    if (message->filename == NULL)
+       goto FAIL;
+
     talloc_set_destructor (message, _notmuch_message_file_destructor);
 
     message->file = fopen (filename, "r");
     if (message->file == NULL)
        goto FAIL;
 
-    message->headers = g_hash_table_new_full (strcase_hash,
-                                             strcase_equal,
-                                             free,
-                                             g_free);
-
-    message->parsing_started = 0;
-    message->parsing_finished = 0;
-
     return message;
 
   FAIL:
@@ -137,264 +116,202 @@ notmuch_message_file_close (notmuch_message_file_t *message)
     talloc_free (message);
 }
 
-void
-notmuch_message_file_restrict_headersv (notmuch_message_file_t *message,
-                                       va_list va_headers)
+static notmuch_bool_t
+is_mbox (FILE *file)
 {
-    char *header;
+    char from_buf[5];
+    notmuch_bool_t ret = FALSE;
 
-    if (message->parsing_started)
-       INTERNAL_ERROR ("notmuch_message_file_restrict_headers called after parsing has started");
+    /* Is this mbox? */
+    if (fread (from_buf, sizeof (from_buf), 1, file) == 1 &&
+       strncmp (from_buf, "From ", 5) == 0)
+       ret = TRUE;
 
-    while (1) {
-       header = va_arg (va_headers, char*);
-       if (header == NULL)
-           break;
-       g_hash_table_insert (message->headers,
-                            xstrdup (header), NULL);
-    }
+    rewind (file);
 
-    message->restrict_headers = 1;
+    return ret;
 }
 
-void
-notmuch_message_file_restrict_headers (notmuch_message_file_t *message, ...)
+notmuch_status_t
+_notmuch_message_file_parse (notmuch_message_file_t *message)
 {
-    va_list va_headers;
+    GMimeStream *stream;
+    GMimeParser *parser;
+    notmuch_status_t status = NOTMUCH_STATUS_SUCCESS;
+    static int initialized = 0;
 
-    va_start (va_headers, message);
+    if (message->message)
+       return NOTMUCH_STATUS_SUCCESS;
 
-    notmuch_message_file_restrict_headersv (message, va_headers);
-}
+    /* We no longer support mboxes at all. */
+    if (is_mbox (message->file))
+       return NOTMUCH_STATUS_FILE_NOT_EMAIL;
 
-static void
-copy_header_unfolding (header_value_closure_t *value,
-                      const char *chunk)
-{
-    char *last;
+    if (! initialized) {
+       g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
+       initialized = 1;
+    }
 
-    if (chunk == NULL)
-       return;
+    message->headers = g_hash_table_new_full (strcase_hash, strcase_equal,
+                                             free, g_free);
+    if (! message->headers)
+       return NOTMUCH_STATUS_OUT_OF_MEMORY;
 
-    while (*chunk == ' ' || *chunk == '\t')
-       chunk++;
+    stream = g_mime_stream_file_new (message->file);
 
-    if (value->len + 1 + strlen (chunk) + 1 > value->size) {
-       unsigned int new_size = value->size;
-       if (value->size == 0)
-           new_size = strlen (chunk) + 1;
-       else
-           while (value->len + 1 + strlen (chunk) + 1 > new_size)
-               new_size *= 2;
-       value->str = xrealloc (value->str, new_size);
-       value->size = new_size;
-    }
+    /* We'll own and fclose the FILE* ourselves. */
+    g_mime_stream_file_set_owner (GMIME_STREAM_FILE (stream), FALSE);
+
+    parser = g_mime_parser_new_with_stream (stream);
+    g_mime_parser_set_scan_from (parser, FALSE);
 
-    last = value->str + value->len;
-    if (value->len) {
-       *last = ' ';
-       last++;
-       value->len++;
+    message->message = g_mime_parser_construct_message (parser);
+    if (! message->message) {
+       status = NOTMUCH_STATUS_FILE_NOT_EMAIL;
+       goto DONE;
     }
 
-    strcpy (last, chunk);
-    value->len += strlen (chunk);
+  DONE:
+    g_object_unref (stream);
+    g_object_unref (parser);
+
+    if (status) {
+       g_hash_table_destroy (message->headers);
+       message->headers = NULL;
+
+       if (message->message) {
+           g_object_unref (message->message);
+           message->message = NULL;
+       }
 
-    last = value->str + value->len - 1;
-    if (*last == '\n') {
-       *last = '\0';
-       value->len--;
+       rewind (message->file);
     }
+
+    return status;
 }
 
-/* As a special-case, a value of NULL for header_desired will force
- * the entire header to be parsed if it is not parsed already. This is
- * used by the _notmuch_message_file_get_headers_end function.
- * Another special case is the Received: header. For this header we
- * want to concatenate all instances of the header instead of just
- * hashing the first instance as we use this when analyzing the path
- * the mail has taken from sender to recipient.
- */
-const char *
-notmuch_message_file_get_header (notmuch_message_file_t *message,
-                                const char *header_desired)
+notmuch_status_t
+_notmuch_message_file_get_mime_message (notmuch_message_file_t *message,
+                                       GMimeMessage **mime_message)
 {
-    int contains;
-    char *header, *decoded_value, *header_sofar, *combined_header;
-    const char *s, *colon;
-    int match, newhdr, hdrsofar, is_received;
-    static int initialized = 0;
+    notmuch_status_t status;
 
-    is_received = (strcmp(header_desired,"received") == 0);
+    status = _notmuch_message_file_parse (message);
+    if (status)
+       return status;
 
-    if (! initialized) {
-       g_mime_init (GMIME_ENABLE_RFC2047_WORKAROUNDS);
-       initialized = 1;
-    }
+    *mime_message = message->message;
 
-    message->parsing_started = 1;
-
-    if (header_desired == NULL)
-       contains = 0;
-    else
-       contains = g_hash_table_lookup_extended (message->headers,
-                                                header_desired, NULL,
-                                                (gpointer *) &decoded_value);
-
-    if (contains && decoded_value)
-       return decoded_value;
-
-    if (message->parsing_finished)
-       return "";
-
-#define NEXT_HEADER_LINE(closure)                              \
-    while (1) {                                                        \
-       ssize_t bytes_read = getline (&message->line,           \
-                                     &message->line_size,      \
-                                     message->file);           \
-       if (bytes_read == -1) {                                 \
-           message->parsing_finished = 1;                      \
-           break;                                              \
-       }                                                       \
-       if (*message->line == '\n') {                           \
-           message->parsing_finished = 1;                      \
-           break;                                              \
-       }                                                       \
-       if (closure &&                                          \
-           (*message->line == ' ' || *message->line == '\t'))  \
-       {                                                       \
-           copy_header_unfolding ((closure), message->line);   \
-       }                                                       \
-       if (*message->line == ' ' || *message->line == '\t')    \
-           message->header_size += strlen (message->line);     \
-       else                                                    \
-           break;                                              \
-    }
+    return NOTMUCH_STATUS_SUCCESS;
+}
 
-    if (message->line == NULL)
-       NEXT_HEADER_LINE (NULL);
+/*
+ * Get all instances of a header decoded and concatenated.
+ *
+ * The result must be freed using g_free().
+ *
+ * Return NULL on errors, empty string for non-existing headers.
+ */
+static char *
+_notmuch_message_file_get_combined_header (notmuch_message_file_t *message,
+                                          const char *header)
+{
+    GMimeHeaderList *headers;
+    GMimeHeaderIter *iter;
+    char *combined = NULL;
 
-    while (1) {
+    headers = g_mime_object_get_header_list (GMIME_OBJECT (message->message));
+    if (! headers)
+       return NULL;
 
-       if (message->parsing_finished)
-           break;
+    iter = g_mime_header_iter_new ();
+    if (! iter)
+       return NULL;
 
-       colon = strchr (message->line, ':');
+    if (! g_mime_header_list_get_iter (headers, iter))
+       goto DONE;
 
-       if (colon == NULL) {
-           message->broken_headers++;
-           /* A simple heuristic for giving up on things that just
-            * don't look like mail messages. */
-           if (message->broken_headers >= 10 &&
-               message->good_headers < 5)
-           {
-               message->parsing_finished = 1;
-               break;
-           }
-           NEXT_HEADER_LINE (NULL);
+    do {
+       const char *value;
+       char *decoded;
+
+       if (strcasecmp (g_mime_header_iter_get_name (iter), header) != 0)
            continue;
-       }
 
-       message->header_size += strlen (message->line);
+       /* Note that GMime retains ownership of value... */
+       value = g_mime_header_iter_get_value (iter);
 
-       message->good_headers++;
+       /* ... while decoded needs to be freed with g_free(). */
+       decoded = g_mime_utils_header_decode_text (value);
+       if (! decoded) {
+           if (combined) {
+               g_free (combined);
+               combined = NULL;
+           }
+           goto DONE;
+       }
 
-       header = xstrndup (message->line, colon - message->line);
+       if (combined) {
+           char *tmp = g_strdup_printf ("%s %s", combined, decoded);
+           g_free (decoded);
+           g_free (combined);
+           if (! tmp) {
+               combined = NULL;
+               goto DONE;
+           }
 
-       if (message->restrict_headers &&
-           ! g_hash_table_lookup_extended (message->headers,
-                                           header, NULL, NULL))
-       {
-           free (header);
-           NEXT_HEADER_LINE (NULL);
-           continue;
+           combined = tmp;
+       } else {
+           combined = decoded;
        }
+    } while (g_mime_header_iter_next (iter));
 
-       s = colon + 1;
-       while (*s == ' ' || *s == '\t')
-           s++;
+    /* Return empty string for non-existing headers. */
+    if (! combined)
+       combined = g_strdup ("");
 
-       message->value.len = 0;
-       copy_header_unfolding (&message->value, s);
+  DONE:
+    g_mime_header_iter_free (iter);
 
-       NEXT_HEADER_LINE (&message->value);
-
-       if (header_desired == NULL)
-           match = 0;
-       else
-           match = (strcasecmp (header, header_desired) == 0);
-
-       decoded_value = g_mime_utils_header_decode_text (message->value.str);
-       header_sofar = (char *)g_hash_table_lookup (message->headers, header);
-       /* we treat the Received: header special - we want to concat ALL of 
-        * the Received: headers we encounter.
-        * for everything else we return the first instance of a header */
-       if (strcasecmp(header, "received") == 0) {
-           if (header_sofar == NULL) {
-               /* first Received: header we encountered; just add it */
-               g_hash_table_insert (message->headers, header, decoded_value);
-           } else {
-               /* we need to add the header to those we already collected */
-               newhdr = strlen(decoded_value);
-               hdrsofar = strlen(header_sofar);
-               combined_header = g_malloc(hdrsofar + newhdr + 2);
-               strncpy(combined_header,header_sofar,hdrsofar);
-               *(combined_header+hdrsofar) = ' ';
-               strncpy(combined_header+hdrsofar+1,decoded_value,newhdr+1);
-               g_free (decoded_value);
-               g_hash_table_insert (message->headers, header, combined_header);
-           }
-       } else {
-           if (header_sofar == NULL) {
-               /* Only insert if we don't have a value for this header, yet. */
-               g_hash_table_insert (message->headers, header, decoded_value);
-           } else {
-               free (header);
-               g_free (decoded_value);
-               decoded_value = header_sofar;
-           }
-       }
-       /* if we found a match we can bail - unless of course we are
-        * collecting all the Received: headers */
-       if (match && !is_received)
-           return decoded_value;
-    }
+    return combined;
+}
 
-    if (message->parsing_finished) {
-        fclose (message->file);
-        message->file = NULL;
-    }
+const char *
+notmuch_message_file_get_header (notmuch_message_file_t *message,
+                                const char *header)
+{
+    const char *value;
+    char *decoded;
 
-    if (message->line)
-       free (message->line);
-    message->line = NULL;
+    if (_notmuch_message_file_parse (message))
+       return NULL;
 
-    if (message->value.size) {
-       free (message->value.str);
-       message->value.str = NULL;
-       message->value.size = 0;
-       message->value.len = 0;
+    /* If we have a cached decoded value, use it. */
+    value = g_hash_table_lookup (message->headers, header);
+    if (value)
+       return value;
+
+    if (strcasecmp (header, "received") == 0) {
+       /*
+        * The Received: header is special. We concatenate all
+        * instances of the header as we use this when analyzing the
+        * path the mail has taken from sender to recipient.
+        */
+       decoded = _notmuch_message_file_get_combined_header (message, header);
+    } else {
+       value = g_mime_object_get_header (GMIME_OBJECT (message->message),
+                                         header);
+       if (value)
+           decoded = g_mime_utils_header_decode_text (value);
+       else
+           decoded = g_strdup ("");
     }
 
-    /* For the Received: header we actually might end up here even
-     * though we found the header (as we force continued parsing
-     * in that case). So let's check if that's the header we were
-     * looking for and return the value that we found (if any)
-     */
-    if (is_received)
-       return (char *)g_hash_table_lookup (message->headers, "received");
-
-    /* We've parsed all headers and never found the one we're looking
-     * for. It's probably just not there, but let's check that we
-     * didn't make a mistake preventing us from seeing it. */
-    if (message->restrict_headers && header_desired &&
-       ! g_hash_table_lookup_extended (message->headers,
-                                       header_desired, NULL, NULL))
-    {
-       INTERNAL_ERROR ("Attempt to get header \"%s\" which was not\n"
-                       "included in call to notmuch_message_file_restrict_headers\n",
-                       header_desired);
-    }
+    if (! decoded)
+       return NULL;
+
+    /* Cache the decoded value. We also own the strings. */
+    g_hash_table_insert (message->headers, xstrdup (header), decoded);
 
-    return "";
+    return decoded;
 }
index 59eb2bc285a53b2dabfbe1e2dca271aa245e4979..703ae7bb7a014c06bb03f18d7b9994b42275ba21 100644 (file)
@@ -46,6 +46,8 @@ NOTMUCH_BEGIN_DECLS
 
 #include <talloc.h>
 
+#include <gmime/gmime.h>
+
 #include "xutil.h"
 #include "error_util.h"
 
@@ -320,13 +322,6 @@ notmuch_message_set_author (notmuch_message_t *message, const char *author);
 const char *
 notmuch_message_get_author (notmuch_message_t *message);
 
-
-/* index.cc */
-
-notmuch_status_t
-_notmuch_message_index_file (notmuch_message_t *message,
-                            const char *filename);
-
 /* message-file.c */
 
 /* XXX: I haven't decided yet whether these will actually get exported
@@ -352,31 +347,31 @@ _notmuch_message_file_open_ctx (void *ctx, const char *filename);
 void
 notmuch_message_file_close (notmuch_message_file_t *message);
 
-/* Restrict 'message' to only save the named headers.
+/* Parse the message.
  *
- * When the caller is only interested in a short list of headers,
- * known in advance, calling this function can avoid wasted time and
- * memory parsing/saving header values that will never be needed.
+ * This will be done automatically as necessary on other calls
+ * depending on it, but an explicit call allows for better error
+ * status reporting.
+ */
+notmuch_status_t
+_notmuch_message_file_parse (notmuch_message_file_t *message);
+
+/* Get the gmime message of a message file.
  *
- * The variable arguments should be a list of const char * with a
- * final '(const char *) NULL' to terminate the list.
+ * The message file is parsed as necessary.
  *
- * If this function is called, it must be called before any calls to
- * notmuch_message_get_header for this message.
+ * The GMimeMessage* is set to *mime_message on success (which the
+ * caller must not unref).
  *
- * After calling this function, if notmuch_message_get_header is
- * called with a header name not in this list, then NULL will be
- * returned even if that header exists in the actual message.
+ * XXX: Would be nice to not have to expose GMimeMessage here.
  */
-void
-notmuch_message_file_restrict_headers (notmuch_message_file_t *message, ...);
-
-/* Identical to notmuch_message_restrict_headers but accepting a va_list. */
-void
-notmuch_message_file_restrict_headersv (notmuch_message_file_t *message,
-                                       va_list va_headers);
+notmuch_status_t
+_notmuch_message_file_get_mime_message (notmuch_message_file_t *message,
+                                       GMimeMessage **mime_message);
 
 /* Get the value of the specified header from the message as a UTF-8 string.
+ *
+ * The message file is parsed as necessary.
  *
  * The header name is case insensitive.
  *
@@ -387,13 +382,19 @@ notmuch_message_file_restrict_headersv (notmuch_message_file_t *message,
  * only until the message is closed. The caller should copy it if
  * needing to modify the value or to hold onto it for longer.
  *
- * Returns NULL if the message does not contain a header line matching
- * 'header'.
+ * Returns NULL on errors, empty string if the message does not
+ * contain a header line matching 'header'.
  */
 const char *
 notmuch_message_file_get_header (notmuch_message_file_t *message,
                                 const char *header);
 
+/* index.cc */
+
+notmuch_status_t
+_notmuch_message_index_file (notmuch_message_t *message,
+                            notmuch_message_file_t *message_file);
+
 /* messages.c */
 
 typedef struct _notmuch_message_node {