From: Carl Worth Date: Mon, 19 Oct 2009 23:38:44 +0000 (-0700) Subject: Rework message parsing to use getline rather than mmap. X-Git-Tag: 0.1~826 X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=371091139abb00e0eb6b17cd19311e2a30fd7470;p=notmuch.git Rework message parsing to use getline rather than mmap. The line-based parsing can be a bit awkward when wanting to peek ahead, (say, for folded header values), but it's so convenient to be able to trust that a string terminator exists on every line so it cleans up the code considerably. --- diff --git a/message.c b/message.c index 97df4b27..03583c8d 100644 --- a/message.c +++ b/message.c @@ -24,19 +24,25 @@ #include /* GHashTable */ +typedef struct { + char *str; + size_t size; + size_t len; +} header_value_closure_t; + struct _notmuch_message { - /* File objects */ - int fd; - void *map; + /* File object */ + FILE *file; /* Header storage */ int restrict_headers; GHashTable *headers; /* Parsing state */ - char *start; - size_t size; - const char *next_line; + char *line; + size_t line_size; + header_value_closure_t value; + int parsing_started; int parsing_finished; }; @@ -66,20 +72,11 @@ notmuch_message_t * notmuch_message_open (const char *filename) { notmuch_message_t *message; - struct stat st; message = xcalloc (1, sizeof (notmuch_message_t)); - message->fd = open (filename, O_RDONLY); - if (message->fd < 0) - goto FAIL; - - if (fstat (message->fd, &st) < 0) - goto FAIL; - - message->map = mmap (NULL, st.st_size, PROT_READ, MAP_PRIVATE, - message->fd, 0); - if (message->map == MAP_FAILED) + message->file = fopen (filename, "r"); + if (message->file == NULL) goto FAIL; message->headers = g_hash_table_new_full (strcase_hash, @@ -87,9 +84,6 @@ notmuch_message_open (const char *filename) free, free); - message->start = (char *) message->map; - message->size = st.st_size; - message->next_line = message->start; message->parsing_started = 0; message->parsing_finished = 0; @@ -111,10 +105,8 @@ notmuch_message_close (notmuch_message_t *message) if (message->headers) g_hash_table_unref (message->headers); - if (message->map) - munmap (message->map, message->size); - if (message->fd) - close (message->fd); + if (message->file) + fclose (message->file); free (message); } @@ -151,80 +143,44 @@ notmuch_message_restrict_headers (notmuch_message_t *message, ...) notmuch_message_restrict_headersv (message, va_headers); } -/* With our mmapped file, we don't get the benefit of terminated - * strings, so we can't use things like strchr(). We don't even know - * if there's a newline at the end of the file so we also have to be - * careful of that. Basically, every time we advance a pointer while - * parsing we must ensure we don't go beyond our buffer. - */ -#define WITHIN(s) (((s) - message->start) < (message->size -1)) - -/* In each of the macros below, "without overrunning the buffer" means - * that the macro will never dereference a character beyond the end of - * the buffer. However, all of the macros may return a pointer - * pointing to the first character beyond the buffer. So callers - * should test with WITHIN before dereferencing the result. */ - -/* Advance 'ptr' until pointing at a non-space character in the same - * line, (without overrunning the buffer) */ -#define SKIP_SPACE_IN_LINE(ptr) \ - while (WITHIN (ptr) && (*(ptr) == ' ' || *(ptr) == '\t')) \ - (ptr)++; - -/* Advance 'ptr' until pointing at a non-space character, (without - * overrunning the buffer) */ -#define SKIP_SPACE(ptr) \ - while (WITHIN (ptr) && isspace(*(ptr))) \ - (ptr)++; - -/* Advance 'ptr' to the first occurrence of 'c' within the same - * line, (without overrunning the buffer). */ -#define ADVANCE_TO(ptr, c) \ - while (WITHIN (ptr) && *(ptr) != '\n' && \ - *(ptr) != (c)) \ - { \ - (ptr)++; \ +void +copy_header_unfolding (header_value_closure_t *value, + const char *chunk) +{ + char *last; + + if (chunk == NULL) + return; + + while (*chunk == ' ' || *chunk == '\t') + chunk++; + + if (value->len + 1 + strlen (chunk) + 1 > value->size) { + int new_size = value->size; + if (value->size == 0) + new_size = strlen (chunk) + 1; + else + while (value->len + 1 + strlen (chunk) + 1 > new_size) + new_size *= 2; + value->str = xrealloc (value->str, new_size); + value->size = new_size; } -/* Advance 'ptr' to the beginning of the next line not starting with - * an initial tab character, (without overruning the buffer). */ -#define ADVANCE_TO_NEXT_HEADER_LINE(ptr) \ - do { \ - ADVANCE_TO ((ptr), '\n'); \ - if (WITHIN (ptr)) \ - (ptr)++; \ - } while (WITHIN (ptr) && \ - (*(ptr) == '\t' || *(ptr) == ' ')); - -char * -copy_header_value (const char *start, const char *end) -{ - const char *s; - char *result, *r; - int was_newline = 0; - - result = xmalloc (end - start + 1); - - s = start; - r = result; - - while (s < end) { - if (*s == '\n') { - was_newline = 1; - } else { - if (*s == '\t' && was_newline) - *r = ' '; - else - *r = *s; - r++; - was_newline = 0; - } - s++; + last = value->str + value->len; + if (value->len) { + *last = ' '; + last++; + value->len++; } - *r = '\0'; + strcpy (last, chunk); + value->len += strlen (chunk); - return result; + last = value->str + value->len - 1; + if (*last == '\n') { + *last = '\0'; + value->len--; + } } const char * @@ -232,8 +188,8 @@ notmuch_message_get_header (notmuch_message_t *message, const char *header_desired) { int contains; - const char *s, *colon; char *header, *value; + const char *s, *colon; int match; message->parsing_started = 1; @@ -247,54 +203,82 @@ notmuch_message_get_header (notmuch_message_t *message, if (message->parsing_finished) return NULL; - while (1) { - s = message->next_line; +#define NEXT_HEADER_LINE(closure) \ + do { \ + ssize_t bytes_read = getline (&message->line, \ + &message->line_size, \ + message->file); \ + if (bytes_read == -1) { \ + message->parsing_finished = 1; \ + break; \ + } \ + if (*message->line == '\n') { \ + message->parsing_finished = 1; \ + break; \ + } \ + if (closure && \ + (*message->line == ' ' || *message->line == '\t')) \ + { \ + copy_header_unfolding ((closure), message->line); \ + } \ + } while (*message->line == ' ' || *message->line == '\t'); + + if (message->line == NULL) + NEXT_HEADER_LINE (NULL); - if (*s == '\n') { - message->parsing_finished = 1; - return NULL; - } + while (1) { - if (*s == '\t') { - fprintf (stderr, "Warning: Unexpected continued value\n"); - ADVANCE_TO_NEXT_HEADER_LINE (message->next_line); - continue; - } + if (message->parsing_finished) + break; - colon = s; - ADVANCE_TO (colon, ':'); + colon = strchr (message->line, ':'); - if (! WITHIN (colon) || *colon == '\n') { - fprintf (stderr, "Warning: Unexpected non-header line: %s\n", s); - ADVANCE_TO_NEXT_HEADER_LINE (message->next_line); + if (colon == NULL) { + fprintf (stderr, "Warning: Unexpected non-header line: %s\n", + message->line); + NEXT_HEADER_LINE (NULL); continue; } - header = xstrndup (s, colon - s); + header = xstrndup (message->line, colon - message->line); if (message->restrict_headers && ! g_hash_table_lookup_extended (message->headers, header, NULL, NULL)) { free (header); - message->next_line = colon; - ADVANCE_TO_NEXT_HEADER_LINE (message->next_line); + NEXT_HEADER_LINE (NULL); continue; } s = colon + 1; - SKIP_SPACE_IN_LINE (s); + while (*s == ' ' || *s == '\t') + s++; - message->next_line = s; - ADVANCE_TO_NEXT_HEADER_LINE (message->next_line); + message->value.len = 0; + copy_header_unfolding (&message->value, s); - value = copy_header_value (s, message->next_line); + NEXT_HEADER_LINE (&message->value); match = (strcasecmp (header, header_desired) == 0); - g_hash_table_insert (message->headers, header, value); + g_hash_table_insert (message->headers, header, + xstrdup (message->value.str)); if (match) return value; } + + if (message->line) + free (message->line); + message->line = NULL; + + if (message->value.size) { + free (message->value.str); + message->value.str = NULL; + message->value.size = 0; + message->value.len = 0; + } + + return NULL; } diff --git a/notmuch-private.h b/notmuch-private.h index b7d27e91..449aff71 100644 --- a/notmuch-private.h +++ b/notmuch-private.h @@ -23,6 +23,8 @@ #include "notmuch.h" +#define _GNU_SOURCE /* For getline */ + #include #include #include