From: Austin Clements <amdragon@MIT.EDU>
To: notmuch@notmuchmail.org
Subject: [PATCH 1/2] Implement a custom query parser with a mostly Xapian-compatible grammar.
Date: Thu, 23 Dec 2010 01:00:23 -0500
Message-Id: <1293084024-7893-2-git-send-email-amdragon@mit.edu>
In-Reply-To: <1293084024-7893-1-git-send-email-amdragon@mit.edu>
References: <1293084024-7893-1-git-send-email-amdragon@mit.edu>
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Cc: amdragon@mit.edu
Precedence: list

This parser takes an extra step through an intermediate representation
that is convenient to manipulate via pluggable transformation passes.
These are used to implement regular Xapian-style query prefixes, but
are flexible enough to accomplish far more.
---
 lib/Makefile.local     |    1 +
 lib/database-private.h |    6 +
 lib/notmuch-private.h  |  126 +++++++
 lib/qparser.cc         |  941 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 1074 insertions(+), 0 deletions(-)
 create mode 100644 lib/qparser.cc

diff --git a/lib/Makefile.local b/lib/Makefile.local
index 37d1c0d..37d3735 100644
--- a/lib/Makefile.local
+++ b/lib/Makefile.local
@@ -64,6 +64,7 @@ libnotmuch_cxx_srcs =		\
 	$(dir)/index.cc		\
 	$(dir)/message.cc	\
 	$(dir)/query.cc		\
+	$(dir)/qparser.cc	\
 	$(dir)/thread.cc
 
 libnotmuch_modules = $(libnotmuch_c_srcs:.c=.o) $(libnotmuch_cxx_srcs:.cc=.o)
diff --git a/lib/database-private.h b/lib/database-private.h
index 9f83407..358a71b 100644
--- a/lib/database-private.h
+++ b/lib/database-private.h
@@ -64,6 +64,12 @@ _notmuch_get_terms_with_prefix (void *ctx, Xapian::TermIterator &i,
 				Xapian::TermIterator &end,
 				const char *prefix);
 
+/* qparser.cc */
+
+/* Generate a Xapian query from a query AST. */
+Xapian::Query
+_notmuch_qparser_generate (_notmuch_qparser_t *qparser, _notmuch_token_t *root);
+
 #pragma GCC visibility pop
 
 #endif
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index b6f1095..be76c0c 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -508,6 +508,132 @@ notmuch_filenames_t *
 _notmuch_filenames_create (const void *ctx,
 			   notmuch_string_list_t *list);
 
+/* qparser.cc */
+
+typedef struct _notmuch_qparser _notmuch_qparser_t;
+
+enum _notmuch_token_type {
+    /* These first four token types appear only in the lexer output
+     * and never in the parse tree. */
+    TOK_LOVE, TOK_HATE, TOK_BRA, TOK_KET,
+    /* Binary operators.  These should have left and right children.
+     * ADJ and NEAR should have a distance. */
+    TOK_AND, TOK_OR, TOK_XOR, TOK_ADJ, TOK_NEAR,
+    /* Unary operators.  These have only a left child.  Xapian::Query
+     * has no pure NOT operator, so the generator treats NOT as the
+     * child of an AND specially, and otherwise represents it as
+     * "<all> AND_NOT x".  FILTER ignores the weights of the subquery
+     * and generates Xapian::Query::OP_FILTER if the left child of an
+     * AND or Xapian::Query::OP_SCALE_WEIGHT otherwise.  The text
+     * field of a PREFIX operator specifies the prefix.  PREFIX
+     * operators specify only syntactic prefixes, not database
+     * prefixes, and thus have no effect on the generated query. */
+    TOK_NOT, TOK_FILTER, TOK_PREFIX,
+    /* A single TOK_TERMS token can represent a single term, a quoted
+     * phrase, or an implicit phrase.  An implicit phrase is something
+     * like "foo/bar", for which the database contains two separate
+     * terms, but you want to treat them as a phrase, even though it's
+     * not quoted.  Xapian calls characters that implicitly connect
+     * terms into phrases "phrase generators."  We take a simpler
+     * approach and treat almost any non-whitespace character as a
+     * phrase generator. */
+    TOK_TERMS,
+    /* Like TOK_TERMS, but the term text should be taken literally,
+     * with no phrase splitting or whitespace removal.  The lexer
+     * only generates TOK_TERMS; the parser creates TOK_LIT. */
+    TOK_LIT,
+    /* TOK_END indicates the end of the token list.  Such tokens loop
+     * back on themselves so it's always safe to follow "next".
+     * These appear only in the lexer output. */
+    TOK_END
+};
+
+typedef struct _notmuch_token {
+    enum _notmuch_token_type type;
+    const char *text;
+
+    /* For TOK_ADJ and TOK_NEAR, this specifies the distance
+     * argument. */
+    int distance;
+
+    /* For TOK_PREFIX, the flags of this prefix. */
+    int prefixFlags;
+
+    /* For TOK_TERMS and TOK_LIT, the database prefix to use when
+     * generating database terms.  This must be filled in a
+     * transformation pass. */
+    char *prefix;
+
+    /* Link in the lexer token list. */
+    struct _notmuch_token *next;
+
+    /* Links in the intermediate AST. */
+    struct _notmuch_token *left, *right;
+} _notmuch_token_t;
+
+_notmuch_token_t *
+_notmuch_token_create (const void *ctx, enum _notmuch_token_type type,
+		       const char *text);
+
+char *
+_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok);
+
+char *
+_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root);
+
+_notmuch_qparser_t *
+_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch);
+
+/* Add a syntactic prefix.  This will appear as a TOK_PREFIX in the
+ * AST, but does not alone affect the final query.
+ *
+ * The literal flag affects lexing.  If true, this prefix must be
+ * followed by a regular term or quoted literal, which will not be
+ * stripped of whitespace or split in to a phrase.  The boolean flag
+ * affects parsing.  If true, then terms with this prefix will be
+ * combined into the query using the FILTER operator, so they must
+ * appear in the result and will not contribute to weights.  Xapian's
+ * "boolean prefixes" are both literal and boolean.
+ */
+void
+_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser,
+			     const char *prefix, notmuch_bool_t literal,
+			     notmuch_bool_t boolean);
+
+/* Add a transform pass to a query parser.  The transform function
+ * will be called with the root of the AST and should return a new AST
+ * root (which may be the same as the old root).
+ */
+void
+_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser,
+				_notmuch_token_t *(*transform) (
+				    _notmuch_token_t *ast, void *opaque),
+				void *opaque);
+
+/* Add a syntactic prefix (field) and a transform pass to transform
+ * that syntactic prefix into a database prefix (prefix).  This
+ * corresponds to Xapian's add_prefix and add_boolean_prefix
+ * functions. */
+void
+_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser,
+				const char *field, const char *prefix,
+				notmuch_bool_t boolean);
+
+/* Lex a query string, returning the first token in the token list.
+ * This is only meant for testing. */
+_notmuch_token_t *
+_notmuch_qparser_lex (_notmuch_qparser_t *qparser, const char *query);
+
+/* Parse a query string, returning the root of the AST. */
+_notmuch_token_t *
+_notmuch_qparser_parse (_notmuch_qparser_t *qparser, const char *query);
+
+/* Transform a parsed query, running the transforms in the order they
+ * were added to the query parser.  Return the root of the transformed
+ * AST. */
+_notmuch_token_t *
+_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root);
+
 #pragma GCC visibility pop
 
 NOTMUCH_END_DECLS
diff --git a/lib/qparser.cc b/lib/qparser.cc
new file mode 100644
index 0000000..4804d06
--- /dev/null
+++ b/lib/qparser.cc
@@ -0,0 +1,941 @@
+/* qparser.cc - Notmuch query parser
+ *
+ * Copyright © 2010 Austin Clements
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see http://www.gnu.org/licenses/ .
+ *
+ * Author: Austin Clements <amdragon@mit.edu>
+ */
+
+/*
+ * Query parsing is performed in a series of phases similar to those
+ * of a traditional compiler.
+ *
+ * 1) We tokenize the query, identifying operators and term phrases.
+ *    Note that a phrase (quoted or implicit) generates a single token
+ *    at this point, which we split up later (unlike in the Xapian
+ *    lexer).
+ * 2) We parse the token stream, generating an intermediate
+ *    representation in the form of a binary AST.  This IR is similar
+ *    to the Xapian::Query operators, but is designed to be more
+ *    easily manipulated.
+ * 3) We transform the parse tree, running a sequence of
+ *    caller-provided transformation functions over the tree.
+ * 4) We generate the Xapian::Query from the transformed IR.  This
+ *    step also splits phrase tokens into multiple query terms.
+ *    XXX and expands wildcard terms.
+ *
+ * To use the query parser, call _notmuch_qparser_parse to perform
+ * steps 1 and 2, _notmuch_qparser_transform to perform step 3, and
+ * _notmuch_qparser_generate to perform step 4.
+ *
+ * Still missing from this implementation:
+ * * Stemming - The stemming should probably be marked on TOK_TERMS
+ *   tokens.  Ideally, we can just pass this to the term generator.
+ * * Wildcard queries - This should be available in the IR so it's
+ *   easy to generate wildcard queries in a transformer.
+ * * Value ranges in the IR
+ * * Queries "" and "*"
+ * * Memory management is poorly suited to reusing qparser's.
+ */
+
+/* XXX notmuch currently registers "tag" as an exclusive boolean
+ * prefix, which means queries like "tag:x tag:y" will return messages
+ * with tag x OR tag y.  Is this intentional? */
+
+#include "notmuch-private.h"
+#include "database-private.h"
+
+#include <glib.h>		/* GHashTable */
+
+struct _notmuch_qparser {
+    notmuch_database_t *notmuch;
+
+    /* Maps from prefix strings to PREFIX_* flags */
+    GHashTable *prefixes;
+
+    struct _notmuch_qparser_transform *transforms;
+    struct _notmuch_qparser_transform **transformsTail;
+};
+
+enum {
+    PREFIX_LITERAL = 1<<1,
+    PREFIX_PROB    = 1<<2,
+    PREFIX_BOOL    = 1<<3,
+};
+
+struct _notmuch_qparser_transform {
+    struct _notmuch_qparser_transform *next;
+    _notmuch_token_t *(*transform) (_notmuch_token_t *ast, void *opaque);
+    void *opaque;
+};
+
+struct _notmuch_lexer_state {
+    _notmuch_qparser_t *qparser;
+    _notmuch_token_t *head;
+    _notmuch_token_t **tail;
+};
+
+struct _notmuch_generate_state {
+    _notmuch_qparser_t *qparser;
+    unsigned int termpos;
+};
+
+static const char *token_types[] = {
+    "LOVE", "HATE", "BRA", "KET",
+    "AND", "OR", "XOR", "ADJ", "NEAR",
+    "NOT", "FILTER", "PREFIX",
+    "TERMS", "LIT", "END"
+};
+
+/* The distinguished end token.  This simplifies the parser since it
+ * never has to worry about dereferencing next. */
+static _notmuch_token_t tok_end = {TOK_END, NULL, -1, FALSE, NULL,
+				   &tok_end, NULL, NULL};
+
+_notmuch_token_t *
+_notmuch_token_create (const void *ctx, enum _notmuch_token_type type,
+		       const char *text)
+{
+    struct _notmuch_token *tok = talloc (ctx, struct _notmuch_token);
+    tok->type = type;
+    tok->text = text;
+    if (type == TOK_ADJ || type == TOK_NEAR)
+	tok->distance = 10;
+    else
+	tok->distance = -1;
+    tok->prefixFlags = 0;
+    tok->prefix = NULL;
+    tok->left = tok->right = NULL;
+    return tok;
+}
+
+char *
+_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok)
+{
+    char *out = talloc_strdup (ctx, "");
+
+    for (; tok->type != TOK_END; tok = tok->next) {
+	out = talloc_asprintf_append (out, "%s%s",
+				      *out == 0 ? "" : " ",
+				      token_types[tok->type]);
+	if (tok->distance != -1)
+	    out = talloc_asprintf_append (out, "/%d", tok->distance);
+	if (tok->text)
+	    out = talloc_asprintf_append (out, ":%s", tok->text);
+    }
+
+    return out;
+}
+
+char *
+_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root)
+{
+    if (!root) {
+	return talloc_strdup (ctx, "<nil>");
+    } else if (root->type == TOK_TERMS || root->type == TOK_LIT) {
+	return talloc_strdup (ctx, root->text);
+    } else {
+	char *out = talloc_asprintf (ctx, "(%s", token_types[root->type]);
+	void *local = talloc_new (ctx);
+	if (root->type == TOK_PREFIX)
+	    out = talloc_asprintf_append (out, "/%s", root->text);
+	if (root->left)
+	    out = talloc_asprintf_append
+		(out, " %s", _notmuch_token_show_tree (local, root->left));
+	if (root->right)
+	    out = talloc_asprintf_append
+		(out, " %s", _notmuch_token_show_tree (local, root->right));
+	out = talloc_asprintf_append (out, ")");
+	talloc_free (local);
+	return out;
+    }
+}
+
+using Xapian::Unicode::is_whitespace;
+using Xapian::Unicode::is_wordchar;
+
+static Xapian::Utf8Iterator
+lex_skip_ws (Xapian::Utf8Iterator it)
+{
+    while (is_whitespace (*it))
+	it++;
+    return it;
+}
+
+static struct _notmuch_token *
+lex_emit (struct _notmuch_lexer_state *s, enum _notmuch_token_type type,
+	  char *text)
+{
+    struct _notmuch_token *tok = _notmuch_token_create (s, type, text);
+    tok->next = &tok_end;
+    *(s->tail) = tok;
+    s->tail = &tok->next;
+    return tok;
+}
+
+
+/* Lex a quoted phrase, returning an iterator pointing to the
+ * character following the closing quote.  If escaped, then accept two
+ * quotes as an escaped version of a single quote.
+ */
+static Xapian::Utf8Iterator
+lex_quoted_phrase (struct _notmuch_lexer_state *s,
+		   Xapian::Utf8Iterator it, notmuch_bool_t escaped)
+{
+    Xapian::Utf8Iterator next, orig, end;
+    char *term, *src, *dst;
+
+    /* Find the end of the phrase */
+    assert (*(it++) == '"');
+    for (next = it; next != end; ++next) {
+	if (*next == '"') {
+	    if (!escaped)
+		break;
+
+	    orig = next++;
+	    if (next == end || *next != '"') {
+		next = orig;
+		break;
+	    }
+	}
+    }
+
+    /* Xapian still lexes +/-/( in quotes mode and simply doesn't
+     * generate tokens for them.  For us, the term generator will
+     * discard them. */
+    term = talloc_strndup (s, it.raw (), next.raw () - it.raw ());
+    if (escaped) {
+	/* Replace doubled quotes with a single quote. */
+	for (src = dst = term; *src; ++src, ++dst) {
+	    *dst = *src;
+	    if (*src == '"')
+		++src;
+	}
+	*dst = '\0';
+    }
+    lex_emit (s, TOK_TERMS, term);
+
+    if (next != end)
+	++next;
+    return next;
+}
+
+static Xapian::Utf8Iterator
+lex_try_consume_prefix (_notmuch_qparser_t *qparser, Xapian::Utf8Iterator it,
+			char **prefixOut, int *flagsOut)
+{
+    Xapian::Utf8Iterator next (it), end;
+    char *prefix;
+    int flags;
+
+    *prefixOut = NULL;
+    while (next != end && *next != ':' && !is_whitespace (*next))
+	++next;
+    if (*next != ':')
+	return it;
+    /* Ignore if followed by <= ' ' or ')' */
+    ++next;
+    if (*next <= ' ' || *next == ')')
+	return it;
+
+    prefix = talloc_strndup (qparser, it.raw (), next.raw () - it.raw() - 1);
+    flags = GPOINTER_TO_INT (g_hash_table_lookup (qparser->prefixes, prefix));
+    if (!flags) {
+	/* Not a known prefix */
+	talloc_free (prefix);
+	return it;
+    }
+    *prefixOut = prefix;
+    *flagsOut = flags;
+    return next;
+}
+
+static Xapian::Utf8Iterator
+lex_consume_term (const void *ctx, Xapian::Utf8Iterator it, char **termOut)
+{
+    Xapian::Utf8Iterator next (it), end;
+    /* Xapian permits other characters to separate term phrases.  For
+     * example, "x#y" is parsed as two separate (non-phrase) terms.
+     * However, because the characters allowed in a term are
+     * context-sensitive, replicating this is very hard.  Here we take
+     * a simpler approach where only whitespace and a few operator
+     * characters that are never term characters separate terms. */
+    while (next != end && !strchr ("()\"", *next) && !is_whitespace (*next))
+	++next;
+    *termOut = talloc_strndup (ctx, it.raw (), next.raw () - it.raw ());
+    return next;
+}
+
+static notmuch_bool_t
+lex_operator (struct _notmuch_lexer_state *s, char *term,
+	      const char *op, enum _notmuch_token_type type)
+{
+    size_t oplen = strlen (op);
+
+    if (strcasecmp (term, op) == 0) {
+	lex_emit (s, type, term);
+	return true;
+    }
+
+    /* Check for ADJ or NEAR with argument.  Our parsing of this is
+     * slightly incompatible with Xapian, but I believe this to be a
+     * bug in Xapian.  Xapian parses "x NEAR/y z" as three term
+     * phrases, "x", "near y", and "z", like we do.  However, it
+     * behaves differently if the bad NEAR operator is at the end of
+     * the query, parsing "x NEAR/y" like "x NEAR y". */
+    if ((type == TOK_ADJ || type == TOK_NEAR) &&
+	strncasecmp (term, op, oplen) == 0 &&
+	term[oplen] == '/') {
+	/* Try to parse the distance argument */
+	char *end;
+	int distance = strtol (&term[oplen + 1], &end, 10);
+	if (distance && !*end) {
+	    struct _notmuch_token *tok = lex_emit (s, type, term);
+	    tok->distance = distance;
+	    return true;
+	}
+    }
+    
+    return false;
+}
+
+static _notmuch_token_t *
+lex (_notmuch_qparser_t *qparser, const char *query)
+{
+    Xapian::Utf8Iterator it (query), next, end;
+    struct _notmuch_lexer_state *s;
+    struct _notmuch_token *tok;
+    char *term;
+    int prefixFlags;
+
+    s = talloc (qparser, struct _notmuch_lexer_state);
+    if (!s)
+	return NULL;
+    s->qparser = qparser;
+    s->head = &tok_end;
+    s->tail = &s->head;
+
+    while (it != end) {
+	unsigned ch;
+	if ((it = lex_skip_ws (it)) == end)
+	    break;
+
+	ch = *it;
+	switch (ch) {
+	case '+':
+	case '-':
+	    ++it;
+	    /* Xapian ignores these unless preceded by whitespace or
+	     * an open paren, which has the effect of ignoring all
+	     * +'s in "x +++y", "x#+y", and "(x)+y".  We don't
+	     * bother. */
+
+	    /* Ignore if followed by a space or another + or - */
+	    if (is_whitespace (*it) || *it == '+' || *it == '-')
+		continue;
+	    lex_emit (s, ch == '+' ? TOK_LOVE : TOK_HATE, NULL);
+	    continue;
+
+	case '"':
+	    it = lex_quoted_phrase(s, it, false);
+	    continue;
+
+	case '(':
+	    ++it;
+	    /* Xapian ignores this unless preceded by whitespace,
+	     * parens, +, or -.  We don't bother. */
+	    lex_emit (s, TOK_BRA, NULL);
+	    continue;
+
+	case ')':
+	    ++it;
+	    lex_emit (s, TOK_KET, NULL);
+	    continue;
+	}
+
+	/* Scan for a prefix */
+	next = lex_try_consume_prefix (qparser, it, &term, &prefixFlags);
+	if (term && (*next == '"' || *next == '(' || is_wordchar (*next))) {
+	    int literal = prefixFlags & PREFIX_LITERAL;
+	    tok = lex_emit (s, TOK_PREFIX, term);
+	    tok->prefixFlags = prefixFlags;
+
+	    it = next;
+	    if (literal && *it == '"') {
+		/* Literal quoted strings keep everything and allow
+		 * quote escaping, unlike regular quoted phrases. */
+		it = lex_quoted_phrase (s, it, true);
+	    } else if (literal) {
+		/* Xapian uses anything up to the next space or ')'
+		 * (because literal prefixes can't be applied to
+		 * subqueries).  I disagree with Xapian here, since
+		 * Xapian will keep the open paren but not the close
+		 * paren.  Better would be to balance them. */
+		if (*next == '(')
+		    ++next;
+		while (next != end && *next > ' ' && *next != ')')
+		    ++next;
+		term = talloc_strndup (s, it.raw (),
+				       next.raw () - it.raw ());
+		lex_emit (s, TOK_TERMS, term);
+		it = next;
+	    }
+	    /* For quoted strings and subqueries following non-literal
+	     * prefixes and regular terms, we lex as usual. */
+	    continue;
+	}
+
+	/* Scan for a term phrase or operator */
+	it = lex_consume_term (s, it, &term);
+
+	/* Check operators */
+	if (lex_operator (s, term, "and",  TOK_AND) ||
+	    lex_operator (s, term, "not",  TOK_NOT) ||
+	    lex_operator (s, term, "xor",  TOK_XOR) ||
+	    lex_operator (s, term, "or",   TOK_OR)  ||
+	    lex_operator (s, term, "adj",  TOK_ADJ) ||
+	    lex_operator (s, term, "near", TOK_NEAR))
+	    continue;
+
+	/* Must be a term */
+	lex_emit (s, TOK_TERMS, term);
+    }
+
+    return s->head;
+}
+
+static void
+add_to_query (const void *ctx, _notmuch_token_t **query,
+	      _notmuch_token_type op, _notmuch_token_t *right)
+{
+    if (!*query)
+	*query = right;
+    else if (right) {
+	_notmuch_token_t *tok = _notmuch_token_create (ctx, op, NULL);
+	tok->left = *query;
+	tok->right = right;
+	*query = tok;
+    }
+}
+
+static _notmuch_token_t *
+parse_expr (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok);
+static _notmuch_token_t *
+parse_term (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok);
+
+static _notmuch_token_t *
+parse_prob (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)
+{
+    /* A prob is a sequence of three types of subqueries.  Because the
+     * default query operator is AND, loved terms are not treated
+     * specially.
+     * 1) Probabilistic terms (prefixed or not).  These are combined
+     *    with the default query operator, AND.
+     * 2) Terms with a boolean prefix.  All of the terms with the same
+     *    prefix are combined with OR.  Different prefixes are
+     *    combined with AND.
+     * 3) Hate terms.  These are combined with OR.
+     * The final IR looks like
+     *   (probs AND (FILTER bools)) AND (NOT hates)
+     */
+
+    _notmuch_token_t *probs, *hates, *sub, *q;
+    GHashTable *bools;
+    int done = 0;
+
+    probs = hates = NULL;
+    bools = g_hash_table_new (g_str_hash, g_str_equal);
+    while (!done) {
+	switch ((*tok)->type) {
+	case TOK_KET:
+	    if (prec < 10) {
+		/* Unmatched close paren.  Ignore it. */
+		*tok = (*tok)->next;
+		break;
+	    }
+	    /* Fall through */
+	case TOK_AND: case TOK_OR: case TOK_XOR: case TOK_NOT:
+	case TOK_END:
+	    /* End of the prob.  Might be empty. */
+	    done = 1;
+	    break;
+
+	case TOK_HATE:
+	    *tok = (*tok)->next;
+	    sub = parse_expr (qparser, prec + 1, tok);
+	    add_to_query (qparser, &hates, TOK_OR, sub);
+	    break;
+
+	case TOK_PREFIX:
+	    sub = parse_expr (qparser, prec + 1, tok);
+	    if (!sub)
+		break;
+	    if (sub->prefixFlags & PREFIX_PROB) {
+		add_to_query (qparser, &probs, TOK_AND, sub);
+	    } else {
+		_notmuch_token_t *newb, *pre = (_notmuch_token_t*)
+		    g_hash_table_lookup (bools, sub->text);
+		if (!pre) {
+		    newb = sub;
+		} else {
+		    /* OR subqueries with same prefix */
+		    newb = _notmuch_token_create (qparser, TOK_OR, NULL);
+		    newb->left = pre;
+		    newb->right = sub;
+		}
+		g_hash_table_insert (bools, (void*)sub->text, newb);
+	    }
+	    break;
+
+	case TOK_LOVE:
+	    /* Join into the query like any other term, since the
+	     * default operator is AND anyway. */
+	    *tok = (*tok)->next;
+	    /* Fall through */
+	case TOK_BRA:
+	case TOK_TERMS:
+	case TOK_LIT:
+	    sub = parse_expr (qparser, prec + 1, tok);
+	    add_to_query (qparser, &probs, TOK_AND, sub);
+	    break;
+
+	case TOK_ADJ:
+	case TOK_NEAR:
+	case TOK_FILTER:
+	    /* XXX Can ADJ or NEAR happen? */
+	    INTERNAL_ERROR ("Unexpected token type %d", (*tok)->type);
+	}
+    }
+
+    q = probs;
+    if (g_hash_table_size (bools)) {
+	/* Merge boolean filters */
+	_notmuch_token_t *filter;
+	GList *vals = g_hash_table_get_values (bools), *l;
+	sub = NULL;
+	for (l = vals; l; l = l->next)
+	    add_to_query (qparser, &sub, TOK_AND, (_notmuch_token_t *) l->data);
+	g_list_free (vals);
+
+	/* Create filter */
+	filter = _notmuch_token_create (qparser, TOK_FILTER, NULL);
+	filter->left = sub;
+	add_to_query (qparser, &q, TOK_AND, filter);
+    }
+    if (hates) {
+	sub = _notmuch_token_create (qparser, TOK_NOT, NULL);
+	sub->left = hates;
+	add_to_query (qparser, &q, TOK_AND, sub);
+    }
+    g_hash_table_unref (bools);
+    return q;
+}
+
+static _notmuch_token_t *
+parse_term (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)
+{
+    _notmuch_token_t *sub;
+
+    if ((*tok)->type == TOK_END) {
+	/* Arises from things like "x -".  Ignore. */
+	return NULL;
+    } else if ((*tok)->type == TOK_PREFIX) {
+	sub = *tok;
+	*tok = (*tok)->next;
+	sub->left = parse_term (qparser, prec, tok);
+	if (!sub->left)
+	    return NULL;
+	if (sub->prefixFlags & PREFIX_LITERAL) {
+	    /* Convert TOK_TERMS to TOK_LIT */
+	    assert (sub->left->type == TOK_TERMS);
+	    sub->left->type = TOK_LIT;
+	} else if (sub->left->type == TOK_PREFIX) {
+	    sub->left = sub->left->left;
+	}
+	return sub;
+    } else if ((*tok)->type == TOK_BRA) {
+	*tok = (*tok)->next;
+	sub = parse_expr (qparser, prec + 10 - (prec%10), tok);
+	if ((*tok)->type == TOK_KET)
+	    *tok = (*tok)->next;
+	return sub;
+    }
+
+    if ((*tok)->type != TOK_TERMS && (*tok)->type != TOK_LIT) {
+	/* Arises from "+AND", "-AND", "prob:AND".  We could give up
+	 * and return nothing, but it seems nicer to treat the
+	 * operator as a term if it came from the original query. */
+	if (!(*tok)->text)
+	    return NULL;
+	(*tok)->type = TOK_TERMS;
+    }
+
+    sub = *tok;
+    *tok = (*tok)->next;
+    return sub;
+}
+
+static _notmuch_token_t *
+parse_expr (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)
+{
+    /* If you squint at the Xapian grammar, it's a precedence grammar
+     * with one strange "prob" level.  This implements all but the
+     * prob level and the leaf "term" level.
+     *
+     * prec is (nesting level * 10 + precedence level).  Normally we
+     * only care about the precedence level, but the nesting level is
+     * important for recovering from unbalanced parens.
+     */
+    int bprec = prec % 10;
+    if (bprec == 3) {
+	if ((*tok)->type == TOK_NOT) {
+	    /* Unary NOT */
+	    _notmuch_token_t *root = *tok;
+	    *tok = (*tok)->next;
+	    root->left = parse_expr (qparser, prec, tok);
+	    return root;
+	}
+
+	return parse_prob (qparser, prec, tok);
+    }
+    if (bprec == 5)
+	return parse_term (qparser, prec, tok);
+
+    _notmuch_token_t *left = parse_expr (qparser, prec + 1, tok);
+    while ((bprec == 0 && (*tok)->type == TOK_OR) ||
+	   (bprec == 1 && (*tok)->type == TOK_XOR) ||
+	   (bprec == 2 && ((*tok)->type == TOK_AND ||
+			   (*tok)->type == TOK_NOT)) ||
+	   (bprec == 4 && ((*tok)->type == TOK_NEAR ||
+			   (*tok)->type == TOK_ADJ))) {
+	_notmuch_token_t *root = *tok;
+	if (root->type == TOK_NOT) {
+	    /* Replace "x NOT y" with (AND x (NOT y)) by inserting an
+	     * AND operator and leaning on the unary NOT rule. */
+	    root = _notmuch_token_create (qparser, TOK_AND, NULL);
+	} else {
+	    *tok = (*tok)->next;
+	}
+
+	/* Xapian treats x AND -y as x AND NOT y, which affects
+	 * precedence. */
+	if (root->type == TOK_AND && (*tok)->type == TOK_HATE)
+	    (*tok)->type = TOK_NOT;
+
+	_notmuch_token_t *right = parse_expr (qparser, prec + 1, tok);
+	if (left && right) {
+	    root->left = left;
+	    root->right = right;
+	} else {
+	    /* Left or right was empty.  This may be a syntax error
+	     * like an omitted expression, or an empty expression. */
+	    root = left ? left : right;
+	}
+	left = root;
+    }
+    return left;
+}
+
+static _notmuch_token_t *
+parse (_notmuch_qparser_t *qparser, _notmuch_token_t *toks)
+{
+    _notmuch_token_t *root = parse_expr (qparser, 0, &toks);
+    if (toks->type != TOK_END)
+	INTERNAL_ERROR ("Token stream not fully consumed: %s",
+			_notmuch_token_show_list (qparser, toks));
+    return root;
+}
+
+static char *
+generate_term (const void *ctx, const char *term, const char *prefix)
+{
+    notmuch_bool_t colon = FALSE;
+    if (isupper (term[0]) && strlen (prefix) > 1)
+	colon = TRUE;
+    return talloc_asprintf (ctx, "%s%s%s", prefix, colon ? ":" : "", term);
+}
+
+static Xapian::Query
+generate_terms (struct _notmuch_generate_state *s, const char *text,
+		const char *prefix)
+{
+    Xapian::TermGenerator tg;
+    Xapian::Document doc;
+    Xapian::TermIterator it, end;
+    Xapian::PositionIterator pit, pend;
+    Xapian::Query *qs, q;
+    unsigned int nterms = 0;
+
+    if (prefix)
+	tg.index_text (text, 1, prefix);
+    else
+	tg.index_text (text);
+    doc = tg.get_document ();
+
+    /* Find the highest positioned term.  Positions are 1-based. */
+    end = doc.termlist_end ();
+    for (it = doc.termlist_begin (); it != end; ++it) {
+	pend = it.positionlist_end ();
+	for (pit = it.positionlist_begin (); pit != pend; ++pit) {
+	    if (*pit > nterms)
+		nterms = *pit;
+	}
+    }
+    if (nterms == 0)
+	return Xapian::Query ();
+
+    /* Extract terms */
+    qs = new Xapian::Query[nterms];
+    for (it = doc.termlist_begin (); it != end; ++it) {
+	pend = it.positionlist_end ();
+	for (pit = it.positionlist_begin (); pit != pend; ++pit)
+	    qs[*pit - 1] = Xapian::Query (*it, 1, s->termpos + *pit - 1);
+    }
+    s->termpos += nterms;
+
+    /* Build query */
+    q = Xapian::Query (Xapian::Query::OP_PHRASE, qs, qs + nterms, nterms);
+    delete [] qs;
+    return q;
+}
+
+static Xapian::Query
+generate (struct _notmuch_generate_state *s, _notmuch_token_t *root)
+{
+    using Xapian::Query;
+    Query l, r;
+    Query::op op;
+
+    if (!root)
+	return Query ();
+
+    /* The tricky part here is that generate is allowed to return a
+     * empty query, indicating that the user's query cannot be
+     * expressed.  For example, the term "#" is an empty query. */
+
+    switch (root->type) {
+    case TOK_AND:
+	if (root->left->type == TOK_NOT && root->right->type != TOK_NOT) {
+	    _notmuch_token_t *tmp = root->left;
+	    root->left = root->right;
+	    root->right = tmp;
+	}
+	l = generate (s, root->left);
+	if (l.empty()) {
+	    return generate (s, root->right);
+	} else if (root->right->type == TOK_NOT) {
+	    r = generate (s, root->right->left);
+	    op = Query::OP_AND_NOT;
+	} else if (root->right->type == TOK_FILTER) {
+	    r = generate (s, root->right->left);
+	    op = Query::OP_FILTER;
+	} else {
+	    r = generate (s, root->right);
+	    op = Query::OP_AND;
+	}
+	if (r.empty())
+	    return l;
+	return Query (op, l, r);
+
+    case TOK_NOT:
+	l = generate (s, root->left);
+	if (l.empty())
+	    return l;
+	return Query (Query::OP_AND_NOT, Query ("", 1, 0), l);
+
+    case TOK_FILTER:
+	l = generate(s, root->left);
+	if (l.empty())
+	    return l;
+	return Query (Query::OP_SCALE_WEIGHT, l, 0.0);
+
+    case TOK_OR:
+    case TOK_XOR:
+	l = generate (s, root->left);
+	r = generate (s, root->right);
+	if (l.empty())
+	    return r;
+	if (r.empty())
+	    return l;
+	return Query (root->type == TOK_OR ? Query::OP_OR : Query::OP_XOR,
+		      l, r);
+
+    case TOK_ADJ:
+    case TOK_NEAR:
+    {
+	/* XXX This differs from Xapian.  Xapian treats ADJ and NEAR
+	 * as n-ary operators and constructs the whole list of terms
+	 * to be searched within the largest specified window. */
+	Query subs[2];
+	subs[0] = Query (generate (s, root->left));
+	subs[1] = Query (generate (s, root->right));
+	if (subs[0].empty())
+	    return subs[1];
+	if (subs[1].empty())
+	    return subs[0];
+	return Query (root->type == TOK_ADJ ? Query::OP_PHRASE : Query::OP_NEAR,
+		      subs, subs + 2, root->distance + 1);
+    }
+
+    case TOK_PREFIX:
+	return generate (s, root->left);
+
+    case TOK_TERMS:
+	return generate_terms (s, root->text, root->prefix);
+
+    case TOK_LIT:
+	return Query (generate_term (s, root->text, root->prefix));
+
+    case TOK_LOVE:
+    case TOK_HATE:
+    case TOK_BRA:
+    case TOK_KET:
+    case TOK_END:
+	INTERNAL_ERROR ("Illegal token type %s in IR", token_types[root->type]);
+    }
+    /* We leave this outside the switch so the compiler will warn us
+     * if we missed a token type. */
+    INTERNAL_ERROR ("Illegal token type %d in IR", root->type);
+    return Xapian::Query ();
+}
+
+static int
+_notmuch_qparser_destructor (_notmuch_qparser_t *qparser)
+{
+    g_hash_table_unref (qparser->prefixes);
+    return 0;
+}
+
+_notmuch_qparser_t *
+_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch)
+{
+    _notmuch_qparser_t *qparser = talloc (ctx, _notmuch_qparser_t);
+    if (!qparser)
+	return NULL;
+    qparser->prefixes = NULL;
+    talloc_set_destructor (qparser, _notmuch_qparser_destructor);
+
+    qparser->notmuch = notmuch;
+    qparser->prefixes = g_hash_table_new (g_str_hash, g_str_equal);
+    qparser->transforms = NULL;
+    qparser->transformsTail = &qparser->transforms;
+
+    return qparser;
+}
+
+void
+_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser,
+			     const char *prefix, notmuch_bool_t literal,
+			     notmuch_bool_t boolean)
+{
+    int flags = ((literal ? PREFIX_LITERAL : 0) |
+		 (boolean ? PREFIX_BOOL : PREFIX_PROB));
+    g_hash_table_insert (qparser->prefixes, talloc_strdup (qparser, prefix),
+			 GINT_TO_POINTER (flags));
+}
+
+void
+_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser,
+				_notmuch_token_t *(*transform) (
+				    _notmuch_token_t *ast, void *opaque),
+				void *opaque)
+{
+    struct _notmuch_qparser_transform *t;
+    t = talloc (qparser, struct _notmuch_qparser_transform);
+    t->next = NULL;
+    t->transform = transform;
+    t->opaque = opaque;
+    *qparser->transformsTail = t;
+    qparser->transformsTail = &t->next;
+}
+
+struct _notmuch_transform_prefix_info {
+    char *field, *prefix;
+};
+
+static _notmuch_token_t *
+transform_prefix_rec (_notmuch_token_t *root, char *field, char *prefix,
+		      notmuch_bool_t active)
+{
+    if (!root)
+	return NULL;
+    if (root->type == TOK_PREFIX) {
+	active = (strcmp (field, root->text) == 0);
+    } else if (root->type == TOK_TERMS || root->type == TOK_LIT) {
+	if (active)
+	    root->prefix = prefix;
+    }
+    transform_prefix_rec (root->left, field, prefix, active);
+    transform_prefix_rec (root->right, field, prefix, active);
+    return root;
+}
+
+static _notmuch_token_t *
+transform_prefix (_notmuch_token_t *root, void *opaque)
+{
+    struct _notmuch_transform_prefix_info *info =
+	(struct _notmuch_transform_prefix_info*)opaque;
+    return transform_prefix_rec (root, info->field, info->prefix, FALSE);
+}
+
+void
+_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser,
+				const char *field, const char *prefix,
+				notmuch_bool_t boolean)
+{
+    struct _notmuch_transform_prefix_info *info;
+    info = talloc (qparser, struct _notmuch_transform_prefix_info);
+    info->field = talloc_strdup (info, field);
+    info->prefix = talloc_strdup (info, prefix);
+    _notmuch_qparser_add_prefix (qparser, field, boolean, boolean);
+    _notmuch_qparser_add_transform (qparser, transform_prefix, info);
+}
+
+_notmuch_token_t *
+_notmuch_qparser_lex (_notmuch_qparser_t *qparser, const char *query)
+{
+    return lex (qparser, query);
+}
+
+_notmuch_token_t *
+_notmuch_qparser_parse (_notmuch_qparser_t *qparser, const char *query)
+{
+    _notmuch_token_t *toks = lex (qparser, query);
+    return parse (qparser, toks);
+}
+
+_notmuch_token_t *
+_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root)
+{
+    struct _notmuch_qparser_transform *t;
+    for (t = qparser->transforms; t; t = t->next)
+	root = t->transform (root, t->opaque);
+    return root;
+}
+
+Xapian::Query
+_notmuch_qparser_generate (_notmuch_qparser_t *qparser, _notmuch_token_t *root)
+{
+    struct _notmuch_generate_state *s;
+    s = talloc (qparser, struct _notmuch_generate_state);
+    s->qparser = qparser;
+    s->termpos = 1;
+    Xapian::Query query = generate (s, root);
+    talloc_free (s);
+    if (query.empty())
+	/* Return all documents */
+	return Xapian::Query ("", 1, 0);
+    return query;
+}
-- 
1.7.2.3