Return-Path: X-Original-To: notmuch@notmuchmail.org Delivered-To: notmuch@notmuchmail.org Received: from localhost (localhost [127.0.0.1]) by olra.theworths.org (Postfix) with ESMTP id EEA3E41A578 for ; Wed, 22 Dec 2010 22:01:00 -0800 (PST) X-Virus-Scanned: Debian amavisd-new at olra.theworths.org X-Spam-Flag: NO X-Spam-Score: 0 X-Spam-Level: X-Spam-Status: No, score=0 tagged_above=-999 required=5 tests=[none] autolearn=disabled Received: from olra.theworths.org ([127.0.0.1]) by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id KHIIz+jV92tG for ; Wed, 22 Dec 2010 22:00:47 -0800 (PST) Received: from dmz-mailsec-scanner-2.mit.edu (DMZ-MAILSEC-SCANNER-2.MIT.EDU [18.9.25.13]) by olra.theworths.org (Postfix) with ESMTP id D4656431FD0 for ; Wed, 22 Dec 2010 22:00:45 -0800 (PST) X-AuditID: 1209190d-b7cacae000000a14-49-4d12e58c1219 Received: from mailhub-auth-4.mit.edu ( [18.7.62.39]) by dmz-mailsec-scanner-2.mit.edu (Symantec Brightmail Gateway) with SMTP id 52.A7.02580.C85E21D4; Thu, 23 Dec 2010 01:00:44 -0500 (EST) Received: from outgoing.mit.edu (OUTGOING-AUTH.MIT.EDU [18.7.22.103]) by mailhub-auth-4.mit.edu (8.13.8/8.9.2) with ESMTP id oBN60hbt007101; Thu, 23 Dec 2010 01:00:43 -0500 Received: from awakening.csail.mit.edu (awakening.csail.mit.edu [18.26.4.91]) (authenticated bits=0) (User authenticated as amdragon@ATHENA.MIT.EDU) by outgoing.mit.edu (8.13.6/8.12.4) with ESMTP id oBN60gnO004788 (version=TLSv1/SSLv3 cipher=AES256-SHA bits=256 verify=NOT); Thu, 23 Dec 2010 01:00:43 -0500 (EST) Received: from amthrax by awakening.csail.mit.edu with local (Exim 4.72) (envelope-from ) id 1PVeEE-00024F-Eq; Thu, 23 Dec 2010 01:00:42 -0500 From: Austin Clements To: notmuch@notmuchmail.org Subject: [PATCH 1/2] Implement a custom query parser with a mostly Xapian-compatible grammar. Date: Thu, 23 Dec 2010 01:00:23 -0500 Message-Id: <1293084024-7893-2-git-send-email-amdragon@mit.edu> X-Mailer: git-send-email 1.7.2.3 In-Reply-To: <1293084024-7893-1-git-send-email-amdragon@mit.edu> References: <1293084024-7893-1-git-send-email-amdragon@mit.edu> MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-Brightmail-Tracker: AAAAARbzB9Y= Cc: amdragon@mit.edu X-BeenThere: notmuch@notmuchmail.org X-Mailman-Version: 2.1.13 Precedence: list List-Id: "Use and development of the notmuch mail system." List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 23 Dec 2010 06:01:02 -0000 This parser takes an extra step through an intermediate representation that is convenient to manipulate via pluggable transformation passes. These are used to implement regular Xapian-style query prefixes, but are flexible enough to accomplish far more. --- lib/Makefile.local | 1 + lib/database-private.h | 6 + lib/notmuch-private.h | 126 +++++++ lib/qparser.cc | 941 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1074 insertions(+), 0 deletions(-) create mode 100644 lib/qparser.cc diff --git a/lib/Makefile.local b/lib/Makefile.local index 37d1c0d..37d3735 100644 --- a/lib/Makefile.local +++ b/lib/Makefile.local @@ -64,6 +64,7 @@ libnotmuch_cxx_srcs = \ $(dir)/index.cc \ $(dir)/message.cc \ $(dir)/query.cc \ + $(dir)/qparser.cc \ $(dir)/thread.cc libnotmuch_modules = $(libnotmuch_c_srcs:.c=.o) $(libnotmuch_cxx_srcs:.cc=.o) diff --git a/lib/database-private.h b/lib/database-private.h index 9f83407..358a71b 100644 --- a/lib/database-private.h +++ b/lib/database-private.h @@ -64,6 +64,12 @@ _notmuch_get_terms_with_prefix (void *ctx, Xapian::TermIterator &i, Xapian::TermIterator &end, const char *prefix); +/* qparser.cc */ + +/* Generate a Xapian query from a query AST. */ +Xapian::Query +_notmuch_qparser_generate (_notmuch_qparser_t *qparser, _notmuch_token_t *root); + #pragma GCC visibility pop #endif diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h index b6f1095..be76c0c 100644 --- a/lib/notmuch-private.h +++ b/lib/notmuch-private.h @@ -508,6 +508,132 @@ notmuch_filenames_t * _notmuch_filenames_create (const void *ctx, notmuch_string_list_t *list); +/* qparser.cc */ + +typedef struct _notmuch_qparser _notmuch_qparser_t; + +enum _notmuch_token_type { + /* These first four token types appear only in the lexer output + * and never in the parse tree. */ + TOK_LOVE, TOK_HATE, TOK_BRA, TOK_KET, + /* Binary operators. These should have left and right children. + * ADJ and NEAR should have a distance. */ + TOK_AND, TOK_OR, TOK_XOR, TOK_ADJ, TOK_NEAR, + /* Unary operators. These have only a left child. Xapian::Query + * has no pure NOT operator, so the generator treats NOT as the + * child of an AND specially, and otherwise represents it as + * " AND_NOT x". FILTER ignores the weights of the subquery + * and generates Xapian::Query::OP_FILTER if the left child of an + * AND or Xapian::Query::OP_SCALE_WEIGHT otherwise. The text + * field of a PREFIX operator specifies the prefix. PREFIX + * operators specify only syntactic prefixes, not database + * prefixes, and thus have no effect on the generated query. */ + TOK_NOT, TOK_FILTER, TOK_PREFIX, + /* A single TOK_TERMS token can represent a single term, a quoted + * phrase, or an implicit phrase. An implicit phrase is something + * like "foo/bar", for which the database contains two separate + * terms, but you want to treat them as a phrase, even though it's + * not quoted. Xapian calls characters that implicitly connect + * terms into phrases "phrase generators." We take a simpler + * approach and treat almost any non-whitespace character as a + * phrase generator. */ + TOK_TERMS, + /* Like TOK_TERMS, but the term text should be taken literally, + * with no phrase splitting or whitespace removal. The lexer + * only generates TOK_TERMS; the parser creates TOK_LIT. */ + TOK_LIT, + /* TOK_END indicates the end of the token list. Such tokens loop + * back on themselves so it's always safe to follow "next". + * These appear only in the lexer output. */ + TOK_END +}; + +typedef struct _notmuch_token { + enum _notmuch_token_type type; + const char *text; + + /* For TOK_ADJ and TOK_NEAR, this specifies the distance + * argument. */ + int distance; + + /* For TOK_PREFIX, the flags of this prefix. */ + int prefixFlags; + + /* For TOK_TERMS and TOK_LIT, the database prefix to use when + * generating database terms. This must be filled in a + * transformation pass. */ + char *prefix; + + /* Link in the lexer token list. */ + struct _notmuch_token *next; + + /* Links in the intermediate AST. */ + struct _notmuch_token *left, *right; +} _notmuch_token_t; + +_notmuch_token_t * +_notmuch_token_create (const void *ctx, enum _notmuch_token_type type, + const char *text); + +char * +_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok); + +char * +_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root); + +_notmuch_qparser_t * +_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch); + +/* Add a syntactic prefix. This will appear as a TOK_PREFIX in the + * AST, but does not alone affect the final query. + * + * The literal flag affects lexing. If true, this prefix must be + * followed by a regular term or quoted literal, which will not be + * stripped of whitespace or split in to a phrase. The boolean flag + * affects parsing. If true, then terms with this prefix will be + * combined into the query using the FILTER operator, so they must + * appear in the result and will not contribute to weights. Xapian's + * "boolean prefixes" are both literal and boolean. + */ +void +_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser, + const char *prefix, notmuch_bool_t literal, + notmuch_bool_t boolean); + +/* Add a transform pass to a query parser. The transform function + * will be called with the root of the AST and should return a new AST + * root (which may be the same as the old root). + */ +void +_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser, + _notmuch_token_t *(*transform) ( + _notmuch_token_t *ast, void *opaque), + void *opaque); + +/* Add a syntactic prefix (field) and a transform pass to transform + * that syntactic prefix into a database prefix (prefix). This + * corresponds to Xapian's add_prefix and add_boolean_prefix + * functions. */ +void +_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser, + const char *field, const char *prefix, + notmuch_bool_t boolean); + +/* Lex a query string, returning the first token in the token list. + * This is only meant for testing. */ +_notmuch_token_t * +_notmuch_qparser_lex (_notmuch_qparser_t *qparser, const char *query); + +/* Parse a query string, returning the root of the AST. */ +_notmuch_token_t * +_notmuch_qparser_parse (_notmuch_qparser_t *qparser, const char *query); + +/* Transform a parsed query, running the transforms in the order they + * were added to the query parser. Return the root of the transformed + * AST. */ +_notmuch_token_t * +_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root); + #pragma GCC visibility pop NOTMUCH_END_DECLS diff --git a/lib/qparser.cc b/lib/qparser.cc new file mode 100644 index 0000000..4804d06 --- /dev/null +++ b/lib/qparser.cc @@ -0,0 +1,941 @@ +/* qparser.cc - Notmuch query parser + * + * Copyright © 2010 Austin Clements + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see http://www.gnu.org/licenses/ . + * + * Author: Austin Clements + */ + +/* + * Query parsing is performed in a series of phases similar to those + * of a traditional compiler. + * + * 1) We tokenize the query, identifying operators and term phrases. + * Note that a phrase (quoted or implicit) generates a single token + * at this point, which we split up later (unlike in the Xapian + * lexer). + * 2) We parse the token stream, generating an intermediate + * representation in the form of a binary AST. This IR is similar + * to the Xapian::Query operators, but is designed to be more + * easily manipulated. + * 3) We transform the parse tree, running a sequence of + * caller-provided transformation functions over the tree. + * 4) We generate the Xapian::Query from the transformed IR. This + * step also splits phrase tokens into multiple query terms. + * XXX and expands wildcard terms. + * + * To use the query parser, call _notmuch_qparser_parse to perform + * steps 1 and 2, _notmuch_qparser_transform to perform step 3, and + * _notmuch_qparser_generate to perform step 4. + * + * Still missing from this implementation: + * * Stemming - The stemming should probably be marked on TOK_TERMS + * tokens. Ideally, we can just pass this to the term generator. + * * Wildcard queries - This should be available in the IR so it's + * easy to generate wildcard queries in a transformer. + * * Value ranges in the IR + * * Queries "" and "*" + * * Memory management is poorly suited to reusing qparser's. + */ + +/* XXX notmuch currently registers "tag" as an exclusive boolean + * prefix, which means queries like "tag:x tag:y" will return messages + * with tag x OR tag y. Is this intentional? */ + +#include "notmuch-private.h" +#include "database-private.h" + +#include /* GHashTable */ + +struct _notmuch_qparser { + notmuch_database_t *notmuch; + + /* Maps from prefix strings to PREFIX_* flags */ + GHashTable *prefixes; + + struct _notmuch_qparser_transform *transforms; + struct _notmuch_qparser_transform **transformsTail; +}; + +enum { + PREFIX_LITERAL = 1<<1, + PREFIX_PROB = 1<<2, + PREFIX_BOOL = 1<<3, +}; + +struct _notmuch_qparser_transform { + struct _notmuch_qparser_transform *next; + _notmuch_token_t *(*transform) (_notmuch_token_t *ast, void *opaque); + void *opaque; +}; + +struct _notmuch_lexer_state { + _notmuch_qparser_t *qparser; + _notmuch_token_t *head; + _notmuch_token_t **tail; +}; + +struct _notmuch_generate_state { + _notmuch_qparser_t *qparser; + unsigned int termpos; +}; + +static const char *token_types[] = { + "LOVE", "HATE", "BRA", "KET", + "AND", "OR", "XOR", "ADJ", "NEAR", + "NOT", "FILTER", "PREFIX", + "TERMS", "LIT", "END" +}; + +/* The distinguished end token. This simplifies the parser since it + * never has to worry about dereferencing next. */ +static _notmuch_token_t tok_end = {TOK_END, NULL, -1, FALSE, NULL, + &tok_end, NULL, NULL}; + +_notmuch_token_t * +_notmuch_token_create (const void *ctx, enum _notmuch_token_type type, + const char *text) +{ + struct _notmuch_token *tok = talloc (ctx, struct _notmuch_token); + tok->type = type; + tok->text = text; + if (type == TOK_ADJ || type == TOK_NEAR) + tok->distance = 10; + else + tok->distance = -1; + tok->prefixFlags = 0; + tok->prefix = NULL; + tok->left = tok->right = NULL; + return tok; +} + +char * +_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok) +{ + char *out = talloc_strdup (ctx, ""); + + for (; tok->type != TOK_END; tok = tok->next) { + out = talloc_asprintf_append (out, "%s%s", + *out == 0 ? "" : " ", + token_types[tok->type]); + if (tok->distance != -1) + out = talloc_asprintf_append (out, "/%d", tok->distance); + if (tok->text) + out = talloc_asprintf_append (out, ":%s", tok->text); + } + + return out; +} + +char * +_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root) +{ + if (!root) { + return talloc_strdup (ctx, ""); + } else if (root->type == TOK_TERMS || root->type == TOK_LIT) { + return talloc_strdup (ctx, root->text); + } else { + char *out = talloc_asprintf (ctx, "(%s", token_types[root->type]); + void *local = talloc_new (ctx); + if (root->type == TOK_PREFIX) + out = talloc_asprintf_append (out, "/%s", root->text); + if (root->left) + out = talloc_asprintf_append + (out, " %s", _notmuch_token_show_tree (local, root->left)); + if (root->right) + out = talloc_asprintf_append + (out, " %s", _notmuch_token_show_tree (local, root->right)); + out = talloc_asprintf_append (out, ")"); + talloc_free (local); + return out; + } +} + +using Xapian::Unicode::is_whitespace; +using Xapian::Unicode::is_wordchar; + +static Xapian::Utf8Iterator +lex_skip_ws (Xapian::Utf8Iterator it) +{ + while (is_whitespace (*it)) + it++; + return it; +} + +static struct _notmuch_token * +lex_emit (struct _notmuch_lexer_state *s, enum _notmuch_token_type type, + char *text) +{ + struct _notmuch_token *tok = _notmuch_token_create (s, type, text); + tok->next = &tok_end; + *(s->tail) = tok; + s->tail = &tok->next; + return tok; +} + + +/* Lex a quoted phrase, returning an iterator pointing to the + * character following the closing quote. If escaped, then accept two + * quotes as an escaped version of a single quote. + */ +static Xapian::Utf8Iterator +lex_quoted_phrase (struct _notmuch_lexer_state *s, + Xapian::Utf8Iterator it, notmuch_bool_t escaped) +{ + Xapian::Utf8Iterator next, orig, end; + char *term, *src, *dst; + + /* Find the end of the phrase */ + assert (*(it++) == '"'); + for (next = it; next != end; ++next) { + if (*next == '"') { + if (!escaped) + break; + + orig = next++; + if (next == end || *next != '"') { + next = orig; + break; + } + } + } + + /* Xapian still lexes +/-/( in quotes mode and simply doesn't + * generate tokens for them. For us, the term generator will + * discard them. */ + term = talloc_strndup (s, it.raw (), next.raw () - it.raw ()); + if (escaped) { + /* Replace doubled quotes with a single quote. */ + for (src = dst = term; *src; ++src, ++dst) { + *dst = *src; + if (*src == '"') + ++src; + } + *dst = '\0'; + } + lex_emit (s, TOK_TERMS, term); + + if (next != end) + ++next; + return next; +} + +static Xapian::Utf8Iterator +lex_try_consume_prefix (_notmuch_qparser_t *qparser, Xapian::Utf8Iterator it, + char **prefixOut, int *flagsOut) +{ + Xapian::Utf8Iterator next (it), end; + char *prefix; + int flags; + + *prefixOut = NULL; + while (next != end && *next != ':' && !is_whitespace (*next)) + ++next; + if (*next != ':') + return it; + /* Ignore if followed by <= ' ' or ')' */ + ++next; + if (*next <= ' ' || *next == ')') + return it; + + prefix = talloc_strndup (qparser, it.raw (), next.raw () - it.raw() - 1); + flags = GPOINTER_TO_INT (g_hash_table_lookup (qparser->prefixes, prefix)); + if (!flags) { + /* Not a known prefix */ + talloc_free (prefix); + return it; + } + *prefixOut = prefix; + *flagsOut = flags; + return next; +} + +static Xapian::Utf8Iterator +lex_consume_term (const void *ctx, Xapian::Utf8Iterator it, char **termOut) +{ + Xapian::Utf8Iterator next (it), end; + /* Xapian permits other characters to separate term phrases. For + * example, "x#y" is parsed as two separate (non-phrase) terms. + * However, because the characters allowed in a term are + * context-sensitive, replicating this is very hard. Here we take + * a simpler approach where only whitespace and a few operator + * characters that are never term characters separate terms. */ + while (next != end && !strchr ("()\"", *next) && !is_whitespace (*next)) + ++next; + *termOut = talloc_strndup (ctx, it.raw (), next.raw () - it.raw ()); + return next; +} + +static notmuch_bool_t +lex_operator (struct _notmuch_lexer_state *s, char *term, + const char *op, enum _notmuch_token_type type) +{ + size_t oplen = strlen (op); + + if (strcasecmp (term, op) == 0) { + lex_emit (s, type, term); + return true; + } + + /* Check for ADJ or NEAR with argument. Our parsing of this is + * slightly incompatible with Xapian, but I believe this to be a + * bug in Xapian. Xapian parses "x NEAR/y z" as three term + * phrases, "x", "near y", and "z", like we do. However, it + * behaves differently if the bad NEAR operator is at the end of + * the query, parsing "x NEAR/y" like "x NEAR y". */ + if ((type == TOK_ADJ || type == TOK_NEAR) && + strncasecmp (term, op, oplen) == 0 && + term[oplen] == '/') { + /* Try to parse the distance argument */ + char *end; + int distance = strtol (&term[oplen + 1], &end, 10); + if (distance && !*end) { + struct _notmuch_token *tok = lex_emit (s, type, term); + tok->distance = distance; + return true; + } + } + + return false; +} + +static _notmuch_token_t * +lex (_notmuch_qparser_t *qparser, const char *query) +{ + Xapian::Utf8Iterator it (query), next, end; + struct _notmuch_lexer_state *s; + struct _notmuch_token *tok; + char *term; + int prefixFlags; + + s = talloc (qparser, struct _notmuch_lexer_state); + if (!s) + return NULL; + s->qparser = qparser; + s->head = &tok_end; + s->tail = &s->head; + + while (it != end) { + unsigned ch; + if ((it = lex_skip_ws (it)) == end) + break; + + ch = *it; + switch (ch) { + case '+': + case '-': + ++it; + /* Xapian ignores these unless preceded by whitespace or + * an open paren, which has the effect of ignoring all + * +'s in "x +++y", "x#+y", and "(x)+y". We don't + * bother. */ + + /* Ignore if followed by a space or another + or - */ + if (is_whitespace (*it) || *it == '+' || *it == '-') + continue; + lex_emit (s, ch == '+' ? TOK_LOVE : TOK_HATE, NULL); + continue; + + case '"': + it = lex_quoted_phrase(s, it, false); + continue; + + case '(': + ++it; + /* Xapian ignores this unless preceded by whitespace, + * parens, +, or -. We don't bother. */ + lex_emit (s, TOK_BRA, NULL); + continue; + + case ')': + ++it; + lex_emit (s, TOK_KET, NULL); + continue; + } + + /* Scan for a prefix */ + next = lex_try_consume_prefix (qparser, it, &term, &prefixFlags); + if (term && (*next == '"' || *next == '(' || is_wordchar (*next))) { + int literal = prefixFlags & PREFIX_LITERAL; + tok = lex_emit (s, TOK_PREFIX, term); + tok->prefixFlags = prefixFlags; + + it = next; + if (literal && *it == '"') { + /* Literal quoted strings keep everything and allow + * quote escaping, unlike regular quoted phrases. */ + it = lex_quoted_phrase (s, it, true); + } else if (literal) { + /* Xapian uses anything up to the next space or ')' + * (because literal prefixes can't be applied to + * subqueries). I disagree with Xapian here, since + * Xapian will keep the open paren but not the close + * paren. Better would be to balance them. */ + if (*next == '(') + ++next; + while (next != end && *next > ' ' && *next != ')') + ++next; + term = talloc_strndup (s, it.raw (), + next.raw () - it.raw ()); + lex_emit (s, TOK_TERMS, term); + it = next; + } + /* For quoted strings and subqueries following non-literal + * prefixes and regular terms, we lex as usual. */ + continue; + } + + /* Scan for a term phrase or operator */ + it = lex_consume_term (s, it, &term); + + /* Check operators */ + if (lex_operator (s, term, "and", TOK_AND) || + lex_operator (s, term, "not", TOK_NOT) || + lex_operator (s, term, "xor", TOK_XOR) || + lex_operator (s, term, "or", TOK_OR) || + lex_operator (s, term, "adj", TOK_ADJ) || + lex_operator (s, term, "near", TOK_NEAR)) + continue; + + /* Must be a term */ + lex_emit (s, TOK_TERMS, term); + } + + return s->head; +} + +static void +add_to_query (const void *ctx, _notmuch_token_t **query, + _notmuch_token_type op, _notmuch_token_t *right) +{ + if (!*query) + *query = right; + else if (right) { + _notmuch_token_t *tok = _notmuch_token_create (ctx, op, NULL); + tok->left = *query; + tok->right = right; + *query = tok; + } +} + +static _notmuch_token_t * +parse_expr (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok); +static _notmuch_token_t * +parse_term (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok); + +static _notmuch_token_t * +parse_prob (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok) +{ + /* A prob is a sequence of three types of subqueries. Because the + * default query operator is AND, loved terms are not treated + * specially. + * 1) Probabilistic terms (prefixed or not). These are combined + * with the default query operator, AND. + * 2) Terms with a boolean prefix. All of the terms with the same + * prefix are combined with OR. Different prefixes are + * combined with AND. + * 3) Hate terms. These are combined with OR. + * The final IR looks like + * (probs AND (FILTER bools)) AND (NOT hates) + */ + + _notmuch_token_t *probs, *hates, *sub, *q; + GHashTable *bools; + int done = 0; + + probs = hates = NULL; + bools = g_hash_table_new (g_str_hash, g_str_equal); + while (!done) { + switch ((*tok)->type) { + case TOK_KET: + if (prec < 10) { + /* Unmatched close paren. Ignore it. */ + *tok = (*tok)->next; + break; + } + /* Fall through */ + case TOK_AND: case TOK_OR: case TOK_XOR: case TOK_NOT: + case TOK_END: + /* End of the prob. Might be empty. */ + done = 1; + break; + + case TOK_HATE: + *tok = (*tok)->next; + sub = parse_expr (qparser, prec + 1, tok); + add_to_query (qparser, &hates, TOK_OR, sub); + break; + + case TOK_PREFIX: + sub = parse_expr (qparser, prec + 1, tok); + if (!sub) + break; + if (sub->prefixFlags & PREFIX_PROB) { + add_to_query (qparser, &probs, TOK_AND, sub); + } else { + _notmuch_token_t *newb, *pre = (_notmuch_token_t*) + g_hash_table_lookup (bools, sub->text); + if (!pre) { + newb = sub; + } else { + /* OR subqueries with same prefix */ + newb = _notmuch_token_create (qparser, TOK_OR, NULL); + newb->left = pre; + newb->right = sub; + } + g_hash_table_insert (bools, (void*)sub->text, newb); + } + break; + + case TOK_LOVE: + /* Join into the query like any other term, since the + * default operator is AND anyway. */ + *tok = (*tok)->next; + /* Fall through */ + case TOK_BRA: + case TOK_TERMS: + case TOK_LIT: + sub = parse_expr (qparser, prec + 1, tok); + add_to_query (qparser, &probs, TOK_AND, sub); + break; + + case TOK_ADJ: + case TOK_NEAR: + case TOK_FILTER: + /* XXX Can ADJ or NEAR happen? */ + INTERNAL_ERROR ("Unexpected token type %d", (*tok)->type); + } + } + + q = probs; + if (g_hash_table_size (bools)) { + /* Merge boolean filters */ + _notmuch_token_t *filter; + GList *vals = g_hash_table_get_values (bools), *l; + sub = NULL; + for (l = vals; l; l = l->next) + add_to_query (qparser, &sub, TOK_AND, (_notmuch_token_t *) l->data); + g_list_free (vals); + + /* Create filter */ + filter = _notmuch_token_create (qparser, TOK_FILTER, NULL); + filter->left = sub; + add_to_query (qparser, &q, TOK_AND, filter); + } + if (hates) { + sub = _notmuch_token_create (qparser, TOK_NOT, NULL); + sub->left = hates; + add_to_query (qparser, &q, TOK_AND, sub); + } + g_hash_table_unref (bools); + return q; +} + +static _notmuch_token_t * +parse_term (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok) +{ + _notmuch_token_t *sub; + + if ((*tok)->type == TOK_END) { + /* Arises from things like "x -". Ignore. */ + return NULL; + } else if ((*tok)->type == TOK_PREFIX) { + sub = *tok; + *tok = (*tok)->next; + sub->left = parse_term (qparser, prec, tok); + if (!sub->left) + return NULL; + if (sub->prefixFlags & PREFIX_LITERAL) { + /* Convert TOK_TERMS to TOK_LIT */ + assert (sub->left->type == TOK_TERMS); + sub->left->type = TOK_LIT; + } else if (sub->left->type == TOK_PREFIX) { + sub->left = sub->left->left; + } + return sub; + } else if ((*tok)->type == TOK_BRA) { + *tok = (*tok)->next; + sub = parse_expr (qparser, prec + 10 - (prec%10), tok); + if ((*tok)->type == TOK_KET) + *tok = (*tok)->next; + return sub; + } + + if ((*tok)->type != TOK_TERMS && (*tok)->type != TOK_LIT) { + /* Arises from "+AND", "-AND", "prob:AND". We could give up + * and return nothing, but it seems nicer to treat the + * operator as a term if it came from the original query. */ + if (!(*tok)->text) + return NULL; + (*tok)->type = TOK_TERMS; + } + + sub = *tok; + *tok = (*tok)->next; + return sub; +} + +static _notmuch_token_t * +parse_expr (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok) +{ + /* If you squint at the Xapian grammar, it's a precedence grammar + * with one strange "prob" level. This implements all but the + * prob level and the leaf "term" level. + * + * prec is (nesting level * 10 + precedence level). Normally we + * only care about the precedence level, but the nesting level is + * important for recovering from unbalanced parens. + */ + int bprec = prec % 10; + if (bprec == 3) { + if ((*tok)->type == TOK_NOT) { + /* Unary NOT */ + _notmuch_token_t *root = *tok; + *tok = (*tok)->next; + root->left = parse_expr (qparser, prec, tok); + return root; + } + + return parse_prob (qparser, prec, tok); + } + if (bprec == 5) + return parse_term (qparser, prec, tok); + + _notmuch_token_t *left = parse_expr (qparser, prec + 1, tok); + while ((bprec == 0 && (*tok)->type == TOK_OR) || + (bprec == 1 && (*tok)->type == TOK_XOR) || + (bprec == 2 && ((*tok)->type == TOK_AND || + (*tok)->type == TOK_NOT)) || + (bprec == 4 && ((*tok)->type == TOK_NEAR || + (*tok)->type == TOK_ADJ))) { + _notmuch_token_t *root = *tok; + if (root->type == TOK_NOT) { + /* Replace "x NOT y" with (AND x (NOT y)) by inserting an + * AND operator and leaning on the unary NOT rule. */ + root = _notmuch_token_create (qparser, TOK_AND, NULL); + } else { + *tok = (*tok)->next; + } + + /* Xapian treats x AND -y as x AND NOT y, which affects + * precedence. */ + if (root->type == TOK_AND && (*tok)->type == TOK_HATE) + (*tok)->type = TOK_NOT; + + _notmuch_token_t *right = parse_expr (qparser, prec + 1, tok); + if (left && right) { + root->left = left; + root->right = right; + } else { + /* Left or right was empty. This may be a syntax error + * like an omitted expression, or an empty expression. */ + root = left ? left : right; + } + left = root; + } + return left; +} + +static _notmuch_token_t * +parse (_notmuch_qparser_t *qparser, _notmuch_token_t *toks) +{ + _notmuch_token_t *root = parse_expr (qparser, 0, &toks); + if (toks->type != TOK_END) + INTERNAL_ERROR ("Token stream not fully consumed: %s", + _notmuch_token_show_list (qparser, toks)); + return root; +} + +static char * +generate_term (const void *ctx, const char *term, const char *prefix) +{ + notmuch_bool_t colon = FALSE; + if (isupper (term[0]) && strlen (prefix) > 1) + colon = TRUE; + return talloc_asprintf (ctx, "%s%s%s", prefix, colon ? ":" : "", term); +} + +static Xapian::Query +generate_terms (struct _notmuch_generate_state *s, const char *text, + const char *prefix) +{ + Xapian::TermGenerator tg; + Xapian::Document doc; + Xapian::TermIterator it, end; + Xapian::PositionIterator pit, pend; + Xapian::Query *qs, q; + unsigned int nterms = 0; + + if (prefix) + tg.index_text (text, 1, prefix); + else + tg.index_text (text); + doc = tg.get_document (); + + /* Find the highest positioned term. Positions are 1-based. */ + end = doc.termlist_end (); + for (it = doc.termlist_begin (); it != end; ++it) { + pend = it.positionlist_end (); + for (pit = it.positionlist_begin (); pit != pend; ++pit) { + if (*pit > nterms) + nterms = *pit; + } + } + if (nterms == 0) + return Xapian::Query (); + + /* Extract terms */ + qs = new Xapian::Query[nterms]; + for (it = doc.termlist_begin (); it != end; ++it) { + pend = it.positionlist_end (); + for (pit = it.positionlist_begin (); pit != pend; ++pit) + qs[*pit - 1] = Xapian::Query (*it, 1, s->termpos + *pit - 1); + } + s->termpos += nterms; + + /* Build query */ + q = Xapian::Query (Xapian::Query::OP_PHRASE, qs, qs + nterms, nterms); + delete [] qs; + return q; +} + +static Xapian::Query +generate (struct _notmuch_generate_state *s, _notmuch_token_t *root) +{ + using Xapian::Query; + Query l, r; + Query::op op; + + if (!root) + return Query (); + + /* The tricky part here is that generate is allowed to return a + * empty query, indicating that the user's query cannot be + * expressed. For example, the term "#" is an empty query. */ + + switch (root->type) { + case TOK_AND: + if (root->left->type == TOK_NOT && root->right->type != TOK_NOT) { + _notmuch_token_t *tmp = root->left; + root->left = root->right; + root->right = tmp; + } + l = generate (s, root->left); + if (l.empty()) { + return generate (s, root->right); + } else if (root->right->type == TOK_NOT) { + r = generate (s, root->right->left); + op = Query::OP_AND_NOT; + } else if (root->right->type == TOK_FILTER) { + r = generate (s, root->right->left); + op = Query::OP_FILTER; + } else { + r = generate (s, root->right); + op = Query::OP_AND; + } + if (r.empty()) + return l; + return Query (op, l, r); + + case TOK_NOT: + l = generate (s, root->left); + if (l.empty()) + return l; + return Query (Query::OP_AND_NOT, Query ("", 1, 0), l); + + case TOK_FILTER: + l = generate(s, root->left); + if (l.empty()) + return l; + return Query (Query::OP_SCALE_WEIGHT, l, 0.0); + + case TOK_OR: + case TOK_XOR: + l = generate (s, root->left); + r = generate (s, root->right); + if (l.empty()) + return r; + if (r.empty()) + return l; + return Query (root->type == TOK_OR ? Query::OP_OR : Query::OP_XOR, + l, r); + + case TOK_ADJ: + case TOK_NEAR: + { + /* XXX This differs from Xapian. Xapian treats ADJ and NEAR + * as n-ary operators and constructs the whole list of terms + * to be searched within the largest specified window. */ + Query subs[2]; + subs[0] = Query (generate (s, root->left)); + subs[1] = Query (generate (s, root->right)); + if (subs[0].empty()) + return subs[1]; + if (subs[1].empty()) + return subs[0]; + return Query (root->type == TOK_ADJ ? Query::OP_PHRASE : Query::OP_NEAR, + subs, subs + 2, root->distance + 1); + } + + case TOK_PREFIX: + return generate (s, root->left); + + case TOK_TERMS: + return generate_terms (s, root->text, root->prefix); + + case TOK_LIT: + return Query (generate_term (s, root->text, root->prefix)); + + case TOK_LOVE: + case TOK_HATE: + case TOK_BRA: + case TOK_KET: + case TOK_END: + INTERNAL_ERROR ("Illegal token type %s in IR", token_types[root->type]); + } + /* We leave this outside the switch so the compiler will warn us + * if we missed a token type. */ + INTERNAL_ERROR ("Illegal token type %d in IR", root->type); + return Xapian::Query (); +} + +static int +_notmuch_qparser_destructor (_notmuch_qparser_t *qparser) +{ + g_hash_table_unref (qparser->prefixes); + return 0; +} + +_notmuch_qparser_t * +_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch) +{ + _notmuch_qparser_t *qparser = talloc (ctx, _notmuch_qparser_t); + if (!qparser) + return NULL; + qparser->prefixes = NULL; + talloc_set_destructor (qparser, _notmuch_qparser_destructor); + + qparser->notmuch = notmuch; + qparser->prefixes = g_hash_table_new (g_str_hash, g_str_equal); + qparser->transforms = NULL; + qparser->transformsTail = &qparser->transforms; + + return qparser; +} + +void +_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser, + const char *prefix, notmuch_bool_t literal, + notmuch_bool_t boolean) +{ + int flags = ((literal ? PREFIX_LITERAL : 0) | + (boolean ? PREFIX_BOOL : PREFIX_PROB)); + g_hash_table_insert (qparser->prefixes, talloc_strdup (qparser, prefix), + GINT_TO_POINTER (flags)); +} + +void +_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser, + _notmuch_token_t *(*transform) ( + _notmuch_token_t *ast, void *opaque), + void *opaque) +{ + struct _notmuch_qparser_transform *t; + t = talloc (qparser, struct _notmuch_qparser_transform); + t->next = NULL; + t->transform = transform; + t->opaque = opaque; + *qparser->transformsTail = t; + qparser->transformsTail = &t->next; +} + +struct _notmuch_transform_prefix_info { + char *field, *prefix; +}; + +static _notmuch_token_t * +transform_prefix_rec (_notmuch_token_t *root, char *field, char *prefix, + notmuch_bool_t active) +{ + if (!root) + return NULL; + if (root->type == TOK_PREFIX) { + active = (strcmp (field, root->text) == 0); + } else if (root->type == TOK_TERMS || root->type == TOK_LIT) { + if (active) + root->prefix = prefix; + } + transform_prefix_rec (root->left, field, prefix, active); + transform_prefix_rec (root->right, field, prefix, active); + return root; +} + +static _notmuch_token_t * +transform_prefix (_notmuch_token_t *root, void *opaque) +{ + struct _notmuch_transform_prefix_info *info = + (struct _notmuch_transform_prefix_info*)opaque; + return transform_prefix_rec (root, info->field, info->prefix, FALSE); +} + +void +_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser, + const char *field, const char *prefix, + notmuch_bool_t boolean) +{ + struct _notmuch_transform_prefix_info *info; + info = talloc (qparser, struct _notmuch_transform_prefix_info); + info->field = talloc_strdup (info, field); + info->prefix = talloc_strdup (info, prefix); + _notmuch_qparser_add_prefix (qparser, field, boolean, boolean); + _notmuch_qparser_add_transform (qparser, transform_prefix, info); +} + +_notmuch_token_t * +_notmuch_qparser_lex (_notmuch_qparser_t *qparser, const char *query) +{ + return lex (qparser, query); +} + +_notmuch_token_t * +_notmuch_qparser_parse (_notmuch_qparser_t *qparser, const char *query) +{ + _notmuch_token_t *toks = lex (qparser, query); + return parse (qparser, toks); +} + +_notmuch_token_t * +_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root) +{ + struct _notmuch_qparser_transform *t; + for (t = qparser->transforms; t; t = t->next) + root = t->transform (root, t->opaque); + return root; +} + +Xapian::Query +_notmuch_qparser_generate (_notmuch_qparser_t *qparser, _notmuch_token_t *root) +{ + struct _notmuch_generate_state *s; + s = talloc (qparser, struct _notmuch_generate_state); + s->qparser = qparser; + s->termpos = 1; + Xapian::Query query = generate (s, root); + talloc_free (s); + if (query.empty()) + /* Return all documents */ + return Xapian::Query ("", 1, 0); + return query; +} -- 1.7.2.3