1 Return-Path: <amthrax@awakening.csail.mit.edu>
\r
2 X-Original-To: notmuch@notmuchmail.org
\r
3 Delivered-To: notmuch@notmuchmail.org
\r
4 Received: from localhost (localhost [127.0.0.1])
\r
5 by olra.theworths.org (Postfix) with ESMTP id EEA3E41A578
\r
6 for <notmuch@notmuchmail.org>; Wed, 22 Dec 2010 22:01:00 -0800 (PST)
\r
7 X-Virus-Scanned: Debian amavisd-new at olra.theworths.org
\r
11 X-Spam-Status: No, score=0 tagged_above=-999 required=5 tests=[none]
\r
13 Received: from olra.theworths.org ([127.0.0.1])
\r
14 by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024)
\r
15 with ESMTP id KHIIz+jV92tG for <notmuch@notmuchmail.org>;
\r
16 Wed, 22 Dec 2010 22:00:47 -0800 (PST)
\r
17 Received: from dmz-mailsec-scanner-2.mit.edu (DMZ-MAILSEC-SCANNER-2.MIT.EDU
\r
19 by olra.theworths.org (Postfix) with ESMTP id D4656431FD0
\r
20 for <notmuch@notmuchmail.org>; Wed, 22 Dec 2010 22:00:45 -0800 (PST)
\r
21 X-AuditID: 1209190d-b7cacae000000a14-49-4d12e58c1219
\r
22 Received: from mailhub-auth-4.mit.edu ( [18.7.62.39])
\r
23 by dmz-mailsec-scanner-2.mit.edu (Symantec Brightmail Gateway) with
\r
24 SMTP id 52.A7.02580.C85E21D4; Thu, 23 Dec 2010 01:00:44 -0500 (EST)
\r
25 Received: from outgoing.mit.edu (OUTGOING-AUTH.MIT.EDU [18.7.22.103])
\r
26 by mailhub-auth-4.mit.edu (8.13.8/8.9.2) with ESMTP id oBN60hbt007101;
\r
27 Thu, 23 Dec 2010 01:00:43 -0500
\r
28 Received: from awakening.csail.mit.edu (awakening.csail.mit.edu [18.26.4.91])
\r
29 (authenticated bits=0)
\r
30 (User authenticated as amdragon@ATHENA.MIT.EDU)
\r
31 by outgoing.mit.edu (8.13.6/8.12.4) with ESMTP id oBN60gnO004788
\r
32 (version=TLSv1/SSLv3 cipher=AES256-SHA bits=256 verify=NOT);
\r
33 Thu, 23 Dec 2010 01:00:43 -0500 (EST)
\r
34 Received: from amthrax by awakening.csail.mit.edu with local (Exim 4.72)
\r
35 (envelope-from <amthrax@awakening.csail.mit.edu>)
\r
36 id 1PVeEE-00024F-Eq; Thu, 23 Dec 2010 01:00:42 -0500
\r
37 From: Austin Clements <amdragon@MIT.EDU>
\r
38 To: notmuch@notmuchmail.org
\r
39 Subject: [PATCH 1/2] Implement a custom query parser with a mostly Xapian-compatible grammar.
\r
40 Date: Thu, 23 Dec 2010 01:00:23 -0500
\r
41 Message-Id: <1293084024-7893-2-git-send-email-amdragon@mit.edu>
\r
42 X-Mailer: git-send-email 1.7.2.3
\r
43 In-Reply-To: <1293084024-7893-1-git-send-email-amdragon@mit.edu>
\r
44 References: <1293084024-7893-1-git-send-email-amdragon@mit.edu>
\r
46 Content-Type: text/plain; charset=UTF-8
\r
47 Content-Transfer-Encoding: 8bit
\r
48 X-Brightmail-Tracker: AAAAARbzB9Y=
\r
49 Cc: amdragon@mit.edu
\r
50 X-BeenThere: notmuch@notmuchmail.org
\r
51 X-Mailman-Version: 2.1.13
\r
53 List-Id: "Use and development of the notmuch mail system."
\r
54 <notmuch.notmuchmail.org>
\r
55 List-Unsubscribe: <http://notmuchmail.org/mailman/options/notmuch>,
\r
56 <mailto:notmuch-request@notmuchmail.org?subject=unsubscribe>
\r
57 List-Archive: <http://notmuchmail.org/pipermail/notmuch>
\r
58 List-Post: <mailto:notmuch@notmuchmail.org>
\r
59 List-Help: <mailto:notmuch-request@notmuchmail.org?subject=help>
\r
60 List-Subscribe: <http://notmuchmail.org/mailman/listinfo/notmuch>,
\r
61 <mailto:notmuch-request@notmuchmail.org?subject=subscribe>
\r
62 X-List-Received-Date: Thu, 23 Dec 2010 06:01:02 -0000
\r
64 This parser takes an extra step through an intermediate representation
\r
65 that is convenient to manipulate via pluggable transformation passes.
\r
66 These are used to implement regular Xapian-style query prefixes, but
\r
67 are flexible enough to accomplish far more.
\r
69 lib/Makefile.local | 1 +
\r
70 lib/database-private.h | 6 +
\r
71 lib/notmuch-private.h | 126 +++++++
\r
72 lib/qparser.cc | 941 ++++++++++++++++++++++++++++++++++++++++++++++++
\r
73 4 files changed, 1074 insertions(+), 0 deletions(-)
\r
74 create mode 100644 lib/qparser.cc
\r
76 diff --git a/lib/Makefile.local b/lib/Makefile.local
\r
77 index 37d1c0d..37d3735 100644
\r
78 --- a/lib/Makefile.local
\r
79 +++ b/lib/Makefile.local
\r
80 @@ -64,6 +64,7 @@ libnotmuch_cxx_srcs = \
\r
84 + $(dir)/qparser.cc \
\r
87 libnotmuch_modules = $(libnotmuch_c_srcs:.c=.o) $(libnotmuch_cxx_srcs:.cc=.o)
\r
88 diff --git a/lib/database-private.h b/lib/database-private.h
\r
89 index 9f83407..358a71b 100644
\r
90 --- a/lib/database-private.h
\r
91 +++ b/lib/database-private.h
\r
92 @@ -64,6 +64,12 @@ _notmuch_get_terms_with_prefix (void *ctx, Xapian::TermIterator &i,
\r
93 Xapian::TermIterator &end,
\r
94 const char *prefix);
\r
98 +/* Generate a Xapian query from a query AST. */
\r
100 +_notmuch_qparser_generate (_notmuch_qparser_t *qparser, _notmuch_token_t *root);
\r
102 #pragma GCC visibility pop
\r
105 diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
\r
106 index b6f1095..be76c0c 100644
\r
107 --- a/lib/notmuch-private.h
\r
108 +++ b/lib/notmuch-private.h
\r
109 @@ -508,6 +508,132 @@ notmuch_filenames_t *
\r
110 _notmuch_filenames_create (const void *ctx,
\r
111 notmuch_string_list_t *list);
\r
115 +typedef struct _notmuch_qparser _notmuch_qparser_t;
\r
117 +enum _notmuch_token_type {
\r
118 + /* These first four token types appear only in the lexer output
\r
119 + * and never in the parse tree. */
\r
120 + TOK_LOVE, TOK_HATE, TOK_BRA, TOK_KET,
\r
121 + /* Binary operators. These should have left and right children.
\r
122 + * ADJ and NEAR should have a distance. */
\r
123 + TOK_AND, TOK_OR, TOK_XOR, TOK_ADJ, TOK_NEAR,
\r
124 + /* Unary operators. These have only a left child. Xapian::Query
\r
125 + * has no pure NOT operator, so the generator treats NOT as the
\r
126 + * child of an AND specially, and otherwise represents it as
\r
127 + * "<all> AND_NOT x". FILTER ignores the weights of the subquery
\r
128 + * and generates Xapian::Query::OP_FILTER if the left child of an
\r
129 + * AND or Xapian::Query::OP_SCALE_WEIGHT otherwise. The text
\r
130 + * field of a PREFIX operator specifies the prefix. PREFIX
\r
131 + * operators specify only syntactic prefixes, not database
\r
132 + * prefixes, and thus have no effect on the generated query. */
\r
133 + TOK_NOT, TOK_FILTER, TOK_PREFIX,
\r
134 + /* A single TOK_TERMS token can represent a single term, a quoted
\r
135 + * phrase, or an implicit phrase. An implicit phrase is something
\r
136 + * like "foo/bar", for which the database contains two separate
\r
137 + * terms, but you want to treat them as a phrase, even though it's
\r
138 + * not quoted. Xapian calls characters that implicitly connect
\r
139 + * terms into phrases "phrase generators." We take a simpler
\r
140 + * approach and treat almost any non-whitespace character as a
\r
141 + * phrase generator. */
\r
143 + /* Like TOK_TERMS, but the term text should be taken literally,
\r
144 + * with no phrase splitting or whitespace removal. The lexer
\r
145 + * only generates TOK_TERMS; the parser creates TOK_LIT. */
\r
147 + /* TOK_END indicates the end of the token list. Such tokens loop
\r
148 + * back on themselves so it's always safe to follow "next".
\r
149 + * These appear only in the lexer output. */
\r
153 +typedef struct _notmuch_token {
\r
154 + enum _notmuch_token_type type;
\r
155 + const char *text;
\r
157 + /* For TOK_ADJ and TOK_NEAR, this specifies the distance
\r
161 + /* For TOK_PREFIX, the flags of this prefix. */
\r
164 + /* For TOK_TERMS and TOK_LIT, the database prefix to use when
\r
165 + * generating database terms. This must be filled in a
\r
166 + * transformation pass. */
\r
169 + /* Link in the lexer token list. */
\r
170 + struct _notmuch_token *next;
\r
172 + /* Links in the intermediate AST. */
\r
173 + struct _notmuch_token *left, *right;
\r
174 +} _notmuch_token_t;
\r
176 +_notmuch_token_t *
\r
177 +_notmuch_token_create (const void *ctx, enum _notmuch_token_type type,
\r
178 + const char *text);
\r
181 +_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok);
\r
184 +_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root);
\r
186 +_notmuch_qparser_t *
\r
187 +_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch);
\r
189 +/* Add a syntactic prefix. This will appear as a TOK_PREFIX in the
\r
190 + * AST, but does not alone affect the final query.
\r
192 + * The literal flag affects lexing. If true, this prefix must be
\r
193 + * followed by a regular term or quoted literal, which will not be
\r
194 + * stripped of whitespace or split in to a phrase. The boolean flag
\r
195 + * affects parsing. If true, then terms with this prefix will be
\r
196 + * combined into the query using the FILTER operator, so they must
\r
197 + * appear in the result and will not contribute to weights. Xapian's
\r
198 + * "boolean prefixes" are both literal and boolean.
\r
201 +_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser,
\r
202 + const char *prefix, notmuch_bool_t literal,
\r
203 + notmuch_bool_t boolean);
\r
205 +/* Add a transform pass to a query parser. The transform function
\r
206 + * will be called with the root of the AST and should return a new AST
\r
207 + * root (which may be the same as the old root).
\r
210 +_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser,
\r
211 + _notmuch_token_t *(*transform) (
\r
212 + _notmuch_token_t *ast, void *opaque),
\r
215 +/* Add a syntactic prefix (field) and a transform pass to transform
\r
216 + * that syntactic prefix into a database prefix (prefix). This
\r
217 + * corresponds to Xapian's add_prefix and add_boolean_prefix
\r
220 +_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser,
\r
221 + const char *field, const char *prefix,
\r
222 + notmuch_bool_t boolean);
\r
224 +/* Lex a query string, returning the first token in the token list.
\r
225 + * This is only meant for testing. */
\r
226 +_notmuch_token_t *
\r
227 +_notmuch_qparser_lex (_notmuch_qparser_t *qparser, const char *query);
\r
229 +/* Parse a query string, returning the root of the AST. */
\r
230 +_notmuch_token_t *
\r
231 +_notmuch_qparser_parse (_notmuch_qparser_t *qparser, const char *query);
\r
233 +/* Transform a parsed query, running the transforms in the order they
\r
234 + * were added to the query parser. Return the root of the transformed
\r
236 +_notmuch_token_t *
\r
237 +_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root);
\r
239 #pragma GCC visibility pop
\r
242 diff --git a/lib/qparser.cc b/lib/qparser.cc
\r
243 new file mode 100644
\r
244 index 0000000..4804d06
\r
246 +++ b/lib/qparser.cc
\r
248 +/* qparser.cc - Notmuch query parser
\r
250 + * Copyright © 2010 Austin Clements
\r
252 + * This program is free software: you can redistribute it and/or modify
\r
253 + * it under the terms of the GNU General Public License as published by
\r
254 + * the Free Software Foundation, either version 3 of the License, or
\r
255 + * (at your option) any later version.
\r
257 + * This program is distributed in the hope that it will be useful,
\r
258 + * but WITHOUT ANY WARRANTY; without even the implied warranty of
\r
259 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
\r
260 + * GNU General Public License for more details.
\r
262 + * You should have received a copy of the GNU General Public License
\r
263 + * along with this program. If not, see http://www.gnu.org/licenses/ .
\r
265 + * Author: Austin Clements <amdragon@mit.edu>
\r
269 + * Query parsing is performed in a series of phases similar to those
\r
270 + * of a traditional compiler.
\r
272 + * 1) We tokenize the query, identifying operators and term phrases.
\r
273 + * Note that a phrase (quoted or implicit) generates a single token
\r
274 + * at this point, which we split up later (unlike in the Xapian
\r
276 + * 2) We parse the token stream, generating an intermediate
\r
277 + * representation in the form of a binary AST. This IR is similar
\r
278 + * to the Xapian::Query operators, but is designed to be more
\r
279 + * easily manipulated.
\r
280 + * 3) We transform the parse tree, running a sequence of
\r
281 + * caller-provided transformation functions over the tree.
\r
282 + * 4) We generate the Xapian::Query from the transformed IR. This
\r
283 + * step also splits phrase tokens into multiple query terms.
\r
284 + * XXX and expands wildcard terms.
\r
286 + * To use the query parser, call _notmuch_qparser_parse to perform
\r
287 + * steps 1 and 2, _notmuch_qparser_transform to perform step 3, and
\r
288 + * _notmuch_qparser_generate to perform step 4.
\r
290 + * Still missing from this implementation:
\r
291 + * * Stemming - The stemming should probably be marked on TOK_TERMS
\r
292 + * tokens. Ideally, we can just pass this to the term generator.
\r
293 + * * Wildcard queries - This should be available in the IR so it's
\r
294 + * easy to generate wildcard queries in a transformer.
\r
295 + * * Value ranges in the IR
\r
296 + * * Queries "" and "*"
\r
297 + * * Memory management is poorly suited to reusing qparser's.
\r
300 +/* XXX notmuch currently registers "tag" as an exclusive boolean
\r
301 + * prefix, which means queries like "tag:x tag:y" will return messages
\r
302 + * with tag x OR tag y. Is this intentional? */
\r
304 +#include "notmuch-private.h"
\r
305 +#include "database-private.h"
\r
307 +#include <glib.h> /* GHashTable */
\r
309 +struct _notmuch_qparser {
\r
310 + notmuch_database_t *notmuch;
\r
312 + /* Maps from prefix strings to PREFIX_* flags */
\r
313 + GHashTable *prefixes;
\r
315 + struct _notmuch_qparser_transform *transforms;
\r
316 + struct _notmuch_qparser_transform **transformsTail;
\r
320 + PREFIX_LITERAL = 1<<1,
\r
321 + PREFIX_PROB = 1<<2,
\r
322 + PREFIX_BOOL = 1<<3,
\r
325 +struct _notmuch_qparser_transform {
\r
326 + struct _notmuch_qparser_transform *next;
\r
327 + _notmuch_token_t *(*transform) (_notmuch_token_t *ast, void *opaque);
\r
331 +struct _notmuch_lexer_state {
\r
332 + _notmuch_qparser_t *qparser;
\r
333 + _notmuch_token_t *head;
\r
334 + _notmuch_token_t **tail;
\r
337 +struct _notmuch_generate_state {
\r
338 + _notmuch_qparser_t *qparser;
\r
339 + unsigned int termpos;
\r
342 +static const char *token_types[] = {
\r
343 + "LOVE", "HATE", "BRA", "KET",
\r
344 + "AND", "OR", "XOR", "ADJ", "NEAR",
\r
345 + "NOT", "FILTER", "PREFIX",
\r
346 + "TERMS", "LIT", "END"
\r
349 +/* The distinguished end token. This simplifies the parser since it
\r
350 + * never has to worry about dereferencing next. */
\r
351 +static _notmuch_token_t tok_end = {TOK_END, NULL, -1, FALSE, NULL,
\r
352 + &tok_end, NULL, NULL};
\r
354 +_notmuch_token_t *
\r
355 +_notmuch_token_create (const void *ctx, enum _notmuch_token_type type,
\r
356 + const char *text)
\r
358 + struct _notmuch_token *tok = talloc (ctx, struct _notmuch_token);
\r
359 + tok->type = type;
\r
360 + tok->text = text;
\r
361 + if (type == TOK_ADJ || type == TOK_NEAR)
\r
362 + tok->distance = 10;
\r
364 + tok->distance = -1;
\r
365 + tok->prefixFlags = 0;
\r
366 + tok->prefix = NULL;
\r
367 + tok->left = tok->right = NULL;
\r
372 +_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok)
\r
374 + char *out = talloc_strdup (ctx, "");
\r
376 + for (; tok->type != TOK_END; tok = tok->next) {
\r
377 + out = talloc_asprintf_append (out, "%s%s",
\r
378 + *out == 0 ? "" : " ",
\r
379 + token_types[tok->type]);
\r
380 + if (tok->distance != -1)
\r
381 + out = talloc_asprintf_append (out, "/%d", tok->distance);
\r
383 + out = talloc_asprintf_append (out, ":%s", tok->text);
\r
390 +_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root)
\r
393 + return talloc_strdup (ctx, "<nil>");
\r
394 + } else if (root->type == TOK_TERMS || root->type == TOK_LIT) {
\r
395 + return talloc_strdup (ctx, root->text);
\r
397 + char *out = talloc_asprintf (ctx, "(%s", token_types[root->type]);
\r
398 + void *local = talloc_new (ctx);
\r
399 + if (root->type == TOK_PREFIX)
\r
400 + out = talloc_asprintf_append (out, "/%s", root->text);
\r
402 + out = talloc_asprintf_append
\r
403 + (out, " %s", _notmuch_token_show_tree (local, root->left));
\r
405 + out = talloc_asprintf_append
\r
406 + (out, " %s", _notmuch_token_show_tree (local, root->right));
\r
407 + out = talloc_asprintf_append (out, ")");
\r
408 + talloc_free (local);
\r
413 +using Xapian::Unicode::is_whitespace;
\r
414 +using Xapian::Unicode::is_wordchar;
\r
416 +static Xapian::Utf8Iterator
\r
417 +lex_skip_ws (Xapian::Utf8Iterator it)
\r
419 + while (is_whitespace (*it))
\r
424 +static struct _notmuch_token *
\r
425 +lex_emit (struct _notmuch_lexer_state *s, enum _notmuch_token_type type,
\r
428 + struct _notmuch_token *tok = _notmuch_token_create (s, type, text);
\r
429 + tok->next = &tok_end;
\r
430 + *(s->tail) = tok;
\r
431 + s->tail = &tok->next;
\r
436 +/* Lex a quoted phrase, returning an iterator pointing to the
\r
437 + * character following the closing quote. If escaped, then accept two
\r
438 + * quotes as an escaped version of a single quote.
\r
440 +static Xapian::Utf8Iterator
\r
441 +lex_quoted_phrase (struct _notmuch_lexer_state *s,
\r
442 + Xapian::Utf8Iterator it, notmuch_bool_t escaped)
\r
444 + Xapian::Utf8Iterator next, orig, end;
\r
445 + char *term, *src, *dst;
\r
447 + /* Find the end of the phrase */
\r
448 + assert (*(it++) == '"');
\r
449 + for (next = it; next != end; ++next) {
\r
450 + if (*next == '"') {
\r
455 + if (next == end || *next != '"') {
\r
462 + /* Xapian still lexes +/-/( in quotes mode and simply doesn't
\r
463 + * generate tokens for them. For us, the term generator will
\r
464 + * discard them. */
\r
465 + term = talloc_strndup (s, it.raw (), next.raw () - it.raw ());
\r
467 + /* Replace doubled quotes with a single quote. */
\r
468 + for (src = dst = term; *src; ++src, ++dst) {
\r
475 + lex_emit (s, TOK_TERMS, term);
\r
482 +static Xapian::Utf8Iterator
\r
483 +lex_try_consume_prefix (_notmuch_qparser_t *qparser, Xapian::Utf8Iterator it,
\r
484 + char **prefixOut, int *flagsOut)
\r
486 + Xapian::Utf8Iterator next (it), end;
\r
490 + *prefixOut = NULL;
\r
491 + while (next != end && *next != ':' && !is_whitespace (*next))
\r
493 + if (*next != ':')
\r
495 + /* Ignore if followed by <= ' ' or ')' */
\r
497 + if (*next <= ' ' || *next == ')')
\r
500 + prefix = talloc_strndup (qparser, it.raw (), next.raw () - it.raw() - 1);
\r
501 + flags = GPOINTER_TO_INT (g_hash_table_lookup (qparser->prefixes, prefix));
\r
503 + /* Not a known prefix */
\r
504 + talloc_free (prefix);
\r
507 + *prefixOut = prefix;
\r
508 + *flagsOut = flags;
\r
512 +static Xapian::Utf8Iterator
\r
513 +lex_consume_term (const void *ctx, Xapian::Utf8Iterator it, char **termOut)
\r
515 + Xapian::Utf8Iterator next (it), end;
\r
516 + /* Xapian permits other characters to separate term phrases. For
\r
517 + * example, "x#y" is parsed as two separate (non-phrase) terms.
\r
518 + * However, because the characters allowed in a term are
\r
519 + * context-sensitive, replicating this is very hard. Here we take
\r
520 + * a simpler approach where only whitespace and a few operator
\r
521 + * characters that are never term characters separate terms. */
\r
522 + while (next != end && !strchr ("()\"", *next) && !is_whitespace (*next))
\r
524 + *termOut = talloc_strndup (ctx, it.raw (), next.raw () - it.raw ());
\r
528 +static notmuch_bool_t
\r
529 +lex_operator (struct _notmuch_lexer_state *s, char *term,
\r
530 + const char *op, enum _notmuch_token_type type)
\r
532 + size_t oplen = strlen (op);
\r
534 + if (strcasecmp (term, op) == 0) {
\r
535 + lex_emit (s, type, term);
\r
539 + /* Check for ADJ or NEAR with argument. Our parsing of this is
\r
540 + * slightly incompatible with Xapian, but I believe this to be a
\r
541 + * bug in Xapian. Xapian parses "x NEAR/y z" as three term
\r
542 + * phrases, "x", "near y", and "z", like we do. However, it
\r
543 + * behaves differently if the bad NEAR operator is at the end of
\r
544 + * the query, parsing "x NEAR/y" like "x NEAR y". */
\r
545 + if ((type == TOK_ADJ || type == TOK_NEAR) &&
\r
546 + strncasecmp (term, op, oplen) == 0 &&
\r
547 + term[oplen] == '/') {
\r
548 + /* Try to parse the distance argument */
\r
550 + int distance = strtol (&term[oplen + 1], &end, 10);
\r
551 + if (distance && !*end) {
\r
552 + struct _notmuch_token *tok = lex_emit (s, type, term);
\r
553 + tok->distance = distance;
\r
561 +static _notmuch_token_t *
\r
562 +lex (_notmuch_qparser_t *qparser, const char *query)
\r
564 + Xapian::Utf8Iterator it (query), next, end;
\r
565 + struct _notmuch_lexer_state *s;
\r
566 + struct _notmuch_token *tok;
\r
570 + s = talloc (qparser, struct _notmuch_lexer_state);
\r
573 + s->qparser = qparser;
\r
574 + s->head = &tok_end;
\r
575 + s->tail = &s->head;
\r
577 + while (it != end) {
\r
579 + if ((it = lex_skip_ws (it)) == end)
\r
587 + /* Xapian ignores these unless preceded by whitespace or
\r
588 + * an open paren, which has the effect of ignoring all
\r
589 + * +'s in "x +++y", "x#+y", and "(x)+y". We don't
\r
592 + /* Ignore if followed by a space or another + or - */
\r
593 + if (is_whitespace (*it) || *it == '+' || *it == '-')
\r
595 + lex_emit (s, ch == '+' ? TOK_LOVE : TOK_HATE, NULL);
\r
599 + it = lex_quoted_phrase(s, it, false);
\r
604 + /* Xapian ignores this unless preceded by whitespace,
\r
605 + * parens, +, or -. We don't bother. */
\r
606 + lex_emit (s, TOK_BRA, NULL);
\r
611 + lex_emit (s, TOK_KET, NULL);
\r
615 + /* Scan for a prefix */
\r
616 + next = lex_try_consume_prefix (qparser, it, &term, &prefixFlags);
\r
617 + if (term && (*next == '"' || *next == '(' || is_wordchar (*next))) {
\r
618 + int literal = prefixFlags & PREFIX_LITERAL;
\r
619 + tok = lex_emit (s, TOK_PREFIX, term);
\r
620 + tok->prefixFlags = prefixFlags;
\r
623 + if (literal && *it == '"') {
\r
624 + /* Literal quoted strings keep everything and allow
\r
625 + * quote escaping, unlike regular quoted phrases. */
\r
626 + it = lex_quoted_phrase (s, it, true);
\r
627 + } else if (literal) {
\r
628 + /* Xapian uses anything up to the next space or ')'
\r
629 + * (because literal prefixes can't be applied to
\r
630 + * subqueries). I disagree with Xapian here, since
\r
631 + * Xapian will keep the open paren but not the close
\r
632 + * paren. Better would be to balance them. */
\r
633 + if (*next == '(')
\r
635 + while (next != end && *next > ' ' && *next != ')')
\r
637 + term = talloc_strndup (s, it.raw (),
\r
638 + next.raw () - it.raw ());
\r
639 + lex_emit (s, TOK_TERMS, term);
\r
642 + /* For quoted strings and subqueries following non-literal
\r
643 + * prefixes and regular terms, we lex as usual. */
\r
647 + /* Scan for a term phrase or operator */
\r
648 + it = lex_consume_term (s, it, &term);
\r
650 + /* Check operators */
\r
651 + if (lex_operator (s, term, "and", TOK_AND) ||
\r
652 + lex_operator (s, term, "not", TOK_NOT) ||
\r
653 + lex_operator (s, term, "xor", TOK_XOR) ||
\r
654 + lex_operator (s, term, "or", TOK_OR) ||
\r
655 + lex_operator (s, term, "adj", TOK_ADJ) ||
\r
656 + lex_operator (s, term, "near", TOK_NEAR))
\r
659 + /* Must be a term */
\r
660 + lex_emit (s, TOK_TERMS, term);
\r
667 +add_to_query (const void *ctx, _notmuch_token_t **query,
\r
668 + _notmuch_token_type op, _notmuch_token_t *right)
\r
672 + else if (right) {
\r
673 + _notmuch_token_t *tok = _notmuch_token_create (ctx, op, NULL);
\r
674 + tok->left = *query;
\r
675 + tok->right = right;
\r
680 +static _notmuch_token_t *
\r
681 +parse_expr (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok);
\r
682 +static _notmuch_token_t *
\r
683 +parse_term (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok);
\r
685 +static _notmuch_token_t *
\r
686 +parse_prob (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)
\r
688 + /* A prob is a sequence of three types of subqueries. Because the
\r
689 + * default query operator is AND, loved terms are not treated
\r
691 + * 1) Probabilistic terms (prefixed or not). These are combined
\r
692 + * with the default query operator, AND.
\r
693 + * 2) Terms with a boolean prefix. All of the terms with the same
\r
694 + * prefix are combined with OR. Different prefixes are
\r
695 + * combined with AND.
\r
696 + * 3) Hate terms. These are combined with OR.
\r
697 + * The final IR looks like
\r
698 + * (probs AND (FILTER bools)) AND (NOT hates)
\r
701 + _notmuch_token_t *probs, *hates, *sub, *q;
\r
702 + GHashTable *bools;
\r
705 + probs = hates = NULL;
\r
706 + bools = g_hash_table_new (g_str_hash, g_str_equal);
\r
708 + switch ((*tok)->type) {
\r
711 + /* Unmatched close paren. Ignore it. */
\r
712 + *tok = (*tok)->next;
\r
715 + /* Fall through */
\r
716 + case TOK_AND: case TOK_OR: case TOK_XOR: case TOK_NOT:
\r
718 + /* End of the prob. Might be empty. */
\r
723 + *tok = (*tok)->next;
\r
724 + sub = parse_expr (qparser, prec + 1, tok);
\r
725 + add_to_query (qparser, &hates, TOK_OR, sub);
\r
729 + sub = parse_expr (qparser, prec + 1, tok);
\r
732 + if (sub->prefixFlags & PREFIX_PROB) {
\r
733 + add_to_query (qparser, &probs, TOK_AND, sub);
\r
735 + _notmuch_token_t *newb, *pre = (_notmuch_token_t*)
\r
736 + g_hash_table_lookup (bools, sub->text);
\r
740 + /* OR subqueries with same prefix */
\r
741 + newb = _notmuch_token_create (qparser, TOK_OR, NULL);
\r
742 + newb->left = pre;
\r
743 + newb->right = sub;
\r
745 + g_hash_table_insert (bools, (void*)sub->text, newb);
\r
750 + /* Join into the query like any other term, since the
\r
751 + * default operator is AND anyway. */
\r
752 + *tok = (*tok)->next;
\r
753 + /* Fall through */
\r
757 + sub = parse_expr (qparser, prec + 1, tok);
\r
758 + add_to_query (qparser, &probs, TOK_AND, sub);
\r
764 + /* XXX Can ADJ or NEAR happen? */
\r
765 + INTERNAL_ERROR ("Unexpected token type %d", (*tok)->type);
\r
770 + if (g_hash_table_size (bools)) {
\r
771 + /* Merge boolean filters */
\r
772 + _notmuch_token_t *filter;
\r
773 + GList *vals = g_hash_table_get_values (bools), *l;
\r
775 + for (l = vals; l; l = l->next)
\r
776 + add_to_query (qparser, &sub, TOK_AND, (_notmuch_token_t *) l->data);
\r
777 + g_list_free (vals);
\r
779 + /* Create filter */
\r
780 + filter = _notmuch_token_create (qparser, TOK_FILTER, NULL);
\r
781 + filter->left = sub;
\r
782 + add_to_query (qparser, &q, TOK_AND, filter);
\r
785 + sub = _notmuch_token_create (qparser, TOK_NOT, NULL);
\r
786 + sub->left = hates;
\r
787 + add_to_query (qparser, &q, TOK_AND, sub);
\r
789 + g_hash_table_unref (bools);
\r
793 +static _notmuch_token_t *
\r
794 +parse_term (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)
\r
796 + _notmuch_token_t *sub;
\r
798 + if ((*tok)->type == TOK_END) {
\r
799 + /* Arises from things like "x -". Ignore. */
\r
801 + } else if ((*tok)->type == TOK_PREFIX) {
\r
803 + *tok = (*tok)->next;
\r
804 + sub->left = parse_term (qparser, prec, tok);
\r
807 + if (sub->prefixFlags & PREFIX_LITERAL) {
\r
808 + /* Convert TOK_TERMS to TOK_LIT */
\r
809 + assert (sub->left->type == TOK_TERMS);
\r
810 + sub->left->type = TOK_LIT;
\r
811 + } else if (sub->left->type == TOK_PREFIX) {
\r
812 + sub->left = sub->left->left;
\r
815 + } else if ((*tok)->type == TOK_BRA) {
\r
816 + *tok = (*tok)->next;
\r
817 + sub = parse_expr (qparser, prec + 10 - (prec%10), tok);
\r
818 + if ((*tok)->type == TOK_KET)
\r
819 + *tok = (*tok)->next;
\r
823 + if ((*tok)->type != TOK_TERMS && (*tok)->type != TOK_LIT) {
\r
824 + /* Arises from "+AND", "-AND", "prob:AND". We could give up
\r
825 + * and return nothing, but it seems nicer to treat the
\r
826 + * operator as a term if it came from the original query. */
\r
827 + if (!(*tok)->text)
\r
829 + (*tok)->type = TOK_TERMS;
\r
833 + *tok = (*tok)->next;
\r
837 +static _notmuch_token_t *
\r
838 +parse_expr (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)
\r
840 + /* If you squint at the Xapian grammar, it's a precedence grammar
\r
841 + * with one strange "prob" level. This implements all but the
\r
842 + * prob level and the leaf "term" level.
\r
844 + * prec is (nesting level * 10 + precedence level). Normally we
\r
845 + * only care about the precedence level, but the nesting level is
\r
846 + * important for recovering from unbalanced parens.
\r
848 + int bprec = prec % 10;
\r
849 + if (bprec == 3) {
\r
850 + if ((*tok)->type == TOK_NOT) {
\r
852 + _notmuch_token_t *root = *tok;
\r
853 + *tok = (*tok)->next;
\r
854 + root->left = parse_expr (qparser, prec, tok);
\r
858 + return parse_prob (qparser, prec, tok);
\r
861 + return parse_term (qparser, prec, tok);
\r
863 + _notmuch_token_t *left = parse_expr (qparser, prec + 1, tok);
\r
864 + while ((bprec == 0 && (*tok)->type == TOK_OR) ||
\r
865 + (bprec == 1 && (*tok)->type == TOK_XOR) ||
\r
866 + (bprec == 2 && ((*tok)->type == TOK_AND ||
\r
867 + (*tok)->type == TOK_NOT)) ||
\r
868 + (bprec == 4 && ((*tok)->type == TOK_NEAR ||
\r
869 + (*tok)->type == TOK_ADJ))) {
\r
870 + _notmuch_token_t *root = *tok;
\r
871 + if (root->type == TOK_NOT) {
\r
872 + /* Replace "x NOT y" with (AND x (NOT y)) by inserting an
\r
873 + * AND operator and leaning on the unary NOT rule. */
\r
874 + root = _notmuch_token_create (qparser, TOK_AND, NULL);
\r
876 + *tok = (*tok)->next;
\r
879 + /* Xapian treats x AND -y as x AND NOT y, which affects
\r
881 + if (root->type == TOK_AND && (*tok)->type == TOK_HATE)
\r
882 + (*tok)->type = TOK_NOT;
\r
884 + _notmuch_token_t *right = parse_expr (qparser, prec + 1, tok);
\r
885 + if (left && right) {
\r
886 + root->left = left;
\r
887 + root->right = right;
\r
889 + /* Left or right was empty. This may be a syntax error
\r
890 + * like an omitted expression, or an empty expression. */
\r
891 + root = left ? left : right;
\r
898 +static _notmuch_token_t *
\r
899 +parse (_notmuch_qparser_t *qparser, _notmuch_token_t *toks)
\r
901 + _notmuch_token_t *root = parse_expr (qparser, 0, &toks);
\r
902 + if (toks->type != TOK_END)
\r
903 + INTERNAL_ERROR ("Token stream not fully consumed: %s",
\r
904 + _notmuch_token_show_list (qparser, toks));
\r
909 +generate_term (const void *ctx, const char *term, const char *prefix)
\r
911 + notmuch_bool_t colon = FALSE;
\r
912 + if (isupper (term[0]) && strlen (prefix) > 1)
\r
914 + return talloc_asprintf (ctx, "%s%s%s", prefix, colon ? ":" : "", term);
\r
917 +static Xapian::Query
\r
918 +generate_terms (struct _notmuch_generate_state *s, const char *text,
\r
919 + const char *prefix)
\r
921 + Xapian::TermGenerator tg;
\r
922 + Xapian::Document doc;
\r
923 + Xapian::TermIterator it, end;
\r
924 + Xapian::PositionIterator pit, pend;
\r
925 + Xapian::Query *qs, q;
\r
926 + unsigned int nterms = 0;
\r
929 + tg.index_text (text, 1, prefix);
\r
931 + tg.index_text (text);
\r
932 + doc = tg.get_document ();
\r
934 + /* Find the highest positioned term. Positions are 1-based. */
\r
935 + end = doc.termlist_end ();
\r
936 + for (it = doc.termlist_begin (); it != end; ++it) {
\r
937 + pend = it.positionlist_end ();
\r
938 + for (pit = it.positionlist_begin (); pit != pend; ++pit) {
\r
939 + if (*pit > nterms)
\r
944 + return Xapian::Query ();
\r
946 + /* Extract terms */
\r
947 + qs = new Xapian::Query[nterms];
\r
948 + for (it = doc.termlist_begin (); it != end; ++it) {
\r
949 + pend = it.positionlist_end ();
\r
950 + for (pit = it.positionlist_begin (); pit != pend; ++pit)
\r
951 + qs[*pit - 1] = Xapian::Query (*it, 1, s->termpos + *pit - 1);
\r
953 + s->termpos += nterms;
\r
955 + /* Build query */
\r
956 + q = Xapian::Query (Xapian::Query::OP_PHRASE, qs, qs + nterms, nterms);
\r
961 +static Xapian::Query
\r
962 +generate (struct _notmuch_generate_state *s, _notmuch_token_t *root)
\r
964 + using Xapian::Query;
\r
971 + /* The tricky part here is that generate is allowed to return a
\r
972 + * empty query, indicating that the user's query cannot be
\r
973 + * expressed. For example, the term "#" is an empty query. */
\r
975 + switch (root->type) {
\r
977 + if (root->left->type == TOK_NOT && root->right->type != TOK_NOT) {
\r
978 + _notmuch_token_t *tmp = root->left;
\r
979 + root->left = root->right;
\r
980 + root->right = tmp;
\r
982 + l = generate (s, root->left);
\r
984 + return generate (s, root->right);
\r
985 + } else if (root->right->type == TOK_NOT) {
\r
986 + r = generate (s, root->right->left);
\r
987 + op = Query::OP_AND_NOT;
\r
988 + } else if (root->right->type == TOK_FILTER) {
\r
989 + r = generate (s, root->right->left);
\r
990 + op = Query::OP_FILTER;
\r
992 + r = generate (s, root->right);
\r
993 + op = Query::OP_AND;
\r
997 + return Query (op, l, r);
\r
1000 + l = generate (s, root->left);
\r
1003 + return Query (Query::OP_AND_NOT, Query ("", 1, 0), l);
\r
1005 + case TOK_FILTER:
\r
1006 + l = generate(s, root->left);
\r
1009 + return Query (Query::OP_SCALE_WEIGHT, l, 0.0);
\r
1013 + l = generate (s, root->left);
\r
1014 + r = generate (s, root->right);
\r
1019 + return Query (root->type == TOK_OR ? Query::OP_OR : Query::OP_XOR,
\r
1025 + /* XXX This differs from Xapian. Xapian treats ADJ and NEAR
\r
1026 + * as n-ary operators and constructs the whole list of terms
\r
1027 + * to be searched within the largest specified window. */
\r
1029 + subs[0] = Query (generate (s, root->left));
\r
1030 + subs[1] = Query (generate (s, root->right));
\r
1031 + if (subs[0].empty())
\r
1033 + if (subs[1].empty())
\r
1035 + return Query (root->type == TOK_ADJ ? Query::OP_PHRASE : Query::OP_NEAR,
\r
1036 + subs, subs + 2, root->distance + 1);
\r
1039 + case TOK_PREFIX:
\r
1040 + return generate (s, root->left);
\r
1043 + return generate_terms (s, root->text, root->prefix);
\r
1046 + return Query (generate_term (s, root->text, root->prefix));
\r
1053 + INTERNAL_ERROR ("Illegal token type %s in IR", token_types[root->type]);
\r
1055 + /* We leave this outside the switch so the compiler will warn us
\r
1056 + * if we missed a token type. */
\r
1057 + INTERNAL_ERROR ("Illegal token type %d in IR", root->type);
\r
1058 + return Xapian::Query ();
\r
1062 +_notmuch_qparser_destructor (_notmuch_qparser_t *qparser)
\r
1064 + g_hash_table_unref (qparser->prefixes);
\r
1068 +_notmuch_qparser_t *
\r
1069 +_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch)
\r
1071 + _notmuch_qparser_t *qparser = talloc (ctx, _notmuch_qparser_t);
\r
1074 + qparser->prefixes = NULL;
\r
1075 + talloc_set_destructor (qparser, _notmuch_qparser_destructor);
\r
1077 + qparser->notmuch = notmuch;
\r
1078 + qparser->prefixes = g_hash_table_new (g_str_hash, g_str_equal);
\r
1079 + qparser->transforms = NULL;
\r
1080 + qparser->transformsTail = &qparser->transforms;
\r
1086 +_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser,
\r
1087 + const char *prefix, notmuch_bool_t literal,
\r
1088 + notmuch_bool_t boolean)
\r
1090 + int flags = ((literal ? PREFIX_LITERAL : 0) |
\r
1091 + (boolean ? PREFIX_BOOL : PREFIX_PROB));
\r
1092 + g_hash_table_insert (qparser->prefixes, talloc_strdup (qparser, prefix),
\r
1093 + GINT_TO_POINTER (flags));
\r
1097 +_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser,
\r
1098 + _notmuch_token_t *(*transform) (
\r
1099 + _notmuch_token_t *ast, void *opaque),
\r
1102 + struct _notmuch_qparser_transform *t;
\r
1103 + t = talloc (qparser, struct _notmuch_qparser_transform);
\r
1105 + t->transform = transform;
\r
1106 + t->opaque = opaque;
\r
1107 + *qparser->transformsTail = t;
\r
1108 + qparser->transformsTail = &t->next;
\r
1111 +struct _notmuch_transform_prefix_info {
\r
1112 + char *field, *prefix;
\r
1115 +static _notmuch_token_t *
\r
1116 +transform_prefix_rec (_notmuch_token_t *root, char *field, char *prefix,
\r
1117 + notmuch_bool_t active)
\r
1121 + if (root->type == TOK_PREFIX) {
\r
1122 + active = (strcmp (field, root->text) == 0);
\r
1123 + } else if (root->type == TOK_TERMS || root->type == TOK_LIT) {
\r
1125 + root->prefix = prefix;
\r
1127 + transform_prefix_rec (root->left, field, prefix, active);
\r
1128 + transform_prefix_rec (root->right, field, prefix, active);
\r
1132 +static _notmuch_token_t *
\r
1133 +transform_prefix (_notmuch_token_t *root, void *opaque)
\r
1135 + struct _notmuch_transform_prefix_info *info =
\r
1136 + (struct _notmuch_transform_prefix_info*)opaque;
\r
1137 + return transform_prefix_rec (root, info->field, info->prefix, FALSE);
\r
1141 +_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser,
\r
1142 + const char *field, const char *prefix,
\r
1143 + notmuch_bool_t boolean)
\r
1145 + struct _notmuch_transform_prefix_info *info;
\r
1146 + info = talloc (qparser, struct _notmuch_transform_prefix_info);
\r
1147 + info->field = talloc_strdup (info, field);
\r
1148 + info->prefix = talloc_strdup (info, prefix);
\r
1149 + _notmuch_qparser_add_prefix (qparser, field, boolean, boolean);
\r
1150 + _notmuch_qparser_add_transform (qparser, transform_prefix, info);
\r
1153 +_notmuch_token_t *
\r
1154 +_notmuch_qparser_lex (_notmuch_qparser_t *qparser, const char *query)
\r
1156 + return lex (qparser, query);
\r
1159 +_notmuch_token_t *
\r
1160 +_notmuch_qparser_parse (_notmuch_qparser_t *qparser, const char *query)
\r
1162 + _notmuch_token_t *toks = lex (qparser, query);
\r
1163 + return parse (qparser, toks);
\r
1166 +_notmuch_token_t *
\r
1167 +_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root)
\r
1169 + struct _notmuch_qparser_transform *t;
\r
1170 + for (t = qparser->transforms; t; t = t->next)
\r
1171 + root = t->transform (root, t->opaque);
\r
1176 +_notmuch_qparser_generate (_notmuch_qparser_t *qparser, _notmuch_token_t *root)
\r
1178 + struct _notmuch_generate_state *s;
\r
1179 + s = talloc (qparser, struct _notmuch_generate_state);
\r
1180 + s->qparser = qparser;
\r
1182 + Xapian::Query query = generate (s, root);
\r
1183 + talloc_free (s);
\r
1184 + if (query.empty())
\r
1185 + /* Return all documents */
\r
1186 + return Xapian::Query ("", 1, 0);
\r