88/587ae9af3dd740b78b933b55d98a1fa93f5f33

   1 Return-Path: <amthrax@drake.mit.edu>\r
   2 X-Original-To: notmuch@notmuchmail.org\r
   3 Delivered-To: notmuch@notmuchmail.org\r
   4 Received: from localhost (localhost [127.0.0.1])\r
   5         by olra.theworths.org (Postfix) with ESMTP id B10C642D292\r
   6         for <notmuch@notmuchmail.org>; Sun, 16 Jan 2011 00:11:19 -0800 (PST)\r
   7 X-Virus-Scanned: Debian amavisd-new at olra.theworths.org\r
   8 X-Spam-Flag: NO\r
   9 X-Spam-Score: 0\r
  10 X-Spam-Level: \r
  11 X-Spam-Status: No, score=0 tagged_above=-999 required=5 tests=[none]\r
  12         autolearn=disabled\r
  13 Received: from olra.theworths.org ([127.0.0.1])\r
  14         by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024)\r
  15         with ESMTP id DZOMkyqUJ2Ep for <notmuch@notmuchmail.org>;\r
  16         Sun, 16 Jan 2011 00:11:16 -0800 (PST)\r
  17 Received: from dmz-mailsec-scanner-1.mit.edu (DMZ-MAILSEC-SCANNER-1.MIT.EDU\r
  18         [18.9.25.12])\r
  19         by olra.theworths.org (Postfix) with ESMTP id 6C2A842D291\r
  20         for <notmuch@notmuchmail.org>; Sun, 16 Jan 2011 00:11:16 -0800 (PST)\r
  21 X-AuditID: 1209190c-b7ba9ae0000009f8-b0-4d32a823744c\r
  22 Received: from mailhub-auth-4.mit.edu ( [18.7.62.39])\r
  23         by dmz-mailsec-scanner-1.mit.edu (Symantec Brightmail Gateway) with\r
  24         SMTP id 8B.ED.02552.328A23D4; Sun, 16 Jan 2011 03:11:15 -0500 (EST)\r
  25 Received: from outgoing.mit.edu (OUTGOING-AUTH.MIT.EDU [18.7.22.103])\r
  26         by mailhub-auth-4.mit.edu (8.13.8/8.9.2) with ESMTP id p0G8BFTo024328; \r
  27         Sun, 16 Jan 2011 03:11:15 -0500\r
  28 Received: from drake.mit.edu (a074.catapulsion.net [70.36.81.74])\r
  29         (authenticated bits=0)\r
  30         (User authenticated as amdragon@ATHENA.MIT.EDU)\r
  31         by outgoing.mit.edu (8.13.6/8.12.4) with ESMTP id p0G8BCnm010513\r
  32         (version=TLSv1/SSLv3 cipher=AES256-SHA bits=256 verify=NOT);\r
  33         Sun, 16 Jan 2011 03:11:14 -0500 (EST)\r
  34 Received: from amthrax by drake.mit.edu with local (Exim 4.72)\r
  35         (envelope-from <amthrax@drake.mit.edu>)\r
  36         id 1PeNhg-0002XO-Ko; Sun, 16 Jan 2011 03:11:12 -0500\r
  37 From: Austin Clements <amdragon@MIT.EDU>\r
  38 To: notmuch@notmuchmail.org\r
  39 Subject: [PATCH 1/8] Implement a custom query parser with a mostly Xapian-compatible grammar.\r
  40 Date: Sun, 16 Jan 2011 03:10:51 -0500\r
  41 Message-Id: <1295165458-9573-2-git-send-email-amdragon@mit.edu>\r
  42 X-Mailer: git-send-email 1.7.2.3\r
  43 In-Reply-To: <1295165458-9573-1-git-send-email-amdragon@mit.edu>\r
  44 References: <1295165458-9573-1-git-send-email-amdragon@mit.edu>\r
  45 MIME-Version: 1.0\r
  46 Content-Type: text/plain; charset=UTF-8\r
  47 Content-Transfer-Encoding: 8bit\r
  48 X-Brightmail-Tracker: AAAAARcg3TI=\r
  49 Cc: amdragon@mit.edu\r
  50 X-BeenThere: notmuch@notmuchmail.org\r
  51 X-Mailman-Version: 2.1.13\r
  52 Precedence: list\r
  53 List-Id: "Use and development of the notmuch mail system."\r
  54         <notmuch.notmuchmail.org>\r
  55 List-Unsubscribe: <http://notmuchmail.org/mailman/options/notmuch>,\r
  56         <mailto:notmuch-request@notmuchmail.org?subject=unsubscribe>\r
  57 List-Archive: <http://notmuchmail.org/pipermail/notmuch>\r
  58 List-Post: <mailto:notmuch@notmuchmail.org>\r
  59 List-Help: <mailto:notmuch-request@notmuchmail.org?subject=help>\r
  60 List-Subscribe: <http://notmuchmail.org/mailman/listinfo/notmuch>,\r
  61         <mailto:notmuch-request@notmuchmail.org?subject=subscribe>\r
  62 X-List-Received-Date: Sun, 16 Jan 2011 08:11:19 -0000\r
  63 \r
  64 This parser takes an extra step through an intermediate representation\r
  65 that is convenient to manipulate via pluggable transformation passes.\r
  66 These are used to implement regular Xapian-style query prefixes, but\r
  67 are flexible enough to accomplish far more.\r
  68 ---\r
  69  lib/Makefile.local     |    1 +\r
  70  lib/database-private.h |    9 +\r
  71  lib/notmuch-private.h  |  134 +++++++\r
  72  lib/qparser.cc         |  920 ++++++++++++++++++++++++++++++++++++++++++++++++\r
  73  4 files changed, 1064 insertions(+), 0 deletions(-)\r
  74  create mode 100644 lib/qparser.cc\r
  75 \r
  76 diff --git a/lib/Makefile.local b/lib/Makefile.local\r
  77 index 37d1c0d..37d3735 100644\r
  78 --- a/lib/Makefile.local\r
  79 +++ b/lib/Makefile.local\r
  80 @@ -64,6 +64,7 @@ libnotmuch_cxx_srcs =         \\r
  81         $(dir)/index.cc         \\r
  82         $(dir)/message.cc       \\r
  83         $(dir)/query.cc         \\r
  84 +       $(dir)/qparser.cc       \\r
  85         $(dir)/thread.cc\r
  86  \r
  87  libnotmuch_modules = $(libnotmuch_c_srcs:.c=.o) $(libnotmuch_cxx_srcs:.cc=.o)\r
  88 diff --git a/lib/database-private.h b/lib/database-private.h\r
  89 index 9f83407..5d2fa02 100644\r
  90 --- a/lib/database-private.h\r
  91 +++ b/lib/database-private.h\r
  92 @@ -64,6 +64,15 @@ _notmuch_get_terms_with_prefix (void *ctx, Xapian::TermIterator &i,\r
  93                                 Xapian::TermIterator &end,\r
  94                                 const char *prefix);\r
  95  \r
  96 +/* qparser.cc */\r
  97 +\r
  98 +/* Generate a Xapian query from a query AST.  If an error occurs,\r
  99 + * *error_out will be set to the text of that error.  Otherwise,\r
 100 + * *error_out will be set to NULL. */\r
 101 +Xapian::Query\r
 102 +_notmuch_qparser_generate (const void *ctx, _notmuch_qparser_t *qparser,\r
 103 +                          _notmuch_token_t *root, char **error_out);\r
 104 +\r
 105  #pragma GCC visibility pop\r
 106  \r
 107  #endif\r
 108 diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h\r
 109 index b6f1095..06239b9 100644\r
 110 --- a/lib/notmuch-private.h\r
 111 +++ b/lib/notmuch-private.h\r
 112 @@ -508,6 +508,140 @@ notmuch_filenames_t *\r
 113  _notmuch_filenames_create (const void *ctx,\r
 114                            notmuch_string_list_t *list);\r
 115  \r
 116 +/* qparser.cc */\r
 117 +\r
 118 +typedef struct _notmuch_qparser _notmuch_qparser_t;\r
 119 +\r
 120 +enum _notmuch_token_type {\r
 121 +    /* These first four token types appear only in the lexer output\r
 122 +     * and never in the parse tree. */\r
 123 +    TOK_LOVE, TOK_HATE, TOK_BRA, TOK_KET,\r
 124 +    /* Binary operators.  These should have left and right children. */\r
 125 +    TOK_AND, TOK_OR, TOK_XOR,\r
 126 +    /* Unary operators.  These have only a left child.  Xapian::Query\r
 127 +     * has no pure NOT operator, so the generator treats NOT as the\r
 128 +     * child of an AND specially, and otherwise represents it as\r
 129 +     * "<all> AND_NOT x".  FILTER ignores the weights of the subquery\r
 130 +     * and generates Xapian::Query::OP_FILTER if the left child of an\r
 131 +     * AND or Xapian::Query::OP_SCALE_WEIGHT otherwise.  The text\r
 132 +     * field of a PREFIX operator specifies the prefix.  PREFIX\r
 133 +     * operators specify only syntactic prefixes, not database\r
 134 +     * prefixes, and thus have no effect on the generated query. */\r
 135 +    TOK_NOT, TOK_FILTER, TOK_PREFIX,\r
 136 +    /* A single TOK_TERMS token can represent a single term, a quoted\r
 137 +     * phrase, or an implicit phrase.  An implicit phrase is something\r
 138 +     * like "foo/bar", for which the database contains two separate\r
 139 +     * terms, but you want to treat them as a phrase, even though it's\r
 140 +     * not quoted.  Xapian calls characters that implicitly connect\r
 141 +     * terms into phrases "phrase generators."  We take a simpler\r
 142 +     * approach and treat almost any non-whitespace character as a\r
 143 +     * phrase generator. */\r
 144 +    TOK_TERMS,\r
 145 +    /* Like TOK_TERMS, but the term text should be taken literally,\r
 146 +     * with no phrase splitting or whitespace removal.  The lexer\r
 147 +     * only generates TOK_TERMS; the parser creates TOK_LIT. */\r
 148 +    TOK_LIT,\r
 149 +    /* An error token.  An error token anywhere in the parse tree will\r
 150 +     * be propagated up by the generator and returned to the caller.\r
 151 +     * The error message should be in the text. */\r
 152 +    TOK_ERROR,\r
 153 +    /* TOK_END indicates the end of the token list.  Such tokens loop\r
 154 +     * back on themselves so it's always safe to follow "next".\r
 155 +     * These appear only in the lexer output. */\r
 156 +    TOK_END\r
 157 +};\r
 158 +\r
 159 +typedef struct _notmuch_token {\r
 160 +    enum _notmuch_token_type type;\r
 161 +    const char *text;\r
 162 +\r
 163 +    /* For TOK_PREFIX, the flags of this prefix. */\r
 164 +    int prefixFlags;\r
 165 +\r
 166 +    /* For TOK_TERMS and TOK_LIT, the database prefix to use when\r
 167 +     * generating database terms.  This must be filled in a\r
 168 +     * transformation pass. */\r
 169 +    const char *prefix;\r
 170 +\r
 171 +    /* Link in the lexer token list. */\r
 172 +    struct _notmuch_token *next;\r
 173 +\r
 174 +    /* Links in the intermediate AST. */\r
 175 +    struct _notmuch_token *left, *right;\r
 176 +} _notmuch_token_t;\r
 177 +\r
 178 +_notmuch_token_t *\r
 179 +_notmuch_token_create_op (const void *ctx, enum _notmuch_token_type type,\r
 180 +                         _notmuch_token_t *left, _notmuch_token_t *right);\r
 181 +\r
 182 +_notmuch_token_t *\r
 183 +_notmuch_token_create_term (const void *ctx, enum _notmuch_token_type type,\r
 184 +                           const char *text);\r
 185 +\r
 186 +char *\r
 187 +_notmuch_token_show (const void *ctx, _notmuch_token_t *tok);\r
 188 +\r
 189 +char *\r
 190 +_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok);\r
 191 +\r
 192 +char *\r
 193 +_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root);\r
 194 +\r
 195 +_notmuch_qparser_t *\r
 196 +_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch);\r
 197 +\r
 198 +/* Add a syntactic prefix.  This will appear as a TOK_PREFIX in the\r
 199 + * AST, but does not alone affect the final query.\r
 200 + *\r
 201 + * The literal flag affects lexing.  If true, this prefix must be\r
 202 + * followed by a regular term or quoted literal, which will not be\r
 203 + * stripped of whitespace or split in to a phrase.  The boolean flag\r
 204 + * affects parsing.  If true, then terms with this prefix will be\r
 205 + * combined into the query using the FILTER operator, so they must\r
 206 + * appear in the result and will not contribute to weights.  Xapian's\r
 207 + * "boolean prefixes" are both literal and boolean.\r
 208 + */\r
 209 +void\r
 210 +_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser,\r
 211 +                            const char *prefix, notmuch_bool_t literal,\r
 212 +                            notmuch_bool_t boolean);\r
 213 +\r
 214 +/* Add a transform pass to a query parser.  The transform function\r
 215 + * will be called with the root of the AST and should return a new AST\r
 216 + * root (which may be the same as the old root).\r
 217 + */\r
 218 +void\r
 219 +_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser,\r
 220 +                               _notmuch_token_t *(*transform) (\r
 221 +                                   _notmuch_token_t *ast, void *opaque),\r
 222 +                               void *opaque);\r
 223 +\r
 224 +/* Add a syntactic prefix (field) and a transform pass to transform\r
 225 + * that syntactic prefix into a database prefix (prefix).  This\r
 226 + * corresponds to Xapian's add_prefix and add_boolean_prefix\r
 227 + * functions. */\r
 228 +void\r
 229 +_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser,\r
 230 +                               const char *field, const char *prefix,\r
 231 +                               notmuch_bool_t boolean);\r
 232 +\r
 233 +/* Lex a query string, returning the first token in the token list.\r
 234 + * This is only meant for testing. */\r
 235 +_notmuch_token_t *\r
 236 +_notmuch_qparser_lex (const void *ctx, _notmuch_qparser_t *qparser,\r
 237 +                     const char *query);\r
 238 +\r
 239 +/* Parse a query string, returning the root of the AST. */\r
 240 +_notmuch_token_t *\r
 241 +_notmuch_qparser_parse (const void *ctx, _notmuch_qparser_t *qparser,\r
 242 +                       const char *query);\r
 243 +\r
 244 +/* Transform a parsed query, running the transforms in the order they\r
 245 + * were added to the query parser.  Return the root of the transformed\r
 246 + * AST. */\r
 247 +_notmuch_token_t *\r
 248 +_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root);\r
 249 +\r
 250  #pragma GCC visibility pop\r
 251  \r
 252  NOTMUCH_END_DECLS\r
 253 diff --git a/lib/qparser.cc b/lib/qparser.cc\r
 254 new file mode 100644\r
 255 index 0000000..b86a445\r
 256 --- /dev/null\r
 257 +++ b/lib/qparser.cc\r
 258 @@ -0,0 +1,920 @@\r
 259 +/* qparser.cc - Notmuch query parser\r
 260 + *\r
 261 + * Copyright © 2010 Austin Clements\r
 262 + *\r
 263 + * This program is free software: you can redistribute it and/or modify\r
 264 + * it under the terms of the GNU General Public License as published by\r
 265 + * the Free Software Foundation, either version 3 of the License, or\r
 266 + * (at your option) any later version.\r
 267 + *\r
 268 + * This program is distributed in the hope that it will be useful,\r
 269 + * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
 270 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
 271 + * GNU General Public License for more details.\r
 272 + *\r
 273 + * You should have received a copy of the GNU General Public License\r
 274 + * along with this program.  If not, see http://www.gnu.org/licenses/ .\r
 275 + *\r
 276 + * Author: Austin Clements <amdragon@mit.edu>\r
 277 + */\r
 278 +\r
 279 +/*\r
 280 + * Query parsing is performed in a series of phases similar to those\r
 281 + * of a traditional compiler.\r
 282 + *\r
 283 + * 1) We tokenize the query, identifying operators and term phrases.\r
 284 + *    Note that a phrase (quoted or implicit) generates a single token\r
 285 + *    at this point, which we split up later (unlike in the Xapian\r
 286 + *    lexer).\r
 287 + * 2) We parse the token stream, generating an intermediate\r
 288 + *    representation in the form of a binary AST.  This IR is similar\r
 289 + *    to the Xapian::Query operators, but is designed to be more\r
 290 + *    easily manipulated.\r
 291 + * 3) We transform the parse tree, running a sequence of\r
 292 + *    caller-provided transformation functions over the tree.\r
 293 + * 4) We generate the Xapian::Query from the transformed IR.  This\r
 294 + *    step also splits phrase tokens into multiple query terms.\r
 295 + *\r
 296 + * To use the query parser, call _notmuch_qparser_parse to perform\r
 297 + * steps 1 and 2, _notmuch_qparser_transform to perform step 3, and\r
 298 + * _notmuch_qparser_generate to perform step 4.\r
 299 + *\r
 300 + * Still missing from this implementation:\r
 301 + * * NEAR/ADJ operators\r
 302 + * * Stemming - The stemming should probably be marked on TOK_TERMS\r
 303 + *   tokens.  Ideally, we can just pass this to the term generator.\r
 304 + * * Wildcard queries - This should be available in the IR so it's\r
 305 + *   easy to generate wildcard queries in a transformer.\r
 306 + * * Value ranges in the IR\r
 307 + * * Queries "" and "*"\r
 308 + */\r
 309 +\r
 310 +/* XXX notmuch currently registers "tag" as an exclusive boolean\r
 311 + * prefix, which means queries like "tag:x tag:y" will return messages\r
 312 + * with tag x OR tag y.  Is this intentional? */\r
 313 +\r
 314 +#include "notmuch-private.h"\r
 315 +#include "database-private.h"\r
 316 +\r
 317 +#include <glib.h>              /* GHashTable */\r
 318 +\r
 319 +struct _notmuch_qparser {\r
 320 +    notmuch_database_t *notmuch;\r
 321 +\r
 322 +    /* Maps from prefix strings to PREFIX_* flags */\r
 323 +    GHashTable *prefixes;\r
 324 +\r
 325 +    struct _notmuch_qparser_transform *transforms;\r
 326 +    struct _notmuch_qparser_transform **transformsTail;\r
 327 +};\r
 328 +\r
 329 +enum {\r
 330 +    PREFIX_LITERAL = 1<<1,\r
 331 +    PREFIX_PROB    = 1<<2,\r
 332 +    PREFIX_BOOL    = 1<<3,\r
 333 +};\r
 334 +\r
 335 +struct _notmuch_qparser_transform {\r
 336 +    struct _notmuch_qparser_transform *next;\r
 337 +    _notmuch_token_t *(*transform) (_notmuch_token_t *ast, void *opaque);\r
 338 +    void *opaque;\r
 339 +};\r
 340 +\r
 341 +struct _notmuch_lex_state {\r
 342 +    const void *ctx;\r
 343 +    _notmuch_qparser_t *qparser;\r
 344 +    _notmuch_token_t *head;\r
 345 +    _notmuch_token_t **tail;\r
 346 +};\r
 347 +\r
 348 +struct _notmuch_parse_state {\r
 349 +    const void *ctx;\r
 350 +    _notmuch_qparser_t *qparser;\r
 351 +};\r
 352 +\r
 353 +struct _notmuch_generate_state {\r
 354 +    const void *ctx;\r
 355 +    _notmuch_qparser_t *qparser;\r
 356 +    unsigned int termpos;\r
 357 +    char *error;\r
 358 +};\r
 359 +\r
 360 +static const char *token_types[] = {\r
 361 +    "LOVE", "HATE", "BRA", "KET",\r
 362 +    "AND", "OR", "XOR",\r
 363 +    "NOT", "FILTER", "PREFIX",\r
 364 +    "TERMS", "LIT", "ERROR", "END"\r
 365 +};\r
 366 +\r
 367 +/* The distinguished end token.  This simplifies the parser since it\r
 368 + * never has to worry about dereferencing next. */\r
 369 +static _notmuch_token_t tok_end = {TOK_END, NULL, FALSE, NULL,\r
 370 +                                  &tok_end, NULL, NULL};\r
 371 +\r
 372 +_notmuch_token_t *\r
 373 +_notmuch_token_create_op (const void *ctx, enum _notmuch_token_type type,\r
 374 +                         _notmuch_token_t *left, _notmuch_token_t *right)\r
 375 +{\r
 376 +    _notmuch_token_t *tok = talloc (ctx, struct _notmuch_token);\r
 377 +    memset (tok, 0, sizeof (*tok));\r
 378 +    tok->type = type;\r
 379 +    tok->left = left;\r
 380 +    tok->right = right;\r
 381 +    return tok;\r
 382 +}\r
 383 +\r
 384 +_notmuch_token_t *\r
 385 +_notmuch_token_create_term (const void *ctx, enum _notmuch_token_type type,\r
 386 +                           const char *text)\r
 387 +{\r
 388 +    _notmuch_token_t *tok = _notmuch_token_create_op (ctx, type, NULL, NULL);\r
 389 +    tok->text = text;\r
 390 +    return tok;\r
 391 +}\r
 392 +\r
 393 +char *\r
 394 +_notmuch_token_show (const void *ctx, _notmuch_token_t *tok)\r
 395 +{\r
 396 +    int ispre = tok->type == TOK_PREFIX;\r
 397 +\r
 398 +    if ((unsigned)tok->type > TOK_END)\r
 399 +       return talloc_asprintf (ctx, "<bad type %d>", tok->type);\r
 400 +\r
 401 +    if (tok->type == TOK_TERMS)\r
 402 +       return talloc_asprintf (ctx, "\"%s\"", tok->text);\r
 403 +    else if (tok->type == TOK_LIT)\r
 404 +       return talloc_asprintf (ctx, "'%s'", tok->text);\r
 405 +    else if (tok->type == TOK_ERROR)\r
 406 +       return talloc_asprintf (ctx, "ERROR/\"%s\"", tok->text);\r
 407 +\r
 408 +    return talloc_asprintf (ctx, "%s%s%s",\r
 409 +                           token_types[tok->type],\r
 410 +                           ispre ? "/" : "",\r
 411 +                           ispre ? tok->text : "");\r
 412 +}\r
 413 +\r
 414 +char *\r
 415 +_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok)\r
 416 +{\r
 417 +    char *out = talloc_strdup (ctx, "");\r
 418 +\r
 419 +    for (; tok->type != TOK_END; tok = tok->next) {\r
 420 +       char *t = _notmuch_token_show (ctx, tok);\r
 421 +       out = talloc_asprintf_append (out, "%s%s", *out == 0 ? "" : " ", t);\r
 422 +       talloc_free (t);\r
 423 +    }\r
 424 +\r
 425 +    return out;\r
 426 +}\r
 427 +\r
 428 +char *\r
 429 +_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root)\r
 430 +{\r
 431 +    if (!root) {\r
 432 +       return talloc_strdup (ctx, "<nil>");\r
 433 +    } else if (!root->left && !root->right) {\r
 434 +       return _notmuch_token_show (ctx, root);\r
 435 +    } else {\r
 436 +       void *local = talloc_new (ctx);\r
 437 +       char *out = talloc_asprintf\r
 438 +           (ctx, "(%s%s%s%s%s)", _notmuch_token_show (local, root),\r
 439 +            root->left ? " " : "",\r
 440 +            root->left ? _notmuch_token_show_tree (local, root->left) : "",\r
 441 +            root->right ? " " : "",\r
 442 +            root->right ? _notmuch_token_show_tree (local, root->right) : "");\r
 443 +       talloc_free (local);\r
 444 +       return out;\r
 445 +    }\r
 446 +}\r
 447 +\r
 448 +using Xapian::Unicode::is_whitespace;\r
 449 +using Xapian::Unicode::is_wordchar;\r
 450 +\r
 451 +static Xapian::Utf8Iterator\r
 452 +lex_skip_ws (Xapian::Utf8Iterator it)\r
 453 +{\r
 454 +    while (is_whitespace (*it))\r
 455 +       it++;\r
 456 +    return it;\r
 457 +}\r
 458 +\r
 459 +static struct _notmuch_token *\r
 460 +lex_emit (struct _notmuch_lex_state *s, enum _notmuch_token_type type,\r
 461 +         char *text)\r
 462 +{\r
 463 +    _notmuch_token_t *tok = _notmuch_token_create_term (s->ctx, type, text);\r
 464 +    tok->next = &tok_end;\r
 465 +    *(s->tail) = tok;\r
 466 +    s->tail = &tok->next;\r
 467 +    return tok;\r
 468 +}\r
 469 +\r
 470 +/* Lex a quoted phrase, returning an iterator pointing to the\r
 471 + * character following the closing quote.  If escaped, then accept two\r
 472 + * quotes as an escaped version of a single quote.\r
 473 + */\r
 474 +static Xapian::Utf8Iterator\r
 475 +lex_quoted_phrase (struct _notmuch_lex_state *s,\r
 476 +                  Xapian::Utf8Iterator it, notmuch_bool_t escaped)\r
 477 +{\r
 478 +    Xapian::Utf8Iterator next, orig, end;\r
 479 +    char *term, *src, *dst;\r
 480 +\r
 481 +    /* Find the end of the phrase */\r
 482 +    assert (*(it++) == '"');\r
 483 +    for (next = it; next != end; ++next) {\r
 484 +       if (*next == '"') {\r
 485 +           if (!escaped)\r
 486 +               break;\r
 487 +\r
 488 +           orig = next++;\r
 489 +           if (next == end || *next != '"') {\r
 490 +               next = orig;\r
 491 +               break;\r
 492 +           }\r
 493 +       }\r
 494 +    }\r
 495 +\r
 496 +    /* Xapian still lexes +/-/( in quotes mode and simply doesn't\r
 497 +     * generate tokens for them.  For us, the term generator will\r
 498 +     * discard them. */\r
 499 +    term = talloc_strndup (s->ctx, it.raw (), next.raw () - it.raw ());\r
 500 +    if (escaped) {\r
 501 +       /* Replace doubled quotes with a single quote. */\r
 502 +       for (src = dst = term; *src; ++src, ++dst) {\r
 503 +           *dst = *src;\r
 504 +           if (*src == '"')\r
 505 +               ++src;\r
 506 +       }\r
 507 +       *dst = '\0';\r
 508 +    }\r
 509 +    lex_emit (s, TOK_TERMS, term);\r
 510 +\r
 511 +    if (next != end)\r
 512 +       ++next;\r
 513 +    return next;\r
 514 +}\r
 515 +\r
 516 +static Xapian::Utf8Iterator\r
 517 +lex_try_consume_prefix (struct _notmuch_lex_state *s, Xapian::Utf8Iterator it,\r
 518 +                       char **prefixOut, int *flagsOut)\r
 519 +{\r
 520 +    Xapian::Utf8Iterator next (it), end;\r
 521 +    char *prefix;\r
 522 +    gpointer value = 0, orig;\r
 523 +    int flags;\r
 524 +\r
 525 +    *prefixOut = NULL;\r
 526 +    while (next != end && *next != ':' && !is_whitespace (*next))\r
 527 +       ++next;\r
 528 +    if (*next != ':')\r
 529 +       return it;\r
 530 +    /* Ignore if followed by <= ' ' or ')' */\r
 531 +    ++next;\r
 532 +    if (*next <= ' ' || *next == ')')\r
 533 +       return it;\r
 534 +\r
 535 +    prefix = talloc_strndup (s->ctx, it.raw (), next.raw () - it.raw() - 1);\r
 536 +    g_hash_table_lookup_extended (s->qparser->prefixes, prefix, &orig, &value);\r
 537 +    flags = GPOINTER_TO_INT (value);\r
 538 +    talloc_free (prefix);\r
 539 +\r
 540 +    if (!flags)\r
 541 +       /* Not a known prefix */\r
 542 +       return it;\r
 543 +    *prefixOut = (char*)orig;\r
 544 +    *flagsOut = flags;\r
 545 +    return next;\r
 546 +}\r
 547 +\r
 548 +static Xapian::Utf8Iterator\r
 549 +lex_consume_term (struct _notmuch_lex_state *s, Xapian::Utf8Iterator it,\r
 550 +                 char **termOut)\r
 551 +{\r
 552 +    Xapian::Utf8Iterator next (it), end;\r
 553 +    /* Xapian permits other characters to separate term phrases.  For\r
 554 +     * example, "x#y" is parsed as two separate (non-phrase) terms.\r
 555 +     * However, because the characters allowed in a term are\r
 556 +     * context-sensitive, replicating this is very hard.  Here we take\r
 557 +     * a simpler approach where only whitespace and a few operator\r
 558 +     * characters that are never term characters separate terms. */\r
 559 +    while (next != end && !strchr ("()\"", *next) && !is_whitespace (*next))\r
 560 +       ++next;\r
 561 +    *termOut = talloc_strndup (s->ctx, it.raw (), next.raw () - it.raw ());\r
 562 +    return next;\r
 563 +}\r
 564 +\r
 565 +static notmuch_bool_t\r
 566 +lex_operator (struct _notmuch_lex_state *s, char *term,\r
 567 +             const char *op, enum _notmuch_token_type type)\r
 568 +{\r
 569 +    if (strcasecmp (term, op) == 0) {\r
 570 +       lex_emit (s, type, term);\r
 571 +       return true;\r
 572 +    }\r
 573 +    \r
 574 +    return false;\r
 575 +}\r
 576 +\r
 577 +static _notmuch_token_t *\r
 578 +lex (const void *ctx, _notmuch_qparser_t *qparser, const char *query)\r
 579 +{\r
 580 +    Xapian::Utf8Iterator it (query), next, end;\r
 581 +    struct _notmuch_lex_state state = {ctx, qparser, &tok_end, &state.head};\r
 582 +    struct _notmuch_lex_state *s = &state;\r
 583 +    struct _notmuch_token *tok;\r
 584 +    char *term;\r
 585 +    int prefixFlags, literal;\r
 586 +\r
 587 +    while (it != end) {\r
 588 +       unsigned ch;\r
 589 +       if ((it = lex_skip_ws (it)) == end)\r
 590 +           break;\r
 591 +\r
 592 +       ch = *it;\r
 593 +       switch (ch) {\r
 594 +       case '+':\r
 595 +       case '-':\r
 596 +           ++it;\r
 597 +           /* Xapian ignores these unless preceded by whitespace or\r
 598 +            * an open paren, which has the effect of ignoring all\r
 599 +            * +'s in "x +++y", "x#+y", and "(x)+y".  We don't\r
 600 +            * bother. */\r
 601 +\r
 602 +           /* Ignore if followed by a space or another + or - */\r
 603 +           if (is_whitespace (*it) || *it == '+' || *it == '-')\r
 604 +               continue;\r
 605 +           lex_emit (s, ch == '+' ? TOK_LOVE : TOK_HATE, NULL);\r
 606 +           continue;\r
 607 +\r
 608 +       case '"':\r
 609 +           it = lex_quoted_phrase(s, it, false);\r
 610 +           continue;\r
 611 +\r
 612 +       case '(':\r
 613 +           ++it;\r
 614 +           /* Xapian ignores this unless preceded by whitespace,\r
 615 +            * parens, +, or -.  We don't bother. */\r
 616 +           lex_emit (s, TOK_BRA, NULL);\r
 617 +           continue;\r
 618 +\r
 619 +       case ')':\r
 620 +           ++it;\r
 621 +           lex_emit (s, TOK_KET, NULL);\r
 622 +           continue;\r
 623 +       }\r
 624 +\r
 625 +       /* Scan for a prefix */\r
 626 +       next = lex_try_consume_prefix (s, it, &term, &prefixFlags);\r
 627 +       literal = prefixFlags & PREFIX_LITERAL;\r
 628 +       if (term && next != end && *next > ' ' && *next != ')' &&\r
 629 +           /* Non-literal prefixes are picky about the next character. */\r
 630 +           (literal || *next == '"' || *next == '(' || is_wordchar (*next))) {\r
 631 +           tok = lex_emit (s, TOK_PREFIX, term);\r
 632 +           tok->prefixFlags = prefixFlags;\r
 633 +\r
 634 +           it = next;\r
 635 +           if (literal && *it == '"') {\r
 636 +               /* Literal quoted strings keep everything and allow\r
 637 +                * quote escaping, unlike regular quoted phrases. */\r
 638 +               it = lex_quoted_phrase (s, it, true);\r
 639 +           } else if (literal) {\r
 640 +               /* Xapian uses anything up to the next space or ')'\r
 641 +                * (because literal prefixes can't be applied to\r
 642 +                * subqueries).  I disagree with Xapian here, since\r
 643 +                * Xapian will keep the open paren but not the close\r
 644 +                * paren.  Better would be to balance them. */\r
 645 +               if (*next == '(')\r
 646 +                   ++next;\r
 647 +               while (next != end && *next > ' ' && *next != ')')\r
 648 +                   ++next;\r
 649 +               term = talloc_strndup (s->ctx, it.raw (),\r
 650 +                                      next.raw () - it.raw ());\r
 651 +               lex_emit (s, TOK_TERMS, term);\r
 652 +               it = next;\r
 653 +           }\r
 654 +           continue;\r
 655 +       }\r
 656 +\r
 657 +       /* Scan for a term phrase or operator */\r
 658 +       it = lex_consume_term (s, it, &term);\r
 659 +\r
 660 +       /* Check operators */\r
 661 +       if (lex_operator (s, term, "and",  TOK_AND) ||\r
 662 +           lex_operator (s, term, "not",  TOK_NOT) ||\r
 663 +           lex_operator (s, term, "xor",  TOK_XOR) ||\r
 664 +           lex_operator (s, term, "or",   TOK_OR))\r
 665 +           continue;\r
 666 +\r
 667 +       /* Must be a term */\r
 668 +       lex_emit (s, TOK_TERMS, term);\r
 669 +    }\r
 670 +\r
 671 +    return s->head;\r
 672 +}\r
 673 +\r
 674 +static void\r
 675 +add_to_query (const void *ctx, _notmuch_token_t **query,\r
 676 +             _notmuch_token_type op, _notmuch_token_t *right)\r
 677 +{\r
 678 +    if (!*query)\r
 679 +       *query = right;\r
 680 +    else if (right)\r
 681 +       *query = _notmuch_token_create_op (ctx, op, *query, right);\r
 682 +}\r
 683 +\r
 684 +static _notmuch_token_t *\r
 685 +parse_expr (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok);\r
 686 +\r
 687 +static _notmuch_token_t *\r
 688 +parse_prob (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)\r
 689 +{\r
 690 +    /* A prob is a sequence of three types of subqueries.  Because the\r
 691 +     * default query operator is AND, loved terms are not treated\r
 692 +     * specially.\r
 693 +     * 1) Probabilistic terms (prefixed or not).  These are combined\r
 694 +     *    with the default query operator, AND.\r
 695 +     * 2) Terms with a boolean prefix.  All of the terms with the same\r
 696 +     *    prefix are combined with OR.  Different prefixes are\r
 697 +     *    combined with AND.\r
 698 +     * 3) Hate terms.  These are combined with OR.\r
 699 +     * The final IR looks like\r
 700 +     *   (probs AND (FILTER bools)) AND (NOT hates)\r
 701 +     */\r
 702 +\r
 703 +    _notmuch_token_t *probs, *hates, *sub, *q;\r
 704 +    GHashTable *bools;\r
 705 +    int done = 0;\r
 706 +\r
 707 +    probs = hates = NULL;\r
 708 +    bools = g_hash_table_new (g_str_hash, g_str_equal);\r
 709 +    while (!done) {\r
 710 +       switch ((*tok)->type) {\r
 711 +       case TOK_KET:\r
 712 +           if (prec < 10) {\r
 713 +               /* Unmatched close paren.  Ignore it. */\r
 714 +               *tok = (*tok)->next;\r
 715 +               break;\r
 716 +           }\r
 717 +           /* Fall through */\r
 718 +       case TOK_AND: case TOK_OR: case TOK_XOR: case TOK_NOT:\r
 719 +       case TOK_END:\r
 720 +           /* End of the prob.  Might be empty. */\r
 721 +           done = 1;\r
 722 +           break;\r
 723 +\r
 724 +       case TOK_HATE:\r
 725 +           *tok = (*tok)->next;\r
 726 +           sub = parse_expr (s, prec + 1, tok);\r
 727 +           add_to_query (s->ctx, &hates, TOK_OR, sub);\r
 728 +           break;\r
 729 +\r
 730 +       case TOK_PREFIX:\r
 731 +           sub = parse_expr (s, prec + 1, tok);\r
 732 +           if (!sub)\r
 733 +               break;\r
 734 +           if (sub->prefixFlags & PREFIX_PROB) {\r
 735 +               add_to_query (s->ctx, &probs, TOK_AND, sub);\r
 736 +           } else {\r
 737 +               _notmuch_token_t *newb, *pre = (_notmuch_token_t*)\r
 738 +                   g_hash_table_lookup (bools, sub->text);\r
 739 +               if (!pre)\r
 740 +                   newb = sub;\r
 741 +               else\r
 742 +                   /* OR subqueries with same prefix */\r
 743 +                   newb = _notmuch_token_create_op (s->ctx, TOK_OR, pre, sub);\r
 744 +               g_hash_table_insert (bools, (void*)sub->text, newb);\r
 745 +           }\r
 746 +           break;\r
 747 +\r
 748 +       case TOK_LOVE:\r
 749 +           /* Join into the query like any other term, since the\r
 750 +            * default operator is AND anyway. */\r
 751 +           *tok = (*tok)->next;\r
 752 +           /* Fall through */\r
 753 +       case TOK_BRA:\r
 754 +       case TOK_TERMS:\r
 755 +       case TOK_LIT:\r
 756 +           sub = parse_expr (s, prec + 1, tok);\r
 757 +           add_to_query (s->ctx, &probs, TOK_AND, sub);\r
 758 +           break;\r
 759 +\r
 760 +       case TOK_FILTER:\r
 761 +       case TOK_ERROR:\r
 762 +           INTERNAL_ERROR ("Unexpected token %s",\r
 763 +                           _notmuch_token_show (s->ctx, *tok));\r
 764 +       }\r
 765 +    }\r
 766 +\r
 767 +    q = probs;\r
 768 +    if (g_hash_table_size (bools)) {\r
 769 +       /* Merge boolean filters */\r
 770 +       _notmuch_token_t *filter;\r
 771 +       GList *vals = g_hash_table_get_values (bools), *l;\r
 772 +       sub = NULL;\r
 773 +       for (l = vals; l; l = l->next)\r
 774 +           add_to_query (s->ctx, &sub, TOK_AND, (_notmuch_token_t *) l->data);\r
 775 +       g_list_free (vals);\r
 776 +\r
 777 +       /* Create filter */\r
 778 +       filter = _notmuch_token_create_op (s->ctx, TOK_FILTER, sub, NULL);\r
 779 +       add_to_query (s->ctx, &q, TOK_AND, filter);\r
 780 +    }\r
 781 +    if (hates) {\r
 782 +       sub = _notmuch_token_create_op (s->ctx, TOK_NOT, hates, NULL);\r
 783 +       add_to_query (s->ctx, &q, TOK_AND, sub);\r
 784 +    }\r
 785 +    g_hash_table_unref (bools);\r
 786 +    return q;\r
 787 +}\r
 788 +\r
 789 +static _notmuch_token_t *\r
 790 +parse_term (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)\r
 791 +{\r
 792 +    _notmuch_token_t *sub;\r
 793 +\r
 794 +    if ((*tok)->type == TOK_END) {\r
 795 +       /* Arises from things like "x -".  Ignore. */\r
 796 +       return NULL;\r
 797 +    } else if ((*tok)->type == TOK_PREFIX) {\r
 798 +       sub = *tok;\r
 799 +       *tok = (*tok)->next;\r
 800 +       sub->left = parse_term (s, prec, tok);\r
 801 +       if (!sub->left)\r
 802 +           return NULL;\r
 803 +       if (sub->prefixFlags & PREFIX_LITERAL) {\r
 804 +           /* Convert TOK_TERMS to TOK_LIT */\r
 805 +           assert (sub->left->type == TOK_TERMS);\r
 806 +           sub->left->type = TOK_LIT;\r
 807 +       } else if (sub->left->type == TOK_PREFIX) {\r
 808 +           sub->left = sub->left->left;\r
 809 +       }\r
 810 +       return sub;\r
 811 +    } else if ((*tok)->type == TOK_BRA) {\r
 812 +       *tok = (*tok)->next;\r
 813 +       sub = parse_expr (s, prec + 10 - (prec%10), tok);\r
 814 +       if ((*tok)->type == TOK_KET)\r
 815 +           *tok = (*tok)->next;\r
 816 +       return sub;\r
 817 +    }\r
 818 +\r
 819 +    if ((*tok)->type != TOK_TERMS && (*tok)->type != TOK_LIT) {\r
 820 +       /* Arises from "+AND", "-AND", "prob:AND".  We could give up\r
 821 +        * and return nothing, but it seems nicer to treat the\r
 822 +        * operator as a term if it came from the original query. */\r
 823 +       if (!(*tok)->text)\r
 824 +           return NULL;\r
 825 +       (*tok)->type = TOK_TERMS;\r
 826 +    }\r
 827 +\r
 828 +    sub = *tok;\r
 829 +    *tok = (*tok)->next;\r
 830 +    return sub;\r
 831 +}\r
 832 +\r
 833 +static _notmuch_token_t *\r
 834 +parse_expr (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)\r
 835 +{\r
 836 +    /* If you squint at the Xapian grammar, it's a precedence grammar\r
 837 +     * with one strange "prob" level.  This implements all but the\r
 838 +     * prob level and the leaf "term" level.\r
 839 +     *\r
 840 +     * prec is (nesting level * 10 + precedence level).  Normally we\r
 841 +     * only care about the precedence level, but the nesting level is\r
 842 +     * important for recovering from unbalanced parens.\r
 843 +     */\r
 844 +    int bprec = prec % 10;\r
 845 +    if (bprec == 3) {\r
 846 +       if ((*tok)->type == TOK_NOT) {\r
 847 +           /* Unary NOT */\r
 848 +           _notmuch_token_t *root = *tok;\r
 849 +           *tok = (*tok)->next;\r
 850 +           root->left = parse_expr (s, prec, tok);\r
 851 +           return root;\r
 852 +       }\r
 853 +\r
 854 +       return parse_prob (s, prec, tok);\r
 855 +    }\r
 856 +    if (bprec == 4)\r
 857 +       return parse_term (s, prec, tok);\r
 858 +\r
 859 +    _notmuch_token_t *left = parse_expr (s, prec + 1, tok);\r
 860 +    while ((bprec == 0 && (*tok)->type == TOK_OR) ||\r
 861 +          (bprec == 1 && (*tok)->type == TOK_XOR) ||\r
 862 +          (bprec == 2 && ((*tok)->type == TOK_AND ||\r
 863 +                          (*tok)->type == TOK_NOT))) {\r
 864 +       _notmuch_token_t *root = *tok;\r
 865 +       if (root->type == TOK_NOT) {\r
 866 +           /* Replace "x NOT y" with (AND x (NOT y)) by inserting an\r
 867 +            * AND operator and leaning on the unary NOT rule. */\r
 868 +           root = _notmuch_token_create_term (s->ctx, TOK_AND, NULL);\r
 869 +       } else {\r
 870 +           *tok = (*tok)->next;\r
 871 +       }\r
 872 +\r
 873 +       /* Xapian treats x AND -y as x AND NOT y, which affects\r
 874 +        * precedence. */\r
 875 +       if (root->type == TOK_AND && (*tok)->type == TOK_HATE)\r
 876 +           (*tok)->type = TOK_NOT;\r
 877 +\r
 878 +       _notmuch_token_t *right = parse_expr (s, prec + 1, tok);\r
 879 +       if (left && right) {\r
 880 +           root->left = left;\r
 881 +           root->right = right;\r
 882 +       } else {\r
 883 +           /* Left or right was empty.  This may be a syntax error\r
 884 +            * like an omitted expression, or an empty expression. */\r
 885 +           root = left ? left : right;\r
 886 +       }\r
 887 +       left = root;\r
 888 +    }\r
 889 +    return left;\r
 890 +}\r
 891 +\r
 892 +static _notmuch_token_t *\r
 893 +parse (struct _notmuch_parse_state *s, _notmuch_token_t *toks)\r
 894 +{\r
 895 +    _notmuch_token_t *root = parse_expr (s, 0, &toks);\r
 896 +    if (toks->type != TOK_END)\r
 897 +       INTERNAL_ERROR ("Token stream not fully consumed: %s",\r
 898 +                       _notmuch_token_show_list (s->ctx, toks));\r
 899 +    return root;\r
 900 +}\r
 901 +\r
 902 +static char *\r
 903 +generate_term (struct _notmuch_generate_state *s, const char *term,\r
 904 +              const char *prefix)\r
 905 +{\r
 906 +    notmuch_bool_t colon = FALSE;\r
 907 +    if (isupper (term[0]) && strlen (prefix) > 1)\r
 908 +       colon = TRUE;\r
 909 +    return talloc_asprintf (s->ctx, "%s%s%s", prefix, colon ? ":" : "", term);\r
 910 +}\r
 911 +\r
 912 +static Xapian::Query\r
 913 +generate_terms (struct _notmuch_generate_state *s, const char *text,\r
 914 +               const char *prefix, int distance, Xapian::Query::op op)\r
 915 +{\r
 916 +    Xapian::TermGenerator tg;\r
 917 +    Xapian::Document doc;\r
 918 +    Xapian::TermIterator it, end;\r
 919 +    Xapian::PositionIterator pit, pend;\r
 920 +    Xapian::Query *qs, q;\r
 921 +    unsigned int nterms = 0;\r
 922 +\r
 923 +    if (prefix)\r
 924 +       tg.index_text (text, 1, prefix);\r
 925 +    else\r
 926 +       tg.index_text (text);\r
 927 +    doc = tg.get_document ();\r
 928 +\r
 929 +    /* Find the highest positioned term.  Positions are 1-based. */\r
 930 +    end = doc.termlist_end ();\r
 931 +    for (it = doc.termlist_begin (); it != end; ++it) {\r
 932 +       pend = it.positionlist_end ();\r
 933 +       for (pit = it.positionlist_begin (); pit != pend; ++pit) {\r
 934 +           if (*pit > nterms)\r
 935 +               nterms = *pit;\r
 936 +       }\r
 937 +    }\r
 938 +    if (nterms == 0)\r
 939 +       return Xapian::Query ();\r
 940 +\r
 941 +    /* Extract terms */\r
 942 +    qs = new Xapian::Query[nterms];\r
 943 +    for (it = doc.termlist_begin (); it != end; ++it) {\r
 944 +       pend = it.positionlist_end ();\r
 945 +       for (pit = it.positionlist_begin (); pit != pend; ++pit)\r
 946 +           qs[*pit - 1] = Xapian::Query (*it, 1, s->termpos + *pit - 1);\r
 947 +    }\r
 948 +    s->termpos += nterms;\r
 949 +\r
 950 +    /* Build query */\r
 951 +    q = Xapian::Query (op, qs, qs + nterms, distance + nterms);\r
 952 +    delete [] qs;\r
 953 +    return q;\r
 954 +}\r
 955 +\r
 956 +static Xapian::Query\r
 957 +generate (struct _notmuch_generate_state *s, _notmuch_token_t *root)\r
 958 +{\r
 959 +    using Xapian::Query;\r
 960 +    Query l, r;\r
 961 +    Query::op op;\r
 962 +\r
 963 +    if (!root)\r
 964 +       return Query ();\r
 965 +\r
 966 +    /* The tricky part here is that generate is allowed to return a\r
 967 +     * empty query, indicating that the user's query cannot be\r
 968 +     * expressed.  For example, the term "#" is an empty query. */\r
 969 +\r
 970 +    switch (root->type) {\r
 971 +    case TOK_AND:\r
 972 +       if (root->left->type == TOK_NOT && root->right->type != TOK_NOT) {\r
 973 +           _notmuch_token_t *tmp = root->left;\r
 974 +           root->left = root->right;\r
 975 +           root->right = tmp;\r
 976 +       }\r
 977 +       l = generate (s, root->left);\r
 978 +       if (l.empty()) {\r
 979 +           return generate (s, root->right);\r
 980 +       } else if (root->right->type == TOK_NOT) {\r
 981 +           r = generate (s, root->right->left);\r
 982 +           op = Query::OP_AND_NOT;\r
 983 +       } else if (root->right->type == TOK_FILTER) {\r
 984 +           r = generate (s, root->right->left);\r
 985 +           op = Query::OP_FILTER;\r
 986 +       } else {\r
 987 +           r = generate (s, root->right);\r
 988 +           op = Query::OP_AND;\r
 989 +       }\r
 990 +       if (r.empty())\r
 991 +           return l;\r
 992 +       return Query (op, l, r);\r
 993 +\r
 994 +    case TOK_NOT:\r
 995 +       l = generate (s, root->left);\r
 996 +       if (l.empty())\r
 997 +           return l;\r
 998 +       return Query (Query::OP_AND_NOT, Query ("", 1, 0), l);\r
 999 +\r
1000 +    case TOK_FILTER:\r
1001 +       l = generate(s, root->left);\r
1002 +       if (l.empty())\r
1003 +           return l;\r
1004 +       return Query (Query::OP_SCALE_WEIGHT, l, 0.0);\r
1005 +\r
1006 +    case TOK_OR:\r
1007 +    case TOK_XOR:\r
1008 +       l = generate (s, root->left);\r
1009 +       r = generate (s, root->right);\r
1010 +       if (l.empty())\r
1011 +           return r;\r
1012 +       if (r.empty())\r
1013 +           return l;\r
1014 +       return Query (root->type == TOK_OR ? Query::OP_OR : Query::OP_XOR,\r
1015 +                     l, r);\r
1016 +\r
1017 +    case TOK_PREFIX:\r
1018 +       return generate (s, root->left);\r
1019 +\r
1020 +    case TOK_TERMS:\r
1021 +       return generate_terms (s, root->text, root->prefix, 0,\r
1022 +                              Query::OP_PHRASE);\r
1023 +\r
1024 +    case TOK_LIT:\r
1025 +       return Query (generate_term (s, root->text, root->prefix));\r
1026 +\r
1027 +    case TOK_ERROR:\r
1028 +       if (!s->error)\r
1029 +           s->error = talloc_strdup (s->ctx, root->text);\r
1030 +       return Query ();\r
1031 +\r
1032 +    case TOK_LOVE:\r
1033 +    case TOK_HATE:\r
1034 +    case TOK_BRA:\r
1035 +    case TOK_KET:\r
1036 +    case TOK_END:\r
1037 +       /* Fall through to the error after the switch */\r
1038 +       break;\r
1039 +    }\r
1040 +    /* We leave this outside the switch so the compiler will warn us\r
1041 +     * if we missed a token type. */\r
1042 +    INTERNAL_ERROR ("Illegal token %s in IR",\r
1043 +                   _notmuch_token_show (s->ctx, root));\r
1044 +    return Xapian::Query ();\r
1045 +}\r
1046 +\r
1047 +static int\r
1048 +_notmuch_qparser_destructor (_notmuch_qparser_t *qparser)\r
1049 +{\r
1050 +    g_hash_table_unref (qparser->prefixes);\r
1051 +    return 0;\r
1052 +}\r
1053 +\r
1054 +_notmuch_qparser_t *\r
1055 +_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch)\r
1056 +{\r
1057 +    _notmuch_qparser_t *qparser = talloc (ctx, _notmuch_qparser_t);\r
1058 +    if (!qparser)\r
1059 +       return NULL;\r
1060 +    qparser->prefixes = NULL;\r
1061 +    talloc_set_destructor (qparser, _notmuch_qparser_destructor);\r
1062 +\r
1063 +    qparser->notmuch = notmuch;\r
1064 +    qparser->prefixes = g_hash_table_new (g_str_hash, g_str_equal);\r
1065 +    qparser->transforms = NULL;\r
1066 +    qparser->transformsTail = &qparser->transforms;\r
1067 +\r
1068 +    return qparser;\r
1069 +}\r
1070 +\r
1071 +void\r
1072 +_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser,\r
1073 +                            const char *prefix, notmuch_bool_t literal,\r
1074 +                            notmuch_bool_t boolean)\r
1075 +{\r
1076 +    int flags = ((literal ? PREFIX_LITERAL : 0) |\r
1077 +                (boolean ? PREFIX_BOOL : PREFIX_PROB));\r
1078 +    g_hash_table_insert (qparser->prefixes, talloc_strdup (qparser, prefix),\r
1079 +                        GINT_TO_POINTER (flags));\r
1080 +}\r
1081 +\r
1082 +void\r
1083 +_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser,\r
1084 +                               _notmuch_token_t *(*transform) (\r
1085 +                                   _notmuch_token_t *ast, void *opaque),\r
1086 +                               void *opaque)\r
1087 +{\r
1088 +    struct _notmuch_qparser_transform *t;\r
1089 +    t = talloc (qparser, struct _notmuch_qparser_transform);\r
1090 +    t->next = NULL;\r
1091 +    t->transform = transform;\r
1092 +    t->opaque = opaque;\r
1093 +    *qparser->transformsTail = t;\r
1094 +    qparser->transformsTail = &t->next;\r
1095 +}\r
1096 +\r
1097 +struct _notmuch_transform_prefix_info {\r
1098 +    char *field, *prefix;\r
1099 +};\r
1100 +\r
1101 +static _notmuch_token_t *\r
1102 +transform_prefix_rec (struct _notmuch_transform_prefix_info *info,\r
1103 +                     _notmuch_token_t *root, notmuch_bool_t active)\r
1104 +{\r
1105 +    if (!root)\r
1106 +       return NULL;\r
1107 +    if (root->type == TOK_PREFIX) {\r
1108 +       active = (strcmp (info->field, root->text) == 0);\r
1109 +    } else if (active && (root->type == TOK_TERMS || root->type == TOK_LIT)) {\r
1110 +       root->prefix = info->prefix;\r
1111 +    }\r
1112 +    transform_prefix_rec (info, root->left, active);\r
1113 +    transform_prefix_rec (info, root->right, active);\r
1114 +    return root;\r
1115 +}\r
1116 +\r
1117 +static _notmuch_token_t *\r
1118 +transform_prefix (_notmuch_token_t *root, void *opaque)\r
1119 +{\r
1120 +    struct _notmuch_transform_prefix_info *info =\r
1121 +       (struct _notmuch_transform_prefix_info*)opaque;\r
1122 +    return transform_prefix_rec (info, root, FALSE);\r
1123 +}\r
1124 +\r
1125 +void\r
1126 +_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser,\r
1127 +                               const char *field, const char *prefix,\r
1128 +                               notmuch_bool_t boolean)\r
1129 +{\r
1130 +    struct _notmuch_transform_prefix_info *info;\r
1131 +    info = talloc (qparser, struct _notmuch_transform_prefix_info);\r
1132 +    info->field = talloc_strdup (info, field);\r
1133 +    info->prefix = talloc_strdup (info, prefix);\r
1134 +    _notmuch_qparser_add_prefix (qparser, field, boolean, boolean);\r
1135 +    _notmuch_qparser_add_transform (qparser, transform_prefix, info);\r
1136 +}\r
1137 +\r
1138 +_notmuch_token_t *\r
1139 +_notmuch_qparser_lex (const void *ctx, _notmuch_qparser_t *qparser,\r
1140 +                     const char *query)\r
1141 +{\r
1142 +    return lex (ctx, qparser, query);\r
1143 +}\r
1144 +\r
1145 +_notmuch_token_t *\r
1146 +_notmuch_qparser_parse (const void *ctx, _notmuch_qparser_t *qparser,\r
1147 +                       const char *query)\r
1148 +{\r
1149 +    struct _notmuch_parse_state state = {ctx, qparser};\r
1150 +    _notmuch_token_t *toks = lex (ctx, qparser, query);\r
1151 +    return parse (&state, toks);\r
1152 +}\r
1153 +\r
1154 +_notmuch_token_t *\r
1155 +_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root)\r
1156 +{\r
1157 +    struct _notmuch_qparser_transform *t;\r
1158 +    for (t = qparser->transforms; t; t = t->next)\r
1159 +       root = t->transform (root, t->opaque);\r
1160 +    return root;\r
1161 +}\r
1162 +\r
1163 +Xapian::Query\r
1164 +_notmuch_qparser_generate (const void *ctx, _notmuch_qparser_t *qparser,\r
1165 +                          _notmuch_token_t *root, char **error_out)\r
1166 +{\r
1167 +    struct _notmuch_generate_state state = {ctx, qparser, 1, NULL};\r
1168 +    Xapian::Query query = generate (&state, root);\r
1169 +    if (state.error) {\r
1170 +       *error_out = state.error;\r
1171 +       return Xapian::Query ();\r
1172 +    }\r
1173 +    *error_out = NULL;\r
1174 +    if (query.empty())\r
1175 +       /* Return all documents */\r
1176 +       return Xapian::Query ("", 1, 0);\r
1177 +    return query;\r
1178 +}\r
1179 -- \r
1180 1.7.2.3\r
1181 \r