d9/ae13a272b8e0b1e35d025f49df1ae6a2df2c7c

   1 Return-Path: <amthrax@awakening.csail.mit.edu>\r
   2 X-Original-To: notmuch@notmuchmail.org\r
   3 Delivered-To: notmuch@notmuchmail.org\r
   4 Received: from localhost (localhost [127.0.0.1])\r
   5         by olra.theworths.org (Postfix) with ESMTP id EEA3E41A578\r
   6         for <notmuch@notmuchmail.org>; Wed, 22 Dec 2010 22:01:00 -0800 (PST)\r
   7 X-Virus-Scanned: Debian amavisd-new at olra.theworths.org\r
   8 X-Spam-Flag: NO\r
   9 X-Spam-Score: 0\r
  10 X-Spam-Level: \r
  11 X-Spam-Status: No, score=0 tagged_above=-999 required=5 tests=[none]\r
  12         autolearn=disabled\r
  13 Received: from olra.theworths.org ([127.0.0.1])\r
  14         by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024)\r
  15         with ESMTP id KHIIz+jV92tG for <notmuch@notmuchmail.org>;\r
  16         Wed, 22 Dec 2010 22:00:47 -0800 (PST)\r
  17 Received: from dmz-mailsec-scanner-2.mit.edu (DMZ-MAILSEC-SCANNER-2.MIT.EDU\r
  18         [18.9.25.13])\r
  19         by olra.theworths.org (Postfix) with ESMTP id D4656431FD0\r
  20         for <notmuch@notmuchmail.org>; Wed, 22 Dec 2010 22:00:45 -0800 (PST)\r
  21 X-AuditID: 1209190d-b7cacae000000a14-49-4d12e58c1219\r
  22 Received: from mailhub-auth-4.mit.edu ( [18.7.62.39])\r
  23         by dmz-mailsec-scanner-2.mit.edu (Symantec Brightmail Gateway) with\r
  24         SMTP id 52.A7.02580.C85E21D4; Thu, 23 Dec 2010 01:00:44 -0500 (EST)\r
  25 Received: from outgoing.mit.edu (OUTGOING-AUTH.MIT.EDU [18.7.22.103])\r
  26         by mailhub-auth-4.mit.edu (8.13.8/8.9.2) with ESMTP id oBN60hbt007101; \r
  27         Thu, 23 Dec 2010 01:00:43 -0500\r
  28 Received: from awakening.csail.mit.edu (awakening.csail.mit.edu [18.26.4.91])\r
  29         (authenticated bits=0)\r
  30         (User authenticated as amdragon@ATHENA.MIT.EDU)\r
  31         by outgoing.mit.edu (8.13.6/8.12.4) with ESMTP id oBN60gnO004788\r
  32         (version=TLSv1/SSLv3 cipher=AES256-SHA bits=256 verify=NOT);\r
  33         Thu, 23 Dec 2010 01:00:43 -0500 (EST)\r
  34 Received: from amthrax by awakening.csail.mit.edu with local (Exim 4.72)\r
  35         (envelope-from <amthrax@awakening.csail.mit.edu>)\r
  36         id 1PVeEE-00024F-Eq; Thu, 23 Dec 2010 01:00:42 -0500\r
  37 From: Austin Clements <amdragon@MIT.EDU>\r
  38 To: notmuch@notmuchmail.org\r
  39 Subject: [PATCH 1/2] Implement a custom query parser with a mostly Xapian-compatible grammar.\r
  40 Date: Thu, 23 Dec 2010 01:00:23 -0500\r
  41 Message-Id: <1293084024-7893-2-git-send-email-amdragon@mit.edu>\r
  42 X-Mailer: git-send-email 1.7.2.3\r
  43 In-Reply-To: <1293084024-7893-1-git-send-email-amdragon@mit.edu>\r
  44 References: <1293084024-7893-1-git-send-email-amdragon@mit.edu>\r
  45 MIME-Version: 1.0\r
  46 Content-Type: text/plain; charset=UTF-8\r
  47 Content-Transfer-Encoding: 8bit\r
  48 X-Brightmail-Tracker: AAAAARbzB9Y=\r
  49 Cc: amdragon@mit.edu\r
  50 X-BeenThere: notmuch@notmuchmail.org\r
  51 X-Mailman-Version: 2.1.13\r
  52 Precedence: list\r
  53 List-Id: "Use and development of the notmuch mail system."\r
  54         <notmuch.notmuchmail.org>\r
  55 List-Unsubscribe: <http://notmuchmail.org/mailman/options/notmuch>,\r
  56         <mailto:notmuch-request@notmuchmail.org?subject=unsubscribe>\r
  57 List-Archive: <http://notmuchmail.org/pipermail/notmuch>\r
  58 List-Post: <mailto:notmuch@notmuchmail.org>\r
  59 List-Help: <mailto:notmuch-request@notmuchmail.org?subject=help>\r
  60 List-Subscribe: <http://notmuchmail.org/mailman/listinfo/notmuch>,\r
  61         <mailto:notmuch-request@notmuchmail.org?subject=subscribe>\r
  62 X-List-Received-Date: Thu, 23 Dec 2010 06:01:02 -0000\r
  63 \r
  64 This parser takes an extra step through an intermediate representation\r
  65 that is convenient to manipulate via pluggable transformation passes.\r
  66 These are used to implement regular Xapian-style query prefixes, but\r
  67 are flexible enough to accomplish far more.\r
  68 ---\r
  69  lib/Makefile.local     |    1 +\r
  70  lib/database-private.h |    6 +\r
  71  lib/notmuch-private.h  |  126 +++++++\r
  72  lib/qparser.cc         |  941 ++++++++++++++++++++++++++++++++++++++++++++++++\r
  73  4 files changed, 1074 insertions(+), 0 deletions(-)\r
  74  create mode 100644 lib/qparser.cc\r
  75 \r
  76 diff --git a/lib/Makefile.local b/lib/Makefile.local\r
  77 index 37d1c0d..37d3735 100644\r
  78 --- a/lib/Makefile.local\r
  79 +++ b/lib/Makefile.local\r
  80 @@ -64,6 +64,7 @@ libnotmuch_cxx_srcs =         \\r
  81         $(dir)/index.cc         \\r
  82         $(dir)/message.cc       \\r
  83         $(dir)/query.cc         \\r
  84 +       $(dir)/qparser.cc       \\r
  85         $(dir)/thread.cc\r
  86  \r
  87  libnotmuch_modules = $(libnotmuch_c_srcs:.c=.o) $(libnotmuch_cxx_srcs:.cc=.o)\r
  88 diff --git a/lib/database-private.h b/lib/database-private.h\r
  89 index 9f83407..358a71b 100644\r
  90 --- a/lib/database-private.h\r
  91 +++ b/lib/database-private.h\r
  92 @@ -64,6 +64,12 @@ _notmuch_get_terms_with_prefix (void *ctx, Xapian::TermIterator &i,\r
  93                                 Xapian::TermIterator &end,\r
  94                                 const char *prefix);\r
  95  \r
  96 +/* qparser.cc */\r
  97 +\r
  98 +/* Generate a Xapian query from a query AST. */\r
  99 +Xapian::Query\r
 100 +_notmuch_qparser_generate (_notmuch_qparser_t *qparser, _notmuch_token_t *root);\r
 101 +\r
 102  #pragma GCC visibility pop\r
 103  \r
 104  #endif\r
 105 diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h\r
 106 index b6f1095..be76c0c 100644\r
 107 --- a/lib/notmuch-private.h\r
 108 +++ b/lib/notmuch-private.h\r
 109 @@ -508,6 +508,132 @@ notmuch_filenames_t *\r
 110  _notmuch_filenames_create (const void *ctx,\r
 111                            notmuch_string_list_t *list);\r
 112  \r
 113 +/* qparser.cc */\r
 114 +\r
 115 +typedef struct _notmuch_qparser _notmuch_qparser_t;\r
 116 +\r
 117 +enum _notmuch_token_type {\r
 118 +    /* These first four token types appear only in the lexer output\r
 119 +     * and never in the parse tree. */\r
 120 +    TOK_LOVE, TOK_HATE, TOK_BRA, TOK_KET,\r
 121 +    /* Binary operators.  These should have left and right children.\r
 122 +     * ADJ and NEAR should have a distance. */\r
 123 +    TOK_AND, TOK_OR, TOK_XOR, TOK_ADJ, TOK_NEAR,\r
 124 +    /* Unary operators.  These have only a left child.  Xapian::Query\r
 125 +     * has no pure NOT operator, so the generator treats NOT as the\r
 126 +     * child of an AND specially, and otherwise represents it as\r
 127 +     * "<all> AND_NOT x".  FILTER ignores the weights of the subquery\r
 128 +     * and generates Xapian::Query::OP_FILTER if the left child of an\r
 129 +     * AND or Xapian::Query::OP_SCALE_WEIGHT otherwise.  The text\r
 130 +     * field of a PREFIX operator specifies the prefix.  PREFIX\r
 131 +     * operators specify only syntactic prefixes, not database\r
 132 +     * prefixes, and thus have no effect on the generated query. */\r
 133 +    TOK_NOT, TOK_FILTER, TOK_PREFIX,\r
 134 +    /* A single TOK_TERMS token can represent a single term, a quoted\r
 135 +     * phrase, or an implicit phrase.  An implicit phrase is something\r
 136 +     * like "foo/bar", for which the database contains two separate\r
 137 +     * terms, but you want to treat them as a phrase, even though it's\r
 138 +     * not quoted.  Xapian calls characters that implicitly connect\r
 139 +     * terms into phrases "phrase generators."  We take a simpler\r
 140 +     * approach and treat almost any non-whitespace character as a\r
 141 +     * phrase generator. */\r
 142 +    TOK_TERMS,\r
 143 +    /* Like TOK_TERMS, but the term text should be taken literally,\r
 144 +     * with no phrase splitting or whitespace removal.  The lexer\r
 145 +     * only generates TOK_TERMS; the parser creates TOK_LIT. */\r
 146 +    TOK_LIT,\r
 147 +    /* TOK_END indicates the end of the token list.  Such tokens loop\r
 148 +     * back on themselves so it's always safe to follow "next".\r
 149 +     * These appear only in the lexer output. */\r
 150 +    TOK_END\r
 151 +};\r
 152 +\r
 153 +typedef struct _notmuch_token {\r
 154 +    enum _notmuch_token_type type;\r
 155 +    const char *text;\r
 156 +\r
 157 +    /* For TOK_ADJ and TOK_NEAR, this specifies the distance\r
 158 +     * argument. */\r
 159 +    int distance;\r
 160 +\r
 161 +    /* For TOK_PREFIX, the flags of this prefix. */\r
 162 +    int prefixFlags;\r
 163 +\r
 164 +    /* For TOK_TERMS and TOK_LIT, the database prefix to use when\r
 165 +     * generating database terms.  This must be filled in a\r
 166 +     * transformation pass. */\r
 167 +    char *prefix;\r
 168 +\r
 169 +    /* Link in the lexer token list. */\r
 170 +    struct _notmuch_token *next;\r
 171 +\r
 172 +    /* Links in the intermediate AST. */\r
 173 +    struct _notmuch_token *left, *right;\r
 174 +} _notmuch_token_t;\r
 175 +\r
 176 +_notmuch_token_t *\r
 177 +_notmuch_token_create (const void *ctx, enum _notmuch_token_type type,\r
 178 +                      const char *text);\r
 179 +\r
 180 +char *\r
 181 +_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok);\r
 182 +\r
 183 +char *\r
 184 +_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root);\r
 185 +\r
 186 +_notmuch_qparser_t *\r
 187 +_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch);\r
 188 +\r
 189 +/* Add a syntactic prefix.  This will appear as a TOK_PREFIX in the\r
 190 + * AST, but does not alone affect the final query.\r
 191 + *\r
 192 + * The literal flag affects lexing.  If true, this prefix must be\r
 193 + * followed by a regular term or quoted literal, which will not be\r
 194 + * stripped of whitespace or split in to a phrase.  The boolean flag\r
 195 + * affects parsing.  If true, then terms with this prefix will be\r
 196 + * combined into the query using the FILTER operator, so they must\r
 197 + * appear in the result and will not contribute to weights.  Xapian's\r
 198 + * "boolean prefixes" are both literal and boolean.\r
 199 + */\r
 200 +void\r
 201 +_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser,\r
 202 +                            const char *prefix, notmuch_bool_t literal,\r
 203 +                            notmuch_bool_t boolean);\r
 204 +\r
 205 +/* Add a transform pass to a query parser.  The transform function\r
 206 + * will be called with the root of the AST and should return a new AST\r
 207 + * root (which may be the same as the old root).\r
 208 + */\r
 209 +void\r
 210 +_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser,\r
 211 +                               _notmuch_token_t *(*transform) (\r
 212 +                                   _notmuch_token_t *ast, void *opaque),\r
 213 +                               void *opaque);\r
 214 +\r
 215 +/* Add a syntactic prefix (field) and a transform pass to transform\r
 216 + * that syntactic prefix into a database prefix (prefix).  This\r
 217 + * corresponds to Xapian's add_prefix and add_boolean_prefix\r
 218 + * functions. */\r
 219 +void\r
 220 +_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser,\r
 221 +                               const char *field, const char *prefix,\r
 222 +                               notmuch_bool_t boolean);\r
 223 +\r
 224 +/* Lex a query string, returning the first token in the token list.\r
 225 + * This is only meant for testing. */\r
 226 +_notmuch_token_t *\r
 227 +_notmuch_qparser_lex (_notmuch_qparser_t *qparser, const char *query);\r
 228 +\r
 229 +/* Parse a query string, returning the root of the AST. */\r
 230 +_notmuch_token_t *\r
 231 +_notmuch_qparser_parse (_notmuch_qparser_t *qparser, const char *query);\r
 232 +\r
 233 +/* Transform a parsed query, running the transforms in the order they\r
 234 + * were added to the query parser.  Return the root of the transformed\r
 235 + * AST. */\r
 236 +_notmuch_token_t *\r
 237 +_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root);\r
 238 +\r
 239  #pragma GCC visibility pop\r
 240  \r
 241  NOTMUCH_END_DECLS\r
 242 diff --git a/lib/qparser.cc b/lib/qparser.cc\r
 243 new file mode 100644\r
 244 index 0000000..4804d06\r
 245 --- /dev/null\r
 246 +++ b/lib/qparser.cc\r
 247 @@ -0,0 +1,941 @@\r
 248 +/* qparser.cc - Notmuch query parser\r
 249 + *\r
 250 + * Copyright © 2010 Austin Clements\r
 251 + *\r
 252 + * This program is free software: you can redistribute it and/or modify\r
 253 + * it under the terms of the GNU General Public License as published by\r
 254 + * the Free Software Foundation, either version 3 of the License, or\r
 255 + * (at your option) any later version.\r
 256 + *\r
 257 + * This program is distributed in the hope that it will be useful,\r
 258 + * but WITHOUT ANY WARRANTY; without even the implied warranty of\r
 259 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
 260 + * GNU General Public License for more details.\r
 261 + *\r
 262 + * You should have received a copy of the GNU General Public License\r
 263 + * along with this program.  If not, see http://www.gnu.org/licenses/ .\r
 264 + *\r
 265 + * Author: Austin Clements <amdragon@mit.edu>\r
 266 + */\r
 267 +\r
 268 +/*\r
 269 + * Query parsing is performed in a series of phases similar to those\r
 270 + * of a traditional compiler.\r
 271 + *\r
 272 + * 1) We tokenize the query, identifying operators and term phrases.\r
 273 + *    Note that a phrase (quoted or implicit) generates a single token\r
 274 + *    at this point, which we split up later (unlike in the Xapian\r
 275 + *    lexer).\r
 276 + * 2) We parse the token stream, generating an intermediate\r
 277 + *    representation in the form of a binary AST.  This IR is similar\r
 278 + *    to the Xapian::Query operators, but is designed to be more\r
 279 + *    easily manipulated.\r
 280 + * 3) We transform the parse tree, running a sequence of\r
 281 + *    caller-provided transformation functions over the tree.\r
 282 + * 4) We generate the Xapian::Query from the transformed IR.  This\r
 283 + *    step also splits phrase tokens into multiple query terms.\r
 284 + *    XXX and expands wildcard terms.\r
 285 + *\r
 286 + * To use the query parser, call _notmuch_qparser_parse to perform\r
 287 + * steps 1 and 2, _notmuch_qparser_transform to perform step 3, and\r
 288 + * _notmuch_qparser_generate to perform step 4.\r
 289 + *\r
 290 + * Still missing from this implementation:\r
 291 + * * Stemming - The stemming should probably be marked on TOK_TERMS\r
 292 + *   tokens.  Ideally, we can just pass this to the term generator.\r
 293 + * * Wildcard queries - This should be available in the IR so it's\r
 294 + *   easy to generate wildcard queries in a transformer.\r
 295 + * * Value ranges in the IR\r
 296 + * * Queries "" and "*"\r
 297 + * * Memory management is poorly suited to reusing qparser's.\r
 298 + */\r
 299 +\r
 300 +/* XXX notmuch currently registers "tag" as an exclusive boolean\r
 301 + * prefix, which means queries like "tag:x tag:y" will return messages\r
 302 + * with tag x OR tag y.  Is this intentional? */\r
 303 +\r
 304 +#include "notmuch-private.h"\r
 305 +#include "database-private.h"\r
 306 +\r
 307 +#include <glib.h>              /* GHashTable */\r
 308 +\r
 309 +struct _notmuch_qparser {\r
 310 +    notmuch_database_t *notmuch;\r
 311 +\r
 312 +    /* Maps from prefix strings to PREFIX_* flags */\r
 313 +    GHashTable *prefixes;\r
 314 +\r
 315 +    struct _notmuch_qparser_transform *transforms;\r
 316 +    struct _notmuch_qparser_transform **transformsTail;\r
 317 +};\r
 318 +\r
 319 +enum {\r
 320 +    PREFIX_LITERAL = 1<<1,\r
 321 +    PREFIX_PROB    = 1<<2,\r
 322 +    PREFIX_BOOL    = 1<<3,\r
 323 +};\r
 324 +\r
 325 +struct _notmuch_qparser_transform {\r
 326 +    struct _notmuch_qparser_transform *next;\r
 327 +    _notmuch_token_t *(*transform) (_notmuch_token_t *ast, void *opaque);\r
 328 +    void *opaque;\r
 329 +};\r
 330 +\r
 331 +struct _notmuch_lexer_state {\r
 332 +    _notmuch_qparser_t *qparser;\r
 333 +    _notmuch_token_t *head;\r
 334 +    _notmuch_token_t **tail;\r
 335 +};\r
 336 +\r
 337 +struct _notmuch_generate_state {\r
 338 +    _notmuch_qparser_t *qparser;\r
 339 +    unsigned int termpos;\r
 340 +};\r
 341 +\r
 342 +static const char *token_types[] = {\r
 343 +    "LOVE", "HATE", "BRA", "KET",\r
 344 +    "AND", "OR", "XOR", "ADJ", "NEAR",\r
 345 +    "NOT", "FILTER", "PREFIX",\r
 346 +    "TERMS", "LIT", "END"\r
 347 +};\r
 348 +\r
 349 +/* The distinguished end token.  This simplifies the parser since it\r
 350 + * never has to worry about dereferencing next. */\r
 351 +static _notmuch_token_t tok_end = {TOK_END, NULL, -1, FALSE, NULL,\r
 352 +                                  &tok_end, NULL, NULL};\r
 353 +\r
 354 +_notmuch_token_t *\r
 355 +_notmuch_token_create (const void *ctx, enum _notmuch_token_type type,\r
 356 +                      const char *text)\r
 357 +{\r
 358 +    struct _notmuch_token *tok = talloc (ctx, struct _notmuch_token);\r
 359 +    tok->type = type;\r
 360 +    tok->text = text;\r
 361 +    if (type == TOK_ADJ || type == TOK_NEAR)\r
 362 +       tok->distance = 10;\r
 363 +    else\r
 364 +       tok->distance = -1;\r
 365 +    tok->prefixFlags = 0;\r
 366 +    tok->prefix = NULL;\r
 367 +    tok->left = tok->right = NULL;\r
 368 +    return tok;\r
 369 +}\r
 370 +\r
 371 +char *\r
 372 +_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok)\r
 373 +{\r
 374 +    char *out = talloc_strdup (ctx, "");\r
 375 +\r
 376 +    for (; tok->type != TOK_END; tok = tok->next) {\r
 377 +       out = talloc_asprintf_append (out, "%s%s",\r
 378 +                                     *out == 0 ? "" : " ",\r
 379 +                                     token_types[tok->type]);\r
 380 +       if (tok->distance != -1)\r
 381 +           out = talloc_asprintf_append (out, "/%d", tok->distance);\r
 382 +       if (tok->text)\r
 383 +           out = talloc_asprintf_append (out, ":%s", tok->text);\r
 384 +    }\r
 385 +\r
 386 +    return out;\r
 387 +}\r
 388 +\r
 389 +char *\r
 390 +_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root)\r
 391 +{\r
 392 +    if (!root) {\r
 393 +       return talloc_strdup (ctx, "<nil>");\r
 394 +    } else if (root->type == TOK_TERMS || root->type == TOK_LIT) {\r
 395 +       return talloc_strdup (ctx, root->text);\r
 396 +    } else {\r
 397 +       char *out = talloc_asprintf (ctx, "(%s", token_types[root->type]);\r
 398 +       void *local = talloc_new (ctx);\r
 399 +       if (root->type == TOK_PREFIX)\r
 400 +           out = talloc_asprintf_append (out, "/%s", root->text);\r
 401 +       if (root->left)\r
 402 +           out = talloc_asprintf_append\r
 403 +               (out, " %s", _notmuch_token_show_tree (local, root->left));\r
 404 +       if (root->right)\r
 405 +           out = talloc_asprintf_append\r
 406 +               (out, " %s", _notmuch_token_show_tree (local, root->right));\r
 407 +       out = talloc_asprintf_append (out, ")");\r
 408 +       talloc_free (local);\r
 409 +       return out;\r
 410 +    }\r
 411 +}\r
 412 +\r
 413 +using Xapian::Unicode::is_whitespace;\r
 414 +using Xapian::Unicode::is_wordchar;\r
 415 +\r
 416 +static Xapian::Utf8Iterator\r
 417 +lex_skip_ws (Xapian::Utf8Iterator it)\r
 418 +{\r
 419 +    while (is_whitespace (*it))\r
 420 +       it++;\r
 421 +    return it;\r
 422 +}\r
 423 +\r
 424 +static struct _notmuch_token *\r
 425 +lex_emit (struct _notmuch_lexer_state *s, enum _notmuch_token_type type,\r
 426 +         char *text)\r
 427 +{\r
 428 +    struct _notmuch_token *tok = _notmuch_token_create (s, type, text);\r
 429 +    tok->next = &tok_end;\r
 430 +    *(s->tail) = tok;\r
 431 +    s->tail = &tok->next;\r
 432 +    return tok;\r
 433 +}\r
 434 +\r
 435 +\r
 436 +/* Lex a quoted phrase, returning an iterator pointing to the\r
 437 + * character following the closing quote.  If escaped, then accept two\r
 438 + * quotes as an escaped version of a single quote.\r
 439 + */\r
 440 +static Xapian::Utf8Iterator\r
 441 +lex_quoted_phrase (struct _notmuch_lexer_state *s,\r
 442 +                  Xapian::Utf8Iterator it, notmuch_bool_t escaped)\r
 443 +{\r
 444 +    Xapian::Utf8Iterator next, orig, end;\r
 445 +    char *term, *src, *dst;\r
 446 +\r
 447 +    /* Find the end of the phrase */\r
 448 +    assert (*(it++) == '"');\r
 449 +    for (next = it; next != end; ++next) {\r
 450 +       if (*next == '"') {\r
 451 +           if (!escaped)\r
 452 +               break;\r
 453 +\r
 454 +           orig = next++;\r
 455 +           if (next == end || *next != '"') {\r
 456 +               next = orig;\r
 457 +               break;\r
 458 +           }\r
 459 +       }\r
 460 +    }\r
 461 +\r
 462 +    /* Xapian still lexes +/-/( in quotes mode and simply doesn't\r
 463 +     * generate tokens for them.  For us, the term generator will\r
 464 +     * discard them. */\r
 465 +    term = talloc_strndup (s, it.raw (), next.raw () - it.raw ());\r
 466 +    if (escaped) {\r
 467 +       /* Replace doubled quotes with a single quote. */\r
 468 +       for (src = dst = term; *src; ++src, ++dst) {\r
 469 +           *dst = *src;\r
 470 +           if (*src == '"')\r
 471 +               ++src;\r
 472 +       }\r
 473 +       *dst = '\0';\r
 474 +    }\r
 475 +    lex_emit (s, TOK_TERMS, term);\r
 476 +\r
 477 +    if (next != end)\r
 478 +       ++next;\r
 479 +    return next;\r
 480 +}\r
 481 +\r
 482 +static Xapian::Utf8Iterator\r
 483 +lex_try_consume_prefix (_notmuch_qparser_t *qparser, Xapian::Utf8Iterator it,\r
 484 +                       char **prefixOut, int *flagsOut)\r
 485 +{\r
 486 +    Xapian::Utf8Iterator next (it), end;\r
 487 +    char *prefix;\r
 488 +    int flags;\r
 489 +\r
 490 +    *prefixOut = NULL;\r
 491 +    while (next != end && *next != ':' && !is_whitespace (*next))\r
 492 +       ++next;\r
 493 +    if (*next != ':')\r
 494 +       return it;\r
 495 +    /* Ignore if followed by <= ' ' or ')' */\r
 496 +    ++next;\r
 497 +    if (*next <= ' ' || *next == ')')\r
 498 +       return it;\r
 499 +\r
 500 +    prefix = talloc_strndup (qparser, it.raw (), next.raw () - it.raw() - 1);\r
 501 +    flags = GPOINTER_TO_INT (g_hash_table_lookup (qparser->prefixes, prefix));\r
 502 +    if (!flags) {\r
 503 +       /* Not a known prefix */\r
 504 +       talloc_free (prefix);\r
 505 +       return it;\r
 506 +    }\r
 507 +    *prefixOut = prefix;\r
 508 +    *flagsOut = flags;\r
 509 +    return next;\r
 510 +}\r
 511 +\r
 512 +static Xapian::Utf8Iterator\r
 513 +lex_consume_term (const void *ctx, Xapian::Utf8Iterator it, char **termOut)\r
 514 +{\r
 515 +    Xapian::Utf8Iterator next (it), end;\r
 516 +    /* Xapian permits other characters to separate term phrases.  For\r
 517 +     * example, "x#y" is parsed as two separate (non-phrase) terms.\r
 518 +     * However, because the characters allowed in a term are\r
 519 +     * context-sensitive, replicating this is very hard.  Here we take\r
 520 +     * a simpler approach where only whitespace and a few operator\r
 521 +     * characters that are never term characters separate terms. */\r
 522 +    while (next != end && !strchr ("()\"", *next) && !is_whitespace (*next))\r
 523 +       ++next;\r
 524 +    *termOut = talloc_strndup (ctx, it.raw (), next.raw () - it.raw ());\r
 525 +    return next;\r
 526 +}\r
 527 +\r
 528 +static notmuch_bool_t\r
 529 +lex_operator (struct _notmuch_lexer_state *s, char *term,\r
 530 +             const char *op, enum _notmuch_token_type type)\r
 531 +{\r
 532 +    size_t oplen = strlen (op);\r
 533 +\r
 534 +    if (strcasecmp (term, op) == 0) {\r
 535 +       lex_emit (s, type, term);\r
 536 +       return true;\r
 537 +    }\r
 538 +\r
 539 +    /* Check for ADJ or NEAR with argument.  Our parsing of this is\r
 540 +     * slightly incompatible with Xapian, but I believe this to be a\r
 541 +     * bug in Xapian.  Xapian parses "x NEAR/y z" as three term\r
 542 +     * phrases, "x", "near y", and "z", like we do.  However, it\r
 543 +     * behaves differently if the bad NEAR operator is at the end of\r
 544 +     * the query, parsing "x NEAR/y" like "x NEAR y". */\r
 545 +    if ((type == TOK_ADJ || type == TOK_NEAR) &&\r
 546 +       strncasecmp (term, op, oplen) == 0 &&\r
 547 +       term[oplen] == '/') {\r
 548 +       /* Try to parse the distance argument */\r
 549 +       char *end;\r
 550 +       int distance = strtol (&term[oplen + 1], &end, 10);\r
 551 +       if (distance && !*end) {\r
 552 +           struct _notmuch_token *tok = lex_emit (s, type, term);\r
 553 +           tok->distance = distance;\r
 554 +           return true;\r
 555 +       }\r
 556 +    }\r
 557 +    \r
 558 +    return false;\r
 559 +}\r
 560 +\r
 561 +static _notmuch_token_t *\r
 562 +lex (_notmuch_qparser_t *qparser, const char *query)\r
 563 +{\r
 564 +    Xapian::Utf8Iterator it (query), next, end;\r
 565 +    struct _notmuch_lexer_state *s;\r
 566 +    struct _notmuch_token *tok;\r
 567 +    char *term;\r
 568 +    int prefixFlags;\r
 569 +\r
 570 +    s = talloc (qparser, struct _notmuch_lexer_state);\r
 571 +    if (!s)\r
 572 +       return NULL;\r
 573 +    s->qparser = qparser;\r
 574 +    s->head = &tok_end;\r
 575 +    s->tail = &s->head;\r
 576 +\r
 577 +    while (it != end) {\r
 578 +       unsigned ch;\r
 579 +       if ((it = lex_skip_ws (it)) == end)\r
 580 +           break;\r
 581 +\r
 582 +       ch = *it;\r
 583 +       switch (ch) {\r
 584 +       case '+':\r
 585 +       case '-':\r
 586 +           ++it;\r
 587 +           /* Xapian ignores these unless preceded by whitespace or\r
 588 +            * an open paren, which has the effect of ignoring all\r
 589 +            * +'s in "x +++y", "x#+y", and "(x)+y".  We don't\r
 590 +            * bother. */\r
 591 +\r
 592 +           /* Ignore if followed by a space or another + or - */\r
 593 +           if (is_whitespace (*it) || *it == '+' || *it == '-')\r
 594 +               continue;\r
 595 +           lex_emit (s, ch == '+' ? TOK_LOVE : TOK_HATE, NULL);\r
 596 +           continue;\r
 597 +\r
 598 +       case '"':\r
 599 +           it = lex_quoted_phrase(s, it, false);\r
 600 +           continue;\r
 601 +\r
 602 +       case '(':\r
 603 +           ++it;\r
 604 +           /* Xapian ignores this unless preceded by whitespace,\r
 605 +            * parens, +, or -.  We don't bother. */\r
 606 +           lex_emit (s, TOK_BRA, NULL);\r
 607 +           continue;\r
 608 +\r
 609 +       case ')':\r
 610 +           ++it;\r
 611 +           lex_emit (s, TOK_KET, NULL);\r
 612 +           continue;\r
 613 +       }\r
 614 +\r
 615 +       /* Scan for a prefix */\r
 616 +       next = lex_try_consume_prefix (qparser, it, &term, &prefixFlags);\r
 617 +       if (term && (*next == '"' || *next == '(' || is_wordchar (*next))) {\r
 618 +           int literal = prefixFlags & PREFIX_LITERAL;\r
 619 +           tok = lex_emit (s, TOK_PREFIX, term);\r
 620 +           tok->prefixFlags = prefixFlags;\r
 621 +\r
 622 +           it = next;\r
 623 +           if (literal && *it == '"') {\r
 624 +               /* Literal quoted strings keep everything and allow\r
 625 +                * quote escaping, unlike regular quoted phrases. */\r
 626 +               it = lex_quoted_phrase (s, it, true);\r
 627 +           } else if (literal) {\r
 628 +               /* Xapian uses anything up to the next space or ')'\r
 629 +                * (because literal prefixes can't be applied to\r
 630 +                * subqueries).  I disagree with Xapian here, since\r
 631 +                * Xapian will keep the open paren but not the close\r
 632 +                * paren.  Better would be to balance them. */\r
 633 +               if (*next == '(')\r
 634 +                   ++next;\r
 635 +               while (next != end && *next > ' ' && *next != ')')\r
 636 +                   ++next;\r
 637 +               term = talloc_strndup (s, it.raw (),\r
 638 +                                      next.raw () - it.raw ());\r
 639 +               lex_emit (s, TOK_TERMS, term);\r
 640 +               it = next;\r
 641 +           }\r
 642 +           /* For quoted strings and subqueries following non-literal\r
 643 +            * prefixes and regular terms, we lex as usual. */\r
 644 +           continue;\r
 645 +       }\r
 646 +\r
 647 +       /* Scan for a term phrase or operator */\r
 648 +       it = lex_consume_term (s, it, &term);\r
 649 +\r
 650 +       /* Check operators */\r
 651 +       if (lex_operator (s, term, "and",  TOK_AND) ||\r
 652 +           lex_operator (s, term, "not",  TOK_NOT) ||\r
 653 +           lex_operator (s, term, "xor",  TOK_XOR) ||\r
 654 +           lex_operator (s, term, "or",   TOK_OR)  ||\r
 655 +           lex_operator (s, term, "adj",  TOK_ADJ) ||\r
 656 +           lex_operator (s, term, "near", TOK_NEAR))\r
 657 +           continue;\r
 658 +\r
 659 +       /* Must be a term */\r
 660 +       lex_emit (s, TOK_TERMS, term);\r
 661 +    }\r
 662 +\r
 663 +    return s->head;\r
 664 +}\r
 665 +\r
 666 +static void\r
 667 +add_to_query (const void *ctx, _notmuch_token_t **query,\r
 668 +             _notmuch_token_type op, _notmuch_token_t *right)\r
 669 +{\r
 670 +    if (!*query)\r
 671 +       *query = right;\r
 672 +    else if (right) {\r
 673 +       _notmuch_token_t *tok = _notmuch_token_create (ctx, op, NULL);\r
 674 +       tok->left = *query;\r
 675 +       tok->right = right;\r
 676 +       *query = tok;\r
 677 +    }\r
 678 +}\r
 679 +\r
 680 +static _notmuch_token_t *\r
 681 +parse_expr (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok);\r
 682 +static _notmuch_token_t *\r
 683 +parse_term (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok);\r
 684 +\r
 685 +static _notmuch_token_t *\r
 686 +parse_prob (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)\r
 687 +{\r
 688 +    /* A prob is a sequence of three types of subqueries.  Because the\r
 689 +     * default query operator is AND, loved terms are not treated\r
 690 +     * specially.\r
 691 +     * 1) Probabilistic terms (prefixed or not).  These are combined\r
 692 +     *    with the default query operator, AND.\r
 693 +     * 2) Terms with a boolean prefix.  All of the terms with the same\r
 694 +     *    prefix are combined with OR.  Different prefixes are\r
 695 +     *    combined with AND.\r
 696 +     * 3) Hate terms.  These are combined with OR.\r
 697 +     * The final IR looks like\r
 698 +     *   (probs AND (FILTER bools)) AND (NOT hates)\r
 699 +     */\r
 700 +\r
 701 +    _notmuch_token_t *probs, *hates, *sub, *q;\r
 702 +    GHashTable *bools;\r
 703 +    int done = 0;\r
 704 +\r
 705 +    probs = hates = NULL;\r
 706 +    bools = g_hash_table_new (g_str_hash, g_str_equal);\r
 707 +    while (!done) {\r
 708 +       switch ((*tok)->type) {\r
 709 +       case TOK_KET:\r
 710 +           if (prec < 10) {\r
 711 +               /* Unmatched close paren.  Ignore it. */\r
 712 +               *tok = (*tok)->next;\r
 713 +               break;\r
 714 +           }\r
 715 +           /* Fall through */\r
 716 +       case TOK_AND: case TOK_OR: case TOK_XOR: case TOK_NOT:\r
 717 +       case TOK_END:\r
 718 +           /* End of the prob.  Might be empty. */\r
 719 +           done = 1;\r
 720 +           break;\r
 721 +\r
 722 +       case TOK_HATE:\r
 723 +           *tok = (*tok)->next;\r
 724 +           sub = parse_expr (qparser, prec + 1, tok);\r
 725 +           add_to_query (qparser, &hates, TOK_OR, sub);\r
 726 +           break;\r
 727 +\r
 728 +       case TOK_PREFIX:\r
 729 +           sub = parse_expr (qparser, prec + 1, tok);\r
 730 +           if (!sub)\r
 731 +               break;\r
 732 +           if (sub->prefixFlags & PREFIX_PROB) {\r
 733 +               add_to_query (qparser, &probs, TOK_AND, sub);\r
 734 +           } else {\r
 735 +               _notmuch_token_t *newb, *pre = (_notmuch_token_t*)\r
 736 +                   g_hash_table_lookup (bools, sub->text);\r
 737 +               if (!pre) {\r
 738 +                   newb = sub;\r
 739 +               } else {\r
 740 +                   /* OR subqueries with same prefix */\r
 741 +                   newb = _notmuch_token_create (qparser, TOK_OR, NULL);\r
 742 +                   newb->left = pre;\r
 743 +                   newb->right = sub;\r
 744 +               }\r
 745 +               g_hash_table_insert (bools, (void*)sub->text, newb);\r
 746 +           }\r
 747 +           break;\r
 748 +\r
 749 +       case TOK_LOVE:\r
 750 +           /* Join into the query like any other term, since the\r
 751 +            * default operator is AND anyway. */\r
 752 +           *tok = (*tok)->next;\r
 753 +           /* Fall through */\r
 754 +       case TOK_BRA:\r
 755 +       case TOK_TERMS:\r
 756 +       case TOK_LIT:\r
 757 +           sub = parse_expr (qparser, prec + 1, tok);\r
 758 +           add_to_query (qparser, &probs, TOK_AND, sub);\r
 759 +           break;\r
 760 +\r
 761 +       case TOK_ADJ:\r
 762 +       case TOK_NEAR:\r
 763 +       case TOK_FILTER:\r
 764 +           /* XXX Can ADJ or NEAR happen? */\r
 765 +           INTERNAL_ERROR ("Unexpected token type %d", (*tok)->type);\r
 766 +       }\r
 767 +    }\r
 768 +\r
 769 +    q = probs;\r
 770 +    if (g_hash_table_size (bools)) {\r
 771 +       /* Merge boolean filters */\r
 772 +       _notmuch_token_t *filter;\r
 773 +       GList *vals = g_hash_table_get_values (bools), *l;\r
 774 +       sub = NULL;\r
 775 +       for (l = vals; l; l = l->next)\r
 776 +           add_to_query (qparser, &sub, TOK_AND, (_notmuch_token_t *) l->data);\r
 777 +       g_list_free (vals);\r
 778 +\r
 779 +       /* Create filter */\r
 780 +       filter = _notmuch_token_create (qparser, TOK_FILTER, NULL);\r
 781 +       filter->left = sub;\r
 782 +       add_to_query (qparser, &q, TOK_AND, filter);\r
 783 +    }\r
 784 +    if (hates) {\r
 785 +       sub = _notmuch_token_create (qparser, TOK_NOT, NULL);\r
 786 +       sub->left = hates;\r
 787 +       add_to_query (qparser, &q, TOK_AND, sub);\r
 788 +    }\r
 789 +    g_hash_table_unref (bools);\r
 790 +    return q;\r
 791 +}\r
 792 +\r
 793 +static _notmuch_token_t *\r
 794 +parse_term (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)\r
 795 +{\r
 796 +    _notmuch_token_t *sub;\r
 797 +\r
 798 +    if ((*tok)->type == TOK_END) {\r
 799 +       /* Arises from things like "x -".  Ignore. */\r
 800 +       return NULL;\r
 801 +    } else if ((*tok)->type == TOK_PREFIX) {\r
 802 +       sub = *tok;\r
 803 +       *tok = (*tok)->next;\r
 804 +       sub->left = parse_term (qparser, prec, tok);\r
 805 +       if (!sub->left)\r
 806 +           return NULL;\r
 807 +       if (sub->prefixFlags & PREFIX_LITERAL) {\r
 808 +           /* Convert TOK_TERMS to TOK_LIT */\r
 809 +           assert (sub->left->type == TOK_TERMS);\r
 810 +           sub->left->type = TOK_LIT;\r
 811 +       } else if (sub->left->type == TOK_PREFIX) {\r
 812 +           sub->left = sub->left->left;\r
 813 +       }\r
 814 +       return sub;\r
 815 +    } else if ((*tok)->type == TOK_BRA) {\r
 816 +       *tok = (*tok)->next;\r
 817 +       sub = parse_expr (qparser, prec + 10 - (prec%10), tok);\r
 818 +       if ((*tok)->type == TOK_KET)\r
 819 +           *tok = (*tok)->next;\r
 820 +       return sub;\r
 821 +    }\r
 822 +\r
 823 +    if ((*tok)->type != TOK_TERMS && (*tok)->type != TOK_LIT) {\r
 824 +       /* Arises from "+AND", "-AND", "prob:AND".  We could give up\r
 825 +        * and return nothing, but it seems nicer to treat the\r
 826 +        * operator as a term if it came from the original query. */\r
 827 +       if (!(*tok)->text)\r
 828 +           return NULL;\r
 829 +       (*tok)->type = TOK_TERMS;\r
 830 +    }\r
 831 +\r
 832 +    sub = *tok;\r
 833 +    *tok = (*tok)->next;\r
 834 +    return sub;\r
 835 +}\r
 836 +\r
 837 +static _notmuch_token_t *\r
 838 +parse_expr (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)\r
 839 +{\r
 840 +    /* If you squint at the Xapian grammar, it's a precedence grammar\r
 841 +     * with one strange "prob" level.  This implements all but the\r
 842 +     * prob level and the leaf "term" level.\r
 843 +     *\r
 844 +     * prec is (nesting level * 10 + precedence level).  Normally we\r
 845 +     * only care about the precedence level, but the nesting level is\r
 846 +     * important for recovering from unbalanced parens.\r
 847 +     */\r
 848 +    int bprec = prec % 10;\r
 849 +    if (bprec == 3) {\r
 850 +       if ((*tok)->type == TOK_NOT) {\r
 851 +           /* Unary NOT */\r
 852 +           _notmuch_token_t *root = *tok;\r
 853 +           *tok = (*tok)->next;\r
 854 +           root->left = parse_expr (qparser, prec, tok);\r
 855 +           return root;\r
 856 +       }\r
 857 +\r
 858 +       return parse_prob (qparser, prec, tok);\r
 859 +    }\r
 860 +    if (bprec == 5)\r
 861 +       return parse_term (qparser, prec, tok);\r
 862 +\r
 863 +    _notmuch_token_t *left = parse_expr (qparser, prec + 1, tok);\r
 864 +    while ((bprec == 0 && (*tok)->type == TOK_OR) ||\r
 865 +          (bprec == 1 && (*tok)->type == TOK_XOR) ||\r
 866 +          (bprec == 2 && ((*tok)->type == TOK_AND ||\r
 867 +                          (*tok)->type == TOK_NOT)) ||\r
 868 +          (bprec == 4 && ((*tok)->type == TOK_NEAR ||\r
 869 +                          (*tok)->type == TOK_ADJ))) {\r
 870 +       _notmuch_token_t *root = *tok;\r
 871 +       if (root->type == TOK_NOT) {\r
 872 +           /* Replace "x NOT y" with (AND x (NOT y)) by inserting an\r
 873 +            * AND operator and leaning on the unary NOT rule. */\r
 874 +           root = _notmuch_token_create (qparser, TOK_AND, NULL);\r
 875 +       } else {\r
 876 +           *tok = (*tok)->next;\r
 877 +       }\r
 878 +\r
 879 +       /* Xapian treats x AND -y as x AND NOT y, which affects\r
 880 +        * precedence. */\r
 881 +       if (root->type == TOK_AND && (*tok)->type == TOK_HATE)\r
 882 +           (*tok)->type = TOK_NOT;\r
 883 +\r
 884 +       _notmuch_token_t *right = parse_expr (qparser, prec + 1, tok);\r
 885 +       if (left && right) {\r
 886 +           root->left = left;\r
 887 +           root->right = right;\r
 888 +       } else {\r
 889 +           /* Left or right was empty.  This may be a syntax error\r
 890 +            * like an omitted expression, or an empty expression. */\r
 891 +           root = left ? left : right;\r
 892 +       }\r
 893 +       left = root;\r
 894 +    }\r
 895 +    return left;\r
 896 +}\r
 897 +\r
 898 +static _notmuch_token_t *\r
 899 +parse (_notmuch_qparser_t *qparser, _notmuch_token_t *toks)\r
 900 +{\r
 901 +    _notmuch_token_t *root = parse_expr (qparser, 0, &toks);\r
 902 +    if (toks->type != TOK_END)\r
 903 +       INTERNAL_ERROR ("Token stream not fully consumed: %s",\r
 904 +                       _notmuch_token_show_list (qparser, toks));\r
 905 +    return root;\r
 906 +}\r
 907 +\r
 908 +static char *\r
 909 +generate_term (const void *ctx, const char *term, const char *prefix)\r
 910 +{\r
 911 +    notmuch_bool_t colon = FALSE;\r
 912 +    if (isupper (term[0]) && strlen (prefix) > 1)\r
 913 +       colon = TRUE;\r
 914 +    return talloc_asprintf (ctx, "%s%s%s", prefix, colon ? ":" : "", term);\r
 915 +}\r
 916 +\r
 917 +static Xapian::Query\r
 918 +generate_terms (struct _notmuch_generate_state *s, const char *text,\r
 919 +               const char *prefix)\r
 920 +{\r
 921 +    Xapian::TermGenerator tg;\r
 922 +    Xapian::Document doc;\r
 923 +    Xapian::TermIterator it, end;\r
 924 +    Xapian::PositionIterator pit, pend;\r
 925 +    Xapian::Query *qs, q;\r
 926 +    unsigned int nterms = 0;\r
 927 +\r
 928 +    if (prefix)\r
 929 +       tg.index_text (text, 1, prefix);\r
 930 +    else\r
 931 +       tg.index_text (text);\r
 932 +    doc = tg.get_document ();\r
 933 +\r
 934 +    /* Find the highest positioned term.  Positions are 1-based. */\r
 935 +    end = doc.termlist_end ();\r
 936 +    for (it = doc.termlist_begin (); it != end; ++it) {\r
 937 +       pend = it.positionlist_end ();\r
 938 +       for (pit = it.positionlist_begin (); pit != pend; ++pit) {\r
 939 +           if (*pit > nterms)\r
 940 +               nterms = *pit;\r
 941 +       }\r
 942 +    }\r
 943 +    if (nterms == 0)\r
 944 +       return Xapian::Query ();\r
 945 +\r
 946 +    /* Extract terms */\r
 947 +    qs = new Xapian::Query[nterms];\r
 948 +    for (it = doc.termlist_begin (); it != end; ++it) {\r
 949 +       pend = it.positionlist_end ();\r
 950 +       for (pit = it.positionlist_begin (); pit != pend; ++pit)\r
 951 +           qs[*pit - 1] = Xapian::Query (*it, 1, s->termpos + *pit - 1);\r
 952 +    }\r
 953 +    s->termpos += nterms;\r
 954 +\r
 955 +    /* Build query */\r
 956 +    q = Xapian::Query (Xapian::Query::OP_PHRASE, qs, qs + nterms, nterms);\r
 957 +    delete [] qs;\r
 958 +    return q;\r
 959 +}\r
 960 +\r
 961 +static Xapian::Query\r
 962 +generate (struct _notmuch_generate_state *s, _notmuch_token_t *root)\r
 963 +{\r
 964 +    using Xapian::Query;\r
 965 +    Query l, r;\r
 966 +    Query::op op;\r
 967 +\r
 968 +    if (!root)\r
 969 +       return Query ();\r
 970 +\r
 971 +    /* The tricky part here is that generate is allowed to return a\r
 972 +     * empty query, indicating that the user's query cannot be\r
 973 +     * expressed.  For example, the term "#" is an empty query. */\r
 974 +\r
 975 +    switch (root->type) {\r
 976 +    case TOK_AND:\r
 977 +       if (root->left->type == TOK_NOT && root->right->type != TOK_NOT) {\r
 978 +           _notmuch_token_t *tmp = root->left;\r
 979 +           root->left = root->right;\r
 980 +           root->right = tmp;\r
 981 +       }\r
 982 +       l = generate (s, root->left);\r
 983 +       if (l.empty()) {\r
 984 +           return generate (s, root->right);\r
 985 +       } else if (root->right->type == TOK_NOT) {\r
 986 +           r = generate (s, root->right->left);\r
 987 +           op = Query::OP_AND_NOT;\r
 988 +       } else if (root->right->type == TOK_FILTER) {\r
 989 +           r = generate (s, root->right->left);\r
 990 +           op = Query::OP_FILTER;\r
 991 +       } else {\r
 992 +           r = generate (s, root->right);\r
 993 +           op = Query::OP_AND;\r
 994 +       }\r
 995 +       if (r.empty())\r
 996 +           return l;\r
 997 +       return Query (op, l, r);\r
 998 +\r
 999 +    case TOK_NOT:\r
1000 +       l = generate (s, root->left);\r
1001 +       if (l.empty())\r
1002 +           return l;\r
1003 +       return Query (Query::OP_AND_NOT, Query ("", 1, 0), l);\r
1004 +\r
1005 +    case TOK_FILTER:\r
1006 +       l = generate(s, root->left);\r
1007 +       if (l.empty())\r
1008 +           return l;\r
1009 +       return Query (Query::OP_SCALE_WEIGHT, l, 0.0);\r
1010 +\r
1011 +    case TOK_OR:\r
1012 +    case TOK_XOR:\r
1013 +       l = generate (s, root->left);\r
1014 +       r = generate (s, root->right);\r
1015 +       if (l.empty())\r
1016 +           return r;\r
1017 +       if (r.empty())\r
1018 +           return l;\r
1019 +       return Query (root->type == TOK_OR ? Query::OP_OR : Query::OP_XOR,\r
1020 +                     l, r);\r
1021 +\r
1022 +    case TOK_ADJ:\r
1023 +    case TOK_NEAR:\r
1024 +    {\r
1025 +       /* XXX This differs from Xapian.  Xapian treats ADJ and NEAR\r
1026 +        * as n-ary operators and constructs the whole list of terms\r
1027 +        * to be searched within the largest specified window. */\r
1028 +       Query subs[2];\r
1029 +       subs[0] = Query (generate (s, root->left));\r
1030 +       subs[1] = Query (generate (s, root->right));\r
1031 +       if (subs[0].empty())\r
1032 +           return subs[1];\r
1033 +       if (subs[1].empty())\r
1034 +           return subs[0];\r
1035 +       return Query (root->type == TOK_ADJ ? Query::OP_PHRASE : Query::OP_NEAR,\r
1036 +                     subs, subs + 2, root->distance + 1);\r
1037 +    }\r
1038 +\r
1039 +    case TOK_PREFIX:\r
1040 +       return generate (s, root->left);\r
1041 +\r
1042 +    case TOK_TERMS:\r
1043 +       return generate_terms (s, root->text, root->prefix);\r
1044 +\r
1045 +    case TOK_LIT:\r
1046 +       return Query (generate_term (s, root->text, root->prefix));\r
1047 +\r
1048 +    case TOK_LOVE:\r
1049 +    case TOK_HATE:\r
1050 +    case TOK_BRA:\r
1051 +    case TOK_KET:\r
1052 +    case TOK_END:\r
1053 +       INTERNAL_ERROR ("Illegal token type %s in IR", token_types[root->type]);\r
1054 +    }\r
1055 +    /* We leave this outside the switch so the compiler will warn us\r
1056 +     * if we missed a token type. */\r
1057 +    INTERNAL_ERROR ("Illegal token type %d in IR", root->type);\r
1058 +    return Xapian::Query ();\r
1059 +}\r
1060 +\r
1061 +static int\r
1062 +_notmuch_qparser_destructor (_notmuch_qparser_t *qparser)\r
1063 +{\r
1064 +    g_hash_table_unref (qparser->prefixes);\r
1065 +    return 0;\r
1066 +}\r
1067 +\r
1068 +_notmuch_qparser_t *\r
1069 +_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch)\r
1070 +{\r
1071 +    _notmuch_qparser_t *qparser = talloc (ctx, _notmuch_qparser_t);\r
1072 +    if (!qparser)\r
1073 +       return NULL;\r
1074 +    qparser->prefixes = NULL;\r
1075 +    talloc_set_destructor (qparser, _notmuch_qparser_destructor);\r
1076 +\r
1077 +    qparser->notmuch = notmuch;\r
1078 +    qparser->prefixes = g_hash_table_new (g_str_hash, g_str_equal);\r
1079 +    qparser->transforms = NULL;\r
1080 +    qparser->transformsTail = &qparser->transforms;\r
1081 +\r
1082 +    return qparser;\r
1083 +}\r
1084 +\r
1085 +void\r
1086 +_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser,\r
1087 +                            const char *prefix, notmuch_bool_t literal,\r
1088 +                            notmuch_bool_t boolean)\r
1089 +{\r
1090 +    int flags = ((literal ? PREFIX_LITERAL : 0) |\r
1091 +                (boolean ? PREFIX_BOOL : PREFIX_PROB));\r
1092 +    g_hash_table_insert (qparser->prefixes, talloc_strdup (qparser, prefix),\r
1093 +                        GINT_TO_POINTER (flags));\r
1094 +}\r
1095 +\r
1096 +void\r
1097 +_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser,\r
1098 +                               _notmuch_token_t *(*transform) (\r
1099 +                                   _notmuch_token_t *ast, void *opaque),\r
1100 +                               void *opaque)\r
1101 +{\r
1102 +    struct _notmuch_qparser_transform *t;\r
1103 +    t = talloc (qparser, struct _notmuch_qparser_transform);\r
1104 +    t->next = NULL;\r
1105 +    t->transform = transform;\r
1106 +    t->opaque = opaque;\r
1107 +    *qparser->transformsTail = t;\r
1108 +    qparser->transformsTail = &t->next;\r
1109 +}\r
1110 +\r
1111 +struct _notmuch_transform_prefix_info {\r
1112 +    char *field, *prefix;\r
1113 +};\r
1114 +\r
1115 +static _notmuch_token_t *\r
1116 +transform_prefix_rec (_notmuch_token_t *root, char *field, char *prefix,\r
1117 +                     notmuch_bool_t active)\r
1118 +{\r
1119 +    if (!root)\r
1120 +       return NULL;\r
1121 +    if (root->type == TOK_PREFIX) {\r
1122 +       active = (strcmp (field, root->text) == 0);\r
1123 +    } else if (root->type == TOK_TERMS || root->type == TOK_LIT) {\r
1124 +       if (active)\r
1125 +           root->prefix = prefix;\r
1126 +    }\r
1127 +    transform_prefix_rec (root->left, field, prefix, active);\r
1128 +    transform_prefix_rec (root->right, field, prefix, active);\r
1129 +    return root;\r
1130 +}\r
1131 +\r
1132 +static _notmuch_token_t *\r
1133 +transform_prefix (_notmuch_token_t *root, void *opaque)\r
1134 +{\r
1135 +    struct _notmuch_transform_prefix_info *info =\r
1136 +       (struct _notmuch_transform_prefix_info*)opaque;\r
1137 +    return transform_prefix_rec (root, info->field, info->prefix, FALSE);\r
1138 +}\r
1139 +\r
1140 +void\r
1141 +_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser,\r
1142 +                               const char *field, const char *prefix,\r
1143 +                               notmuch_bool_t boolean)\r
1144 +{\r
1145 +    struct _notmuch_transform_prefix_info *info;\r
1146 +    info = talloc (qparser, struct _notmuch_transform_prefix_info);\r
1147 +    info->field = talloc_strdup (info, field);\r
1148 +    info->prefix = talloc_strdup (info, prefix);\r
1149 +    _notmuch_qparser_add_prefix (qparser, field, boolean, boolean);\r
1150 +    _notmuch_qparser_add_transform (qparser, transform_prefix, info);\r
1151 +}\r
1152 +\r
1153 +_notmuch_token_t *\r
1154 +_notmuch_qparser_lex (_notmuch_qparser_t *qparser, const char *query)\r
1155 +{\r
1156 +    return lex (qparser, query);\r
1157 +}\r
1158 +\r
1159 +_notmuch_token_t *\r
1160 +_notmuch_qparser_parse (_notmuch_qparser_t *qparser, const char *query)\r
1161 +{\r
1162 +    _notmuch_token_t *toks = lex (qparser, query);\r
1163 +    return parse (qparser, toks);\r
1164 +}\r
1165 +\r
1166 +_notmuch_token_t *\r
1167 +_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root)\r
1168 +{\r
1169 +    struct _notmuch_qparser_transform *t;\r
1170 +    for (t = qparser->transforms; t; t = t->next)\r
1171 +       root = t->transform (root, t->opaque);\r
1172 +    return root;\r
1173 +}\r
1174 +\r
1175 +Xapian::Query\r
1176 +_notmuch_qparser_generate (_notmuch_qparser_t *qparser, _notmuch_token_t *root)\r
1177 +{\r
1178 +    struct _notmuch_generate_state *s;\r
1179 +    s = talloc (qparser, struct _notmuch_generate_state);\r
1180 +    s->qparser = qparser;\r
1181 +    s->termpos = 1;\r
1182 +    Xapian::Query query = generate (s, root);\r
1183 +    talloc_free (s);\r
1184 +    if (query.empty())\r
1185 +       /* Return all documents */\r
1186 +       return Xapian::Query ("", 1, 0);\r
1187 +    return query;\r
1188 +}\r
1189 -- \r
1190 1.7.2.3\r
1191 \r