1 Return-Path: <amthrax@drake.mit.edu>
\r
2 X-Original-To: notmuch@notmuchmail.org
\r
3 Delivered-To: notmuch@notmuchmail.org
\r
4 Received: from localhost (localhost [127.0.0.1])
\r
5 by olra.theworths.org (Postfix) with ESMTP id B3E1D42D2A1
\r
6 for <notmuch@notmuchmail.org>; Sun, 16 Jan 2011 00:16:24 -0800 (PST)
\r
7 X-Virus-Scanned: Debian amavisd-new at olra.theworths.org
\r
11 X-Spam-Status: No, score=0 tagged_above=-999 required=5 tests=[none]
\r
13 Received: from olra.theworths.org ([127.0.0.1])
\r
14 by localhost (olra.theworths.org [127.0.0.1]) (amavisd-new, port 10024)
\r
15 with ESMTP id 0UysSoUmmtej for <notmuch@notmuchmail.org>;
\r
16 Sun, 16 Jan 2011 00:16:23 -0800 (PST)
\r
17 X-Greylist: delayed 300 seconds by postgrey-1.32 at olra;
\r
18 Sun, 16 Jan 2011 00:16:23 PST
\r
19 Received: from dmz-mailsec-scanner-3.mit.edu (DMZ-MAILSEC-SCANNER-3.MIT.EDU
\r
21 by olra.theworths.org (Postfix) with ESMTP id 4D1FE42D28F
\r
22 for <notmuch@notmuchmail.org>; Sun, 16 Jan 2011 00:16:23 -0800 (PST)
\r
23 X-AuditID: 1209190e-b7b3bae000000a71-4d-4d32a8292d6d
\r
24 Received: from mailhub-auth-1.mit.edu ( [18.9.21.35])
\r
25 by dmz-mailsec-scanner-3.mit.edu (Symantec Brightmail Gateway) with
\r
26 SMTP id 9C.7B.02673.928A23D4; Sun, 16 Jan 2011 03:11:21 -0500 (EST)
\r
27 Received: from outgoing.mit.edu (OUTGOING-AUTH.MIT.EDU [18.7.22.103])
\r
28 by mailhub-auth-1.mit.edu (8.13.8/8.9.2) with ESMTP id p0G8BL6i004717;
\r
29 Sun, 16 Jan 2011 03:11:21 -0500
\r
30 Received: from drake.mit.edu (a074.catapulsion.net [70.36.81.74])
\r
31 (authenticated bits=0)
\r
32 (User authenticated as amdragon@ATHENA.MIT.EDU)
\r
33 by outgoing.mit.edu (8.13.6/8.12.4) with ESMTP id p0G8BJ6K010519
\r
34 (version=TLSv1/SSLv3 cipher=AES256-SHA bits=256 verify=NOT);
\r
35 Sun, 16 Jan 2011 03:11:21 -0500 (EST)
\r
36 Received: from amthrax by drake.mit.edu with local (Exim 4.72)
\r
37 (envelope-from <amthrax@drake.mit.edu>)
\r
38 id 1PeNhn-0002XR-HT; Sun, 16 Jan 2011 03:11:19 -0500
\r
39 From: Austin Clements <amdragon@MIT.EDU>
\r
40 To: notmuch@notmuchmail.org
\r
41 Subject: [PATCH 2/8] Parse NEAR and ADJ operators.
\r
42 Date: Sun, 16 Jan 2011 03:10:52 -0500
\r
43 Message-Id: <1295165458-9573-3-git-send-email-amdragon@mit.edu>
\r
44 X-Mailer: git-send-email 1.7.2.3
\r
45 In-Reply-To: <1295165458-9573-1-git-send-email-amdragon@mit.edu>
\r
46 References: <1295165458-9573-1-git-send-email-amdragon@mit.edu>
\r
47 X-Brightmail-Tracker: AAAAAA==
\r
48 Cc: amdragon@mit.edu
\r
49 X-BeenThere: notmuch@notmuchmail.org
\r
50 X-Mailman-Version: 2.1.13
\r
52 List-Id: "Use and development of the notmuch mail system."
\r
53 <notmuch.notmuchmail.org>
\r
54 List-Unsubscribe: <http://notmuchmail.org/mailman/options/notmuch>,
\r
55 <mailto:notmuch-request@notmuchmail.org?subject=unsubscribe>
\r
56 List-Archive: <http://notmuchmail.org/pipermail/notmuch>
\r
57 List-Post: <mailto:notmuch@notmuchmail.org>
\r
58 List-Help: <mailto:notmuch-request@notmuchmail.org?subject=help>
\r
59 List-Subscribe: <http://notmuchmail.org/mailman/listinfo/notmuch>,
\r
60 <mailto:notmuch-request@notmuchmail.org?subject=subscribe>
\r
61 X-List-Received-Date: Sun, 16 Jan 2011 08:16:24 -0000
\r
63 NEAR and ADJ are treated as n-ary operators where all operands must be
\r
64 terms, which fits with Xapian's own restrictions on near/adj queries.
\r
65 This implementation is slightly more lenient than Xapian's in that it
\r
66 allows phrases (both quoted and implicit) as operands and folds the
\r
67 phrase terms in as operands to the near/adj operator.
\r
69 lib/notmuch-private.h | 10 +++++
\r
70 lib/qparser.cc | 103 ++++++++++++++++++++++++++++++++++++++++++++++---
\r
71 2 files changed, 107 insertions(+), 6 deletions(-)
\r
73 diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
\r
74 index 06239b9..a42afd6 100644
\r
75 --- a/lib/notmuch-private.h
\r
76 +++ b/lib/notmuch-private.h
\r
77 @@ -518,6 +518,12 @@ enum _notmuch_token_type {
\r
78 TOK_LOVE, TOK_HATE, TOK_BRA, TOK_KET,
\r
79 /* Binary operators. These should have left and right children. */
\r
80 TOK_AND, TOK_OR, TOK_XOR,
\r
81 + /* n-ary operators. In the AST, these are represented like lists
\r
82 + * of TOK_TERMS, with the left child being a TOK_TERMS and the
\r
83 + * right being another TOK_ADJ/TOK_NEAR. The final right must be
\r
84 + * NULL. Both tokens can also carry distances; the highest
\r
85 + * distance in the chain will be used. */
\r
86 + TOK_ADJ, TOK_NEAR,
\r
87 /* Unary operators. These have only a left child. Xapian::Query
\r
88 * has no pure NOT operator, so the generator treats NOT as the
\r
89 * child of an AND specially, and otherwise represents it as
\r
90 @@ -555,6 +561,10 @@ typedef struct _notmuch_token {
\r
91 enum _notmuch_token_type type;
\r
94 + /* For TOK_ADJ and TOK_NEAR, this specifies the distance
\r
98 /* For TOK_PREFIX, the flags of this prefix. */
\r
101 diff --git a/lib/qparser.cc b/lib/qparser.cc
\r
102 index b86a445..5a6d39b 100644
\r
103 --- a/lib/qparser.cc
\r
104 +++ b/lib/qparser.cc
\r
106 * _notmuch_qparser_generate to perform step 4.
\r
108 * Still missing from this implementation:
\r
109 - * * NEAR/ADJ operators
\r
110 * * Stemming - The stemming should probably be marked on TOK_TERMS
\r
111 * tokens. Ideally, we can just pass this to the term generator.
\r
112 * * Wildcard queries - This should be available in the IR so it's
\r
113 @@ -101,14 +100,14 @@ struct _notmuch_generate_state {
\r
115 static const char *token_types[] = {
\r
116 "LOVE", "HATE", "BRA", "KET",
\r
117 - "AND", "OR", "XOR",
\r
118 + "AND", "OR", "XOR", "ADJ", "NEAR",
\r
119 "NOT", "FILTER", "PREFIX",
\r
120 "TERMS", "LIT", "ERROR", "END"
\r
123 /* The distinguished end token. This simplifies the parser since it
\r
124 * never has to worry about dereferencing next. */
\r
125 -static _notmuch_token_t tok_end = {TOK_END, NULL, FALSE, NULL,
\r
126 +static _notmuch_token_t tok_end = {TOK_END, NULL, -1, FALSE, NULL,
\r
127 &tok_end, NULL, NULL};
\r
130 @@ -118,6 +117,7 @@ _notmuch_token_create_op (const void *ctx, enum _notmuch_token_type type,
\r
131 _notmuch_token_t *tok = talloc (ctx, struct _notmuch_token);
\r
132 memset (tok, 0, sizeof (*tok));
\r
134 + tok->distance = -1;
\r
136 tok->right = right;
\r
138 @@ -135,6 +135,7 @@ _notmuch_token_create_term (const void *ctx, enum _notmuch_token_type type,
\r
140 _notmuch_token_show (const void *ctx, _notmuch_token_t *tok)
\r
142 + char dist[32] = "";
\r
143 int ispre = tok->type == TOK_PREFIX;
\r
145 if ((unsigned)tok->type > TOK_END)
\r
146 @@ -147,9 +148,11 @@ _notmuch_token_show (const void *ctx, _notmuch_token_t *tok)
\r
147 else if (tok->type == TOK_ERROR)
\r
148 return talloc_asprintf (ctx, "ERROR/\"%s\"", tok->text);
\r
150 - return talloc_asprintf (ctx, "%s%s%s",
\r
151 + if (tok->distance != -1)
\r
152 + sprintf(dist, "/%d", tok->distance);
\r
153 + return talloc_asprintf (ctx, "%s%s%s%s",
\r
154 token_types[tok->type],
\r
155 - ispre ? "/" : "",
\r
156 + dist, ispre ? "/" : "",
\r
157 ispre ? tok->text : "");
\r
160 @@ -308,10 +311,31 @@ static notmuch_bool_t
\r
161 lex_operator (struct _notmuch_lex_state *s, char *term,
\r
162 const char *op, enum _notmuch_token_type type)
\r
164 + size_t oplen = strlen (op);
\r
166 if (strcasecmp (term, op) == 0) {
\r
167 lex_emit (s, type, term);
\r
171 + /* Check for ADJ or NEAR with argument. Our parsing of this is
\r
172 + * slightly incompatible with Xapian, but I believe this to be a
\r
173 + * bug in Xapian. Xapian parses "x NEAR/y z" as three term
\r
174 + * phrases, "x", "near y", and "z", like we do. However, it
\r
175 + * behaves differently if the bad NEAR operator is at the end of
\r
176 + * the query, parsing "x NEAR/y" like "x NEAR y". */
\r
177 + if ((type == TOK_ADJ || type == TOK_NEAR) &&
\r
178 + strncasecmp (term, op, oplen) == 0 &&
\r
179 + term[oplen] == '/') {
\r
180 + /* Try to parse the distance argument */
\r
182 + int distance = strtol (&term[oplen + 1], &end, 10);
\r
183 + if (distance && !*end) {
\r
184 + struct _notmuch_token *tok = lex_emit (s, type, term);
\r
185 + tok->distance = distance;
\r
192 @@ -403,7 +427,9 @@ lex (const void *ctx, _notmuch_qparser_t *qparser, const char *query)
\r
193 if (lex_operator (s, term, "and", TOK_AND) ||
\r
194 lex_operator (s, term, "not", TOK_NOT) ||
\r
195 lex_operator (s, term, "xor", TOK_XOR) ||
\r
196 - lex_operator (s, term, "or", TOK_OR))
\r
197 + lex_operator (s, term, "or", TOK_OR) ||
\r
198 + lex_operator (s, term, "adj", TOK_ADJ) ||
\r
199 + lex_operator (s, term, "near", TOK_NEAR))
\r
202 /* Must be a term */
\r
203 @@ -495,6 +521,8 @@ parse_prob (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)
\r
209 sub = parse_expr (s, prec + 1, tok);
\r
210 add_to_query (s->ctx, &probs, TOK_AND, sub);
\r
212 @@ -529,6 +557,37 @@ parse_prob (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)
\r
215 static _notmuch_token_t *
\r
216 +parse_near (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)
\r
218 + _notmuch_token_type first = (*tok)->type, conj = (*tok)->next->type;
\r
219 + _notmuch_token_t *root = parse_expr (s, prec + 1, tok);
\r
220 + _notmuch_token_t **tail = NULL;
\r
222 + /* XXX Xapian allows prefixed terms in near/adj. */
\r
223 + if (first != TOK_TERMS || !(conj == TOK_NEAR || conj == TOK_ADJ))
\r
226 + while ((*tok)->type == conj && (*tok)->next->type == TOK_TERMS) {
\r
228 + /* First operator. Create the list root. */
\r
229 + _notmuch_token_t *nroot =
\r
230 + _notmuch_token_create_op (s->ctx, conj, root, NULL);
\r
232 + tail = &nroot->right;
\r
235 + /* Append the operator and term token to the list */
\r
237 + *tok = (*tok)->next;
\r
238 + (*tail)->left = *tok;
\r
239 + *tok = (*tok)->next;
\r
240 + tail = &(*tail)->right;
\r
246 +static _notmuch_token_t *
\r
247 parse_term (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)
\r
249 _notmuch_token_t *sub;
\r
250 @@ -596,6 +655,8 @@ parse_expr (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)
\r
251 return parse_prob (s, prec, tok);
\r
254 + return parse_near (s, prec, tok);
\r
256 return parse_term (s, prec, tok);
\r
258 _notmuch_token_t *left = parse_expr (s, prec + 1, tok);
\r
259 @@ -756,6 +817,36 @@ generate (struct _notmuch_generate_state *s, _notmuch_token_t *root)
\r
260 return Query (root->type == TOK_OR ? Query::OP_OR : Query::OP_XOR,
\r
266 + _notmuch_token_t *node;
\r
268 + char *terms = talloc_strdup (root, "");
\r
269 + /* Concatenate the operands and get the highest distance */
\r
270 + for (node = root; node; node = node->right) {
\r
271 + if (node->left->type != TOK_TERMS)
\r
272 + INTERNAL_ERROR ("Illegal token in NEAR/ADJ: %s",
\r
273 + _notmuch_token_show (s->ctx, node->left));
\r
274 + if (node->left->prefix)
\r
275 + INTERNAL_ERROR ("Prefixes not supported in NEAR/ADJ");
\r
277 + terms = talloc_asprintf_append (terms, "%s ", node->left->text);
\r
278 + if (node->distance > dist)
\r
279 + dist = node->distance;
\r
281 + /* The default distance is 10. */
\r
284 + /* Generate a PHRASE or NEAR query. If there are implicit
\r
285 + * phrases, they will be split out and treated like any other
\r
286 + * term in the operand list. */
\r
287 + op = root->type == TOK_ADJ ? Query::OP_PHRASE : Query::OP_NEAR;
\r
288 + l = generate_terms (s, terms, NULL, dist - 1, op);
\r
289 + talloc_free (terms);
\r
294 return generate (s, root->left);
\r