From 9a9802576333db4044cad2a93db79b03a28885b8 Mon Sep 17 00:00:00 2001 From: David Bremner Date: Tue, 7 Jun 2016 23:05:49 +2100 Subject: [PATCH] [PATCH] WIP: regexp matching in subjects --- e7/68ecfa992c3689768c031f7aaa3e8091f2b943 | 378 ++++++++++++++++++++++ 1 file changed, 378 insertions(+) create mode 100644 e7/68ecfa992c3689768c031f7aaa3e8091f2b943 diff --git a/e7/68ecfa992c3689768c031f7aaa3e8091f2b943 b/e7/68ecfa992c3689768c031f7aaa3e8091f2b943 new file mode 100644 index 000000000..afe3131d2 --- /dev/null +++ b/e7/68ecfa992c3689768c031f7aaa3e8091f2b943 @@ -0,0 +1,378 @@ +Return-Path: +X-Original-To: notmuch@notmuchmail.org +Delivered-To: notmuch@notmuchmail.org +Received: from localhost (localhost [127.0.0.1]) + by arlo.cworth.org (Postfix) with ESMTP id C682C6DE0130 + for ; Mon, 6 Jun 2016 19:06:20 -0700 (PDT) +X-Virus-Scanned: Debian amavisd-new at cworth.org +X-Spam-Flag: NO +X-Spam-Score: -0.011 +X-Spam-Level: +X-Spam-Status: No, score=-0.011 tagged_above=-999 required=5 + tests=[AWL=-0.000, SPF_PASS=-0.001, T_RP_MATCHES_RCVD=-0.01] + autolearn=disabled +Received: from arlo.cworth.org ([127.0.0.1]) + by localhost (arlo.cworth.org [127.0.0.1]) (amavisd-new, port 10024) + with ESMTP id rI7TzeRsZYMf for ; + Mon, 6 Jun 2016 19:06:10 -0700 (PDT) +Received: from fethera.tethera.net (fethera.tethera.net [198.245.60.197]) + by arlo.cworth.org (Postfix) with ESMTPS id C2C196DE00DA + for ; Mon, 6 Jun 2016 19:06:10 -0700 (PDT) +Received: from remotemail by fethera.tethera.net with local (Exim 4.84) + (envelope-from ) + id 1bA6Om-0000AV-CX; Mon, 06 Jun 2016 22:05:44 -0400 +Received: (nullmailer pid 7230 invoked by uid 1000); + Tue, 07 Jun 2016 02:05:54 -0000 +From: David Bremner +To: Austin Clements , + David Bremner +Cc: sfischme@uwaterloo.ca, Gaute Hope , + notmuch +Subject: [PATCH] WIP: regexp matching in subjects +Date: Mon, 6 Jun 2016 23:05:49 -0300 +Message-Id: <1465265149-7174-1-git-send-email-david@tethera.net> +X-Mailer: git-send-email 2.8.1 +In-Reply-To: + 
+References: + +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit +X-BeenThere: notmuch@notmuchmail.org +X-Mailman-Version: 2.1.20 +Precedence: list +List-Id: "Use and development of the notmuch mail system." + +List-Unsubscribe: , + +List-Archive: +List-Post: +List-Help: +List-Subscribe: , + +X-List-Received-Date: Tue, 07 Jun 2016 02:06:20 -0000 + +the idea is that you can run + +% notmuch search 'subject:rx:' + +or + +% notmuch search subject:"your usual phrase search" + +This should also work with bindings. +--- + +Here is Austin's "hack", crammed into the field processor framework. +I seem to have broken one of the existing subject search tests with my +recursive query parsing. I didn't have time to figure out why, yet. + + lib/Makefile.local | 2 ++ + lib/database-private.h | 1 + + lib/database.cc | 5 +++ + lib/regexp-ps.cc | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++ + lib/regexp-ps.h | 37 ++++++++++++++++++++ + lib/subject-fp.cc | 41 ++++++++++++++++++++++ + lib/subject-fp.h | 43 +++++++++++++++++++++++ + 7 files changed, 221 insertions(+) + create mode 100644 lib/regexp-ps.cc + create mode 100644 lib/regexp-ps.h + create mode 100644 lib/subject-fp.cc + create mode 100644 lib/subject-fp.h + +diff --git a/lib/Makefile.local b/lib/Makefile.local +index beb9635..0e7311f 100644 +--- a/lib/Makefile.local ++++ b/lib/Makefile.local +@@ -51,6 +51,8 @@ libnotmuch_cxx_srcs = \ + $(dir)/query.cc \ + $(dir)/query-fp.cc \ + $(dir)/config.cc \ ++ $(dir)/regexp-ps.cc \ ++ $(dir)/subject-fp.cc \ + $(dir)/thread.cc + + libnotmuch_modules := $(libnotmuch_c_srcs:.c=.o) $(libnotmuch_cxx_srcs:.cc=.o) +diff --git a/lib/database-private.h b/lib/database-private.h +index ca71a92..5de0b81 100644 +--- a/lib/database-private.h ++++ b/lib/database-private.h +@@ -186,6 +186,7 @@ struct _notmuch_database { + #if HAVE_XAPIAN_FIELD_PROCESSOR + Xapian::FieldProcessor *date_field_processor; + Xapian::FieldProcessor *query_field_processor; 
++ Xapian::FieldProcessor *subject_field_processor; + #endif + Xapian::ValueRangeProcessor *last_mod_range_processor; + }; +diff --git a/lib/database.cc b/lib/database.cc +index 86bf261..adfbb81 100644 +--- a/lib/database.cc ++++ b/lib/database.cc +@@ -21,6 +21,7 @@ + #include "database-private.h" + #include "parse-time-vrp.h" + #include "query-fp.h" ++#include "subject-fp.h" + #include "string-util.h" + + #include +@@ -1008,6 +1009,8 @@ notmuch_database_open_verbose (const char *path, + notmuch->query_parser->add_boolean_prefix("date", notmuch->date_field_processor); + notmuch->query_field_processor = new QueryFieldProcessor (*notmuch->query_parser, notmuch); + notmuch->query_parser->add_boolean_prefix("query", notmuch->query_field_processor); ++ notmuch->subject_field_processor = new SubjectFieldProcessor (*notmuch->query_parser, notmuch); ++ notmuch->query_parser->add_boolean_prefix("subject", notmuch->subject_field_processor); + #endif + notmuch->last_mod_range_processor = new Xapian::NumberValueRangeProcessor (NOTMUCH_VALUE_LAST_MOD, "lastmod:"); + +@@ -1027,6 +1030,8 @@ notmuch_database_open_verbose (const char *path, + + for (i = 0; i < ARRAY_SIZE (PROBABILISTIC_PREFIX); i++) { + prefix_t *prefix = &PROBABILISTIC_PREFIX[i]; ++ if (strcmp (prefix->name, "subject") == 0) ++ continue; + notmuch->query_parser->add_prefix (prefix->name, prefix->prefix); + } + } catch (const Xapian::Error &error) { +diff --git a/lib/regexp-ps.cc b/lib/regexp-ps.cc +new file mode 100644 +index 0000000..540c7d6 +--- /dev/null ++++ b/lib/regexp-ps.cc +@@ -0,0 +1,92 @@ ++/* query-fp.cc - "query:" field processor glue ++ * ++ * This file is part of notmuch. ++ * ++ * Copyright © 2016 David Bremner ++ * ++ * This program is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see https://www.gnu.org/licenses/ . ++ * ++ * Author: Austin Clements ++ * David Bremner ++ */ ++ ++#include "regexp-ps.h" ++ ++RegexpPostingSource::RegexpPostingSource (Xapian::valueno slot, const std::string &regexp) ++ : slot_ (slot) ++{ ++ int r = regcomp (&regexp_, regexp.c_str (), REG_EXTENDED | REG_NOSUB); ++ ++ if (r != 0) ++ /* XXX Report a query syntax error using regerror */ ++ throw "regcomp failed"; ++} ++ ++RegexpPostingSource::~RegexpPostingSource () ++{ ++ regfree (&regexp_); ++} ++ ++void ++RegexpPostingSource::init (const Xapian::Database &db) ++{ ++ db_ = db; ++ it_ = db_.valuestream_begin (slot_); ++ end_ = db.valuestream_end (slot_); ++ started_ = false; ++} ++ ++Xapian::doccount ++RegexpPostingSource::get_termfreq_min () const ++{ ++ return 0; ++} ++ ++Xapian::doccount ++RegexpPostingSource::get_termfreq_est () const ++{ ++ return get_termfreq_max () / 2; ++} ++ ++Xapian::doccount ++RegexpPostingSource::get_termfreq_max () const ++{ ++ return db_.get_value_freq (slot_); ++} ++ ++Xapian::docid ++RegexpPostingSource::get_docid () const ++{ ++ return it_.get_docid (); ++} ++ ++bool ++RegexpPostingSource::at_end () const ++{ ++ return it_ == end_; ++} ++ ++void ++RegexpPostingSource::next (unused (double min_wt)) ++{ ++ if (started_ && ! at_end ()) ++ ++it_; ++ started_ = true; ++ ++ for (; !
at_end (); ++it_) { ++ std::string value = *it_; ++ if (regexec (&regexp_, value.c_str (), 0, NULL, 0) == 0) ++ break; ++ } ++} +diff --git a/lib/regexp-ps.h b/lib/regexp-ps.h +new file mode 100644 +index 0000000..a4553a7 +--- /dev/null ++++ b/lib/regexp-ps.h +@@ -0,0 +1,37 @@ ++#ifndef NOTMUCH_REGEX_PS_H ++#define NOTMUCH_REGEX_PS_H ++ ++#include ++#include ++#include ++#include "notmuch-private.h" ++ ++/* A posting source that returns documents where a value matches a ++ * regexp. ++ */ ++class RegexpPostingSource : public Xapian::PostingSource ++{ ++protected: ++const Xapian::valueno slot_; ++regex_t regexp_; ++Xapian::Database db_; ++bool started_; ++Xapian::ValueIterator it_, end_; ++ ++/* No copying */ ++RegexpPostingSource (const RegexpPostingSource &); ++RegexpPostingSource &operator= (const RegexpPostingSource &); ++ ++public: ++ RegexpPostingSource (Xapian::valueno slot, const std::string &regexp); ++~RegexpPostingSource (); ++void init (const Xapian::Database &db); ++Xapian::doccount get_termfreq_min () const; ++Xapian::doccount get_termfreq_est () const; ++Xapian::doccount get_termfreq_max () const; ++Xapian::docid get_docid () const; ++bool at_end () const; ++void next (unused (double min_wt)); ++}; ++ ++#endif +diff --git a/lib/subject-fp.cc b/lib/subject-fp.cc +new file mode 100644 +index 0000000..1627721 +--- /dev/null ++++ b/lib/subject-fp.cc +@@ -0,0 +1,41 @@ ++/* subject-fp.cc - "subject:" field processor glue ++ * ++ * This file is part of notmuch. ++ * ++ * Copyright © 2016 David Bremner ++ * ++ * This program is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see https://www.gnu.org/licenses/ . ++ * ++ * Author: David Bremner ++ */ ++ ++#include "database-private.h" ++#include "subject-fp.h" ++#include ++ ++#if HAVE_XAPIAN_FIELD_PROCESSOR ++ ++Xapian::Query ++SubjectFieldProcessor::operator() (const std::string & str) ++{ ++ std::string prefix = "rx:"; ++ ++ if (str.compare(0,prefix.size(),prefix)==0) { ++ postings = new RegexpPostingSource(NOTMUCH_VALUE_SUBJECT, str.substr(prefix.size())); ++ return Xapian::Query(postings); ++ } else { ++ return parser.parse_query (str, NOTMUCH_QUERY_PARSER_FLAGS, _find_prefix ("subject")); ++ } ++} ++#endif +diff --git a/lib/subject-fp.h b/lib/subject-fp.h +new file mode 100644 +index 0000000..ca622ba +--- /dev/null ++++ b/lib/subject-fp.h +@@ -0,0 +1,43 @@ ++/* subject-fp.h - subject field processor glue ++ * ++ * This file is part of notmuch. ++ * ++ * Copyright © 2016 David Bremner ++ * ++ * This program is free software: you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation, either version 3 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program. If not, see https://www.gnu.org/licenses/ . 
++ * ++ * Author: David Bremner ++ */ ++ ++#ifndef NOTMUCH_SUBJECT_FP_H ++#define NOTMUCH_SUBJECT_FP_H ++ ++#include ++#include "notmuch.h" ++#include "regexp-ps.h" ++ ++#if HAVE_XAPIAN_FIELD_PROCESSOR ++class SubjectFieldProcessor : public Xapian::FieldProcessor { ++ protected: ++ Xapian::QueryParser &parser; ++ notmuch_database_t *notmuch; ++ RegexpPostingSource *postings = NULL; ++ public: ++ SubjectFieldProcessor (Xapian::QueryParser &parser_, notmuch_database_t *notmuch_) ++ : parser(parser_), notmuch(notmuch_) { }; ++ ++ Xapian::Query operator()(const std::string & str); ++}; ++#endif ++#endif /* NOTMUCH_SUBJECT_FP_H */ +-- +2.8.1 + -- 2.26.2