[PATCH] WIP: regexp matching in subjects

David Bremner david at tethera.net
Mon Jun 6 19:05:49 PDT 2016


The idea is that you can run

% notmuch search 'subject:rx:<your-favourite-regexp>'

or

% notmuch search subject:"your usual phrase search"

This should also work with bindings.
---

Here is Austin's "hack", crammed into the field processor framework.
I seem to have broken one of the existing subject search tests with my
recursive query parsing. I didn't have time to figure out why, yet.

 lib/Makefile.local     |  2 ++
 lib/database-private.h |  1 +
 lib/database.cc        |  5 +++
 lib/regexp-ps.cc       | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/regexp-ps.h        | 37 ++++++++++++++++++++
 lib/subject-fp.cc      | 41 ++++++++++++++++++++++
 lib/subject-fp.h       | 43 +++++++++++++++++++++++
 7 files changed, 221 insertions(+)
 create mode 100644 lib/regexp-ps.cc
 create mode 100644 lib/regexp-ps.h
 create mode 100644 lib/subject-fp.cc
 create mode 100644 lib/subject-fp.h

diff --git a/lib/Makefile.local b/lib/Makefile.local
index beb9635..0e7311f 100644
--- a/lib/Makefile.local
+++ b/lib/Makefile.local
@@ -51,6 +51,8 @@ libnotmuch_cxx_srcs =		\
 	$(dir)/query.cc		\
 	$(dir)/query-fp.cc      \
 	$(dir)/config.cc	\
+	$(dir)/regexp-ps.cc     \
+	$(dir)/subject-fp.cc    \
 	$(dir)/thread.cc
 
 libnotmuch_modules := $(libnotmuch_c_srcs:.c=.o) $(libnotmuch_cxx_srcs:.cc=.o)
diff --git a/lib/database-private.h b/lib/database-private.h
index ca71a92..5de0b81 100644
--- a/lib/database-private.h
+++ b/lib/database-private.h
@@ -186,6 +186,7 @@ struct _notmuch_database {
 #if HAVE_XAPIAN_FIELD_PROCESSOR
     Xapian::FieldProcessor *date_field_processor;
     Xapian::FieldProcessor *query_field_processor;
+    Xapian::FieldProcessor *subject_field_processor;
 #endif
     Xapian::ValueRangeProcessor *last_mod_range_processor;
 };
diff --git a/lib/database.cc b/lib/database.cc
index 86bf261..adfbb81 100644
--- a/lib/database.cc
+++ b/lib/database.cc
@@ -21,6 +21,7 @@
 #include "database-private.h"
 #include "parse-time-vrp.h"
 #include "query-fp.h"
+#include "subject-fp.h"
 #include "string-util.h"
 
 #include <iostream>
@@ -1008,6 +1009,8 @@ notmuch_database_open_verbose (const char *path,
 	notmuch->query_parser->add_boolean_prefix("date", notmuch->date_field_processor);
 	notmuch->query_field_processor = new QueryFieldProcessor (*notmuch->query_parser, notmuch);
 	notmuch->query_parser->add_boolean_prefix("query", notmuch->query_field_processor);
+	notmuch->subject_field_processor = new SubjectFieldProcessor (*notmuch->query_parser, notmuch);
+	notmuch->query_parser->add_boolean_prefix("subject", notmuch->subject_field_processor);
 #endif
 	notmuch->last_mod_range_processor = new Xapian::NumberValueRangeProcessor (NOTMUCH_VALUE_LAST_MOD, "lastmod:");
 
@@ -1027,6 +1030,8 @@ notmuch_database_open_verbose (const char *path,
 
 	for (i = 0; i < ARRAY_SIZE (PROBABILISTIC_PREFIX); i++) {
 	    prefix_t *prefix = &PROBABILISTIC_PREFIX[i];
+	    if (strcmp (prefix->name, "subject") == 0)
+		continue;
 	    notmuch->query_parser->add_prefix (prefix->name, prefix->prefix);
 	}
     } catch (const Xapian::Error &error) {
diff --git a/lib/regexp-ps.cc b/lib/regexp-ps.cc
new file mode 100644
index 0000000..540c7d6
--- /dev/null
+++ b/lib/regexp-ps.cc
@@ -0,0 +1,92 @@
+/* regexp-ps.cc - posting source matching a value slot against a regexp
+ *
+ * This file is part of notmuch.
+ *
+ * Copyright © 2016 David Bremner
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see https://www.gnu.org/licenses/ .
+ *
+ * Author: Austin Clements <aclements at csail.mit.edu>
+ *                David Bremner <david at tethera.net>
+ */
+
+#include "regexp-ps.h"
+
+/* Compile REGEXP as a POSIX extended regular expression to be matched
+ * against the values stored in value slot SLOT.
+ *
+ * Throws Xapian::QueryParserError, carrying the regerror() text, when
+ * REGEXP does not compile.  A Xapian exception is used rather than a
+ * bare string literal because a `catch (const Xapian::Error &)' handler
+ * does not match a thrown `const char *'.
+ */
+RegexpPostingSource::RegexpPostingSource (Xapian::valueno slot, const std::string &regexp)
+    : slot_ (slot)
+{
+    /* REG_NOSUB: only match / no-match is needed, never sub-match offsets. */
+    int r = regcomp (&regexp_, regexp.c_str (), REG_EXTENDED | REG_NOSUB);
+
+    if (r != 0) {
+	/* regerror() truncates the message to fit and NUL-terminates it. */
+	char msg[256];
+
+	regerror (r, &regexp_, msg, sizeof (msg));
+	throw Xapian::QueryParserError (std::string ("regexp error: ") + msg);
+    }
+}
+
+/* Release the storage regcomp() allocated for the compiled pattern. */
+RegexpPostingSource::~RegexpPostingSource ()
+{
+    regfree (&regexp_);
+}
+
+/* Attach this source to DB and rewind it.
+ *
+ * Keeps a copy of the database handle in db_, positions it_ at the
+ * start of the value stream for slot_ and end_ at its end, and clears
+ * started_ so that the following next() call examines the first entry
+ * instead of stepping past it.
+ */
+void
+RegexpPostingSource::init (const Xapian::Database &db)
+{
+    db_ = db;
+    /* Take both iterators from the stored handle. */
+    it_ = db_.valuestream_begin (slot_);
+    end_ = db_.valuestream_end (slot_);
+    started_ = false;
+}
+
+/* Lower bound on the number of matching documents: always 0. */
+Xapian::doccount
+RegexpPostingSource::get_termfreq_min () const
+{
+    return 0;
+}
+
+/* Estimated number of matching documents: half of the upper bound
+ * reported by get_termfreq_max (), using integer division.
+ */
+Xapian::doccount
+RegexpPostingSource::get_termfreq_est () const
+{
+    const Xapian::doccount upper_bound = get_termfreq_max ();
+
+    return upper_bound / 2;
+}
+
+/* Upper bound on the number of matching documents: whatever
+ * db_.get_value_freq () reports for slot_ (per the Xapian API, the
+ * number of documents with a value stored in that slot).
+ */
+Xapian::doccount
+RegexpPostingSource::get_termfreq_max () const
+{
+    return db_.get_value_freq (slot_);
+}
+
+/* Document id of the value-stream entry it_ currently points at. */
+Xapian::docid
+RegexpPostingSource::get_docid () const
+{
+    return it_.get_docid ();
+}
+
+/* True once it_ has reached end_, i.e. the value stream is exhausted. */
+bool
+RegexpPostingSource::at_end () const
+{
+    return it_ == end_;
+}
+
+/* Advance to the next entry of the value stream whose value matches
+ * regexp_, or to the end of the stream when none is left.  MIN_WT is
+ * ignored.
+ *
+ * The first call after init () starts by examining the entry it_
+ * already points at; every later call first steps past the entry it
+ * stopped on previously.
+ */
+void
+RegexpPostingSource::next (unused (double min_wt))
+{
+    const bool resuming = started_;
+
+    started_ = true;
+    if (resuming && ! at_end ())
+	++it_;
+
+    while (! at_end ()) {
+	const std::string candidate = *it_;
+
+	/* regexec () returns 0 on a match. */
+	if (regexec (&regexp_, candidate.c_str (), 0, NULL, 0) == 0)
+	    return;
+	++it_;
+    }
+}
diff --git a/lib/regexp-ps.h b/lib/regexp-ps.h
new file mode 100644
index 0000000..a4553a7
--- /dev/null
+++ b/lib/regexp-ps.h
@@ -0,0 +1,37 @@
+#ifndef NOTMUCH_REGEX_PS_H
+#define NOTMUCH_REGEX_PS_H
+
+#include <sys/types.h>
+#include <regex.h>
+#include <xapian.h>
+#include "notmuch-private.h"
+
+/* A Xapian posting source that walks the value stream of one value
+ * slot and yields the documents whose value matches a POSIX extended
+ * regular expression.
+ */
+class RegexpPostingSource : public Xapian::PostingSource
+{
+ protected:
+    /* Value slot whose contents are matched. */
+    const Xapian::valueno slot_;
+    /* Pattern compiled by the constructor, freed by the destructor. */
+    regex_t regexp_;
+    /* Database handle stored by init (). */
+    Xapian::Database db_;
+    /* False from init () until the first next () call. */
+    bool started_;
+    /* Current position in, and end of, the value stream for slot_. */
+    Xapian::ValueIterator it_, end_;
+
+    /* No copying */
+    RegexpPostingSource (const RegexpPostingSource &);
+    RegexpPostingSource &operator= (const RegexpPostingSource &);
+
+ public:
+    RegexpPostingSource (Xapian::valueno slot, const std::string &regexp);
+    ~RegexpPostingSource ();
+    void init (const Xapian::Database &db);
+    Xapian::doccount get_termfreq_min () const;
+    Xapian::doccount get_termfreq_est () const;
+    Xapian::doccount get_termfreq_max () const;
+    Xapian::docid get_docid () const;
+    bool at_end () const;
+    void next (unused (double min_wt));
+};
+
+#endif
diff --git a/lib/subject-fp.cc b/lib/subject-fp.cc
new file mode 100644
index 0000000..1627721
--- /dev/null
+++ b/lib/subject-fp.cc
@@ -0,0 +1,41 @@
+/* subject-fp.cc - "subject:" field processor glue
+ *
+ * This file is part of notmuch.
+ *
+ * Copyright © 2016 David Bremner
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see https://www.gnu.org/licenses/ .
+ *
+ * Author: David Bremner <david at tethera.net>
+ */
+
+#include "database-private.h"
+#include "subject-fp.h"
+#include <iostream>
+
+#if HAVE_XAPIAN_FIELD_PROCESSOR
+
+/* Build the query for the text STR that follows "subject:".
+ *
+ * If STR starts with "rx:", the remainder is treated as a regular
+ * expression and the result is a query backed by a
+ * RegexpPostingSource over the NOTMUCH_VALUE_SUBJECT value slot.  The
+ * RegexpPostingSource constructor may throw if the pattern is invalid.
+ *
+ * Otherwise STR is handed back to the query parser, with the prefix
+ * returned by _find_prefix ("subject") as the default prefix.
+ */
+Xapian::Query
+SubjectFieldProcessor::operator() (const std::string & str)
+{
+    const std::string rx_prefix = "rx:";
+
+    if (str.compare (0, rx_prefix.size (), rx_prefix) == 0) {
+	RegexpPostingSource *source =
+	    new RegexpPostingSource (NOTMUCH_VALUE_SUBJECT, str.substr (rx_prefix.size ()));
+
+	/* release () hands ownership of the heap-allocated source to
+	 * Xapian, so it is freed together with the queries using it.
+	 * Keeping the pointer in a member instead would leak one
+	 * object per "rx:" query, since each call creates a new one.
+	 */
+	return Xapian::Query (source->release ());
+    }
+
+    /* NOTE(review): this re-enters the parser this processor is
+     * registered on; the patch's cover note reports an existing subject
+     * search test breaking with this recursive parse -- investigate.
+     */
+    return parser.parse_query (str, NOTMUCH_QUERY_PARSER_FLAGS, _find_prefix ("subject"));
+}
+#endif
diff --git a/lib/subject-fp.h b/lib/subject-fp.h
new file mode 100644
index 0000000..ca622ba
--- /dev/null
+++ b/lib/subject-fp.h
@@ -0,0 +1,43 @@
+/* subject-fp.h - subject field processor glue
+ *
+ * This file is part of notmuch.
+ *
+ * Copyright © 2016 David Bremner
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see https://www.gnu.org/licenses/ .
+ *
+ * Author: David Bremner <david at tethera.net>
+ */
+
+#ifndef NOTMUCH_SUBJECT_FP_H
+#define NOTMUCH_SUBJECT_FP_H
+
+#include <xapian.h>
+#include "notmuch.h"
+#include "regexp-ps.h"
+
+#if HAVE_XAPIAN_FIELD_PROCESSOR
+/* Field processor for the "subject" prefix.  Holds a reference to the
+ * query parser and a pointer to the notmuch database it was created
+ * for; the query construction itself lives in operator().
+ */
+class SubjectFieldProcessor : public Xapian::FieldProcessor {
+ protected:
+    /* Parser passed to the constructor; held by reference. */
+    Xapian::QueryParser &parser;
+    /* Database handle passed to the constructor. */
+    notmuch_database_t *notmuch;
+    /* Initialised to NULL; this class never frees what it points to. */
+    RegexpPostingSource *postings = NULL;
+
+ public:
+    SubjectFieldProcessor (Xapian::QueryParser &parser_, notmuch_database_t *notmuch_)
+	: parser (parser_), notmuch (notmuch_)
+    {
+    }
+
+    Xapian::Query operator() (const std::string & str);
+};
+#endif
+#endif /* NOTMUCH_SUBJECT_FP_H */
-- 
2.8.1



More information about the notmuch mailing list