[PATCH 1/2] Implement a custom query parser with a mostly Xapian-compatible grammar.
Austin Clements
amdragon at MIT.EDU
Wed Dec 22 22:00:23 PST 2010
This parser takes an extra step through an intermediate representation
that is convenient to manipulate via pluggable transformation passes.
These are used to implement regular Xapian-style query prefixes, but
are flexible enough to accomplish far more.
---
lib/Makefile.local | 1 +
lib/database-private.h | 6 +
lib/notmuch-private.h | 126 +++++++
lib/qparser.cc | 941 ++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1074 insertions(+), 0 deletions(-)
create mode 100644 lib/qparser.cc
diff --git a/lib/Makefile.local b/lib/Makefile.local
index 37d1c0d..37d3735 100644
--- a/lib/Makefile.local
+++ b/lib/Makefile.local
@@ -64,6 +64,7 @@ libnotmuch_cxx_srcs = \
$(dir)/index.cc \
$(dir)/message.cc \
$(dir)/query.cc \
+ $(dir)/qparser.cc \
$(dir)/thread.cc
libnotmuch_modules = $(libnotmuch_c_srcs:.c=.o) $(libnotmuch_cxx_srcs:.cc=.o)
diff --git a/lib/database-private.h b/lib/database-private.h
index 9f83407..358a71b 100644
--- a/lib/database-private.h
+++ b/lib/database-private.h
@@ -64,6 +64,12 @@ _notmuch_get_terms_with_prefix (void *ctx, Xapian::TermIterator &i,
Xapian::TermIterator &end,
const char *prefix);
+/* qparser.cc */
+
+/* Generate a Xapian query from a query AST. */
+Xapian::Query
+_notmuch_qparser_generate (_notmuch_qparser_t *qparser, _notmuch_token_t *root);
+
#pragma GCC visibility pop
#endif
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index b6f1095..be76c0c 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -508,6 +508,132 @@ notmuch_filenames_t *
_notmuch_filenames_create (const void *ctx,
notmuch_string_list_t *list);
+/* qparser.cc */
+
+typedef struct _notmuch_qparser _notmuch_qparser_t;
+
+enum _notmuch_token_type {
+ /* These first four token types appear only in the lexer output
+ * and never in the parse tree. */
+ TOK_LOVE, TOK_HATE, TOK_BRA, TOK_KET,
+ /* Binary operators. These should have left and right children.
+ * ADJ and NEAR should have a distance. */
+ TOK_AND, TOK_OR, TOK_XOR, TOK_ADJ, TOK_NEAR,
+ /* Unary operators. These have only a left child. Xapian::Query
+ * has no pure NOT operator, so the generator treats NOT as the
+ * child of an AND specially, and otherwise represents it as
+ * "<all> AND_NOT x". FILTER ignores the weights of the subquery
+ * and generates Xapian::Query::OP_FILTER if the left child of an
+ * AND or Xapian::Query::OP_SCALE_WEIGHT otherwise. The text
+ * field of a PREFIX operator specifies the prefix. PREFIX
+ * operators specify only syntactic prefixes, not database
+ * prefixes, and thus have no effect on the generated query. */
+ TOK_NOT, TOK_FILTER, TOK_PREFIX,
+ /* A single TOK_TERMS token can represent a single term, a quoted
+ * phrase, or an implicit phrase. An implicit phrase is something
+ * like "foo/bar", for which the database contains two separate
+ * terms, but you want to treat them as a phrase, even though it's
+ * not quoted. Xapian calls characters that implicitly connect
+ * terms into phrases "phrase generators." We take a simpler
+ * approach and treat almost any non-whitespace character as a
+ * phrase generator. */
+ TOK_TERMS,
+ /* Like TOK_TERMS, but the term text should be taken literally,
+ * with no phrase splitting or whitespace removal. The lexer
+ * only generates TOK_TERMS; the parser creates TOK_LIT. */
+ TOK_LIT,
+ /* TOK_END indicates the end of the token list. Such tokens loop
+ * back on themselves so it's always safe to follow "next".
+ * These appear only in the lexer output. */
+ TOK_END
+};
+
+typedef struct _notmuch_token {
+ enum _notmuch_token_type type;
+ const char *text;
+
+ /* For TOK_ADJ and TOK_NEAR, this specifies the distance
+ * argument. */
+ int distance;
+
+ /* For TOK_PREFIX, the flags of this prefix. */
+ int prefixFlags;
+
+ /* For TOK_TERMS and TOK_LIT, the database prefix to use when
+ * generating database terms. This must be filled in a
+ * transformation pass. */
+ char *prefix;
+
+ /* Link in the lexer token list. */
+ struct _notmuch_token *next;
+
+ /* Links in the intermediate AST. */
+ struct _notmuch_token *left, *right;
+} _notmuch_token_t;
+
+_notmuch_token_t *
+_notmuch_token_create (const void *ctx, enum _notmuch_token_type type,
+ const char *text);
+
+char *
+_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok);
+
+char *
+_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root);
+
+_notmuch_qparser_t *
+_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch);
+
+/* Add a syntactic prefix. This will appear as a TOK_PREFIX in the
+ * AST, but does not alone affect the final query.
+ *
+ * The literal flag affects lexing. If true, this prefix must be
+ * followed by a regular term or quoted literal, which will not be
+ * stripped of whitespace or split in to a phrase. The boolean flag
+ * affects parsing. If true, then terms with this prefix will be
+ * combined into the query using the FILTER operator, so they must
+ * appear in the result and will not contribute to weights. Xapian's
+ * "boolean prefixes" are both literal and boolean.
+ */
+void
+_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser,
+ const char *prefix, notmuch_bool_t literal,
+ notmuch_bool_t boolean);
+
+/* Add a transform pass to a query parser. The transform function
+ * will be called with the root of the AST and should return a new AST
+ * root (which may be the same as the old root).
+ */
+void
+_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser,
+ _notmuch_token_t *(*transform) (
+ _notmuch_token_t *ast, void *opaque),
+ void *opaque);
+
+/* Add a syntactic prefix (field) and a transform pass to transform
+ * that syntactic prefix into a database prefix (prefix). This
+ * corresponds to Xapian's add_prefix and add_boolean_prefix
+ * functions. */
+void
+_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser,
+ const char *field, const char *prefix,
+ notmuch_bool_t boolean);
+
+/* Lex a query string, returning the first token in the token list.
+ * This is only meant for testing. */
+_notmuch_token_t *
+_notmuch_qparser_lex (_notmuch_qparser_t *qparser, const char *query);
+
+/* Parse a query string, returning the root of the AST. */
+_notmuch_token_t *
+_notmuch_qparser_parse (_notmuch_qparser_t *qparser, const char *query);
+
+/* Transform a parsed query, running the transforms in the order they
+ * were added to the query parser. Return the root of the transformed
+ * AST. */
+_notmuch_token_t *
+_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root);
+
#pragma GCC visibility pop
NOTMUCH_END_DECLS
diff --git a/lib/qparser.cc b/lib/qparser.cc
new file mode 100644
index 0000000..4804d06
--- /dev/null
+++ b/lib/qparser.cc
@@ -0,0 +1,941 @@
+/* qparser.cc - Notmuch query parser
+ *
+ * Copyright © 2010 Austin Clements
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/ .
+ *
+ * Author: Austin Clements <amdragon at mit.edu>
+ */
+
+/*
+ * Query parsing is performed in a series of phases similar to those
+ * of a traditional compiler.
+ *
+ * 1) We tokenize the query, identifying operators and term phrases.
+ * Note that a phrase (quoted or implicit) generates a single token
+ * at this point, which we split up later (unlike in the Xapian
+ * lexer).
+ * 2) We parse the token stream, generating an intermediate
+ * representation in the form of a binary AST. This IR is similar
+ * to the Xapian::Query operators, but is designed to be more
+ * easily manipulated.
+ * 3) We transform the parse tree, running a sequence of
+ * caller-provided transformation functions over the tree.
+ * 4) We generate the Xapian::Query from the transformed IR. This
+ * step also splits phrase tokens into multiple query terms.
+ * XXX and expands wildcard terms.
+ *
+ * To use the query parser, call _notmuch_qparser_parse to perform
+ * steps 1 and 2, _notmuch_qparser_transform to perform step 3, and
+ * _notmuch_qparser_generate to perform step 4.
+ *
+ * Still missing from this implementation:
+ * * Stemming - The stemming should probably be marked on TOK_TERMS
+ * tokens. Ideally, we can just pass this to the term generator.
+ * * Wildcard queries - This should be available in the IR so it's
+ * easy to generate wildcard queries in a transformer.
+ * * Value ranges in the IR
+ * * Queries "" and "*"
+ * * Memory management is poorly suited to reusing qparser's.
+ */
+
+/* XXX notmuch currently registers "tag" as an exclusive boolean
+ * prefix, which means queries like "tag:x tag:y" will return messages
+ * with tag x OR tag y. Is this intentional? */
+
+#include "notmuch-private.h"
+#include "database-private.h"
+
+#include <glib.h> /* GHashTable */
+
+struct _notmuch_qparser {
+ notmuch_database_t *notmuch;
+
+ /* Maps from prefix strings to PREFIX_* flags */
+ GHashTable *prefixes;
+
+ struct _notmuch_qparser_transform *transforms;
+ struct _notmuch_qparser_transform **transformsTail;
+};
+
+enum {
+ PREFIX_LITERAL = 1<<1,
+ PREFIX_PROB = 1<<2,
+ PREFIX_BOOL = 1<<3,
+};
+
+struct _notmuch_qparser_transform {
+ struct _notmuch_qparser_transform *next;
+ _notmuch_token_t *(*transform) (_notmuch_token_t *ast, void *opaque);
+ void *opaque;
+};
+
+struct _notmuch_lexer_state {
+ _notmuch_qparser_t *qparser;
+ _notmuch_token_t *head;
+ _notmuch_token_t **tail;
+};
+
+struct _notmuch_generate_state {
+ _notmuch_qparser_t *qparser;
+ unsigned int termpos;
+};
+
+static const char *token_types[] = {
+ "LOVE", "HATE", "BRA", "KET",
+ "AND", "OR", "XOR", "ADJ", "NEAR",
+ "NOT", "FILTER", "PREFIX",
+ "TERMS", "LIT", "END"
+};
+
+/* The distinguished end token. This simplifies the parser since it
+ * never has to worry about dereferencing next. */
+static _notmuch_token_t tok_end = {TOK_END, NULL, -1, FALSE, NULL,
+ &tok_end, NULL, NULL};
+
+_notmuch_token_t *
+_notmuch_token_create (const void *ctx, enum _notmuch_token_type type,
+ const char *text)
+{
+ struct _notmuch_token *tok = talloc (ctx, struct _notmuch_token);
+ tok->type = type;
+ tok->text = text;
+ if (type == TOK_ADJ || type == TOK_NEAR)
+ tok->distance = 10;
+ else
+ tok->distance = -1;
+ tok->prefixFlags = 0;
+ tok->prefix = NULL;
+ tok->left = tok->right = NULL;
+ return tok;
+}
+
+char *
+_notmuch_token_show_list (const void *ctx, _notmuch_token_t *tok)
+{
+ char *out = talloc_strdup (ctx, "");
+
+ for (; tok->type != TOK_END; tok = tok->next) {
+ out = talloc_asprintf_append (out, "%s%s",
+ *out == 0 ? "" : " ",
+ token_types[tok->type]);
+ if (tok->distance != -1)
+ out = talloc_asprintf_append (out, "/%d", tok->distance);
+ if (tok->text)
+ out = talloc_asprintf_append (out, ":%s", tok->text);
+ }
+
+ return out;
+}
+
+char *
+_notmuch_token_show_tree (const void *ctx, _notmuch_token_t *root)
+{
+ if (!root) {
+ return talloc_strdup (ctx, "<nil>");
+ } else if (root->type == TOK_TERMS || root->type == TOK_LIT) {
+ return talloc_strdup (ctx, root->text);
+ } else {
+ char *out = talloc_asprintf (ctx, "(%s", token_types[root->type]);
+ void *local = talloc_new (ctx);
+ if (root->type == TOK_PREFIX)
+ out = talloc_asprintf_append (out, "/%s", root->text);
+ if (root->left)
+ out = talloc_asprintf_append
+ (out, " %s", _notmuch_token_show_tree (local, root->left));
+ if (root->right)
+ out = talloc_asprintf_append
+ (out, " %s", _notmuch_token_show_tree (local, root->right));
+ out = talloc_asprintf_append (out, ")");
+ talloc_free (local);
+ return out;
+ }
+}
+
+using Xapian::Unicode::is_whitespace;
+using Xapian::Unicode::is_wordchar;
+
+static Xapian::Utf8Iterator
+lex_skip_ws (Xapian::Utf8Iterator it)
+{
+ while (is_whitespace (*it))
+ it++;
+ return it;
+}
+
+static struct _notmuch_token *
+lex_emit (struct _notmuch_lexer_state *s, enum _notmuch_token_type type,
+ char *text)
+{
+ struct _notmuch_token *tok = _notmuch_token_create (s, type, text);
+ tok->next = &tok_end;
+ *(s->tail) = tok;
+ s->tail = &tok->next;
+ return tok;
+}
+
+
+/* Lex a quoted phrase, returning an iterator pointing to the
+ * character following the closing quote. If escaped, then accept two
+ * quotes as an escaped version of a single quote.
+ */
+static Xapian::Utf8Iterator
+lex_quoted_phrase (struct _notmuch_lexer_state *s,
+ Xapian::Utf8Iterator it, notmuch_bool_t escaped)
+{
+ Xapian::Utf8Iterator next, orig, end;
+ char *term, *src, *dst;
+
+ /* Find the end of the phrase */
+ assert (*(it++) == '"');
+ for (next = it; next != end; ++next) {
+ if (*next == '"') {
+ if (!escaped)
+ break;
+
+ orig = next++;
+ if (next == end || *next != '"') {
+ next = orig;
+ break;
+ }
+ }
+ }
+
+ /* Xapian still lexes +/-/( in quotes mode and simply doesn't
+ * generate tokens for them. For us, the term generator will
+ * discard them. */
+ term = talloc_strndup (s, it.raw (), next.raw () - it.raw ());
+ if (escaped) {
+ /* Replace doubled quotes with a single quote. */
+ for (src = dst = term; *src; ++src, ++dst) {
+ *dst = *src;
+ if (*src == '"')
+ ++src;
+ }
+ *dst = '\0';
+ }
+ lex_emit (s, TOK_TERMS, term);
+
+ if (next != end)
+ ++next;
+ return next;
+}
+
+static Xapian::Utf8Iterator
+lex_try_consume_prefix (_notmuch_qparser_t *qparser, Xapian::Utf8Iterator it,
+ char **prefixOut, int *flagsOut)
+{
+ Xapian::Utf8Iterator next (it), end;
+ char *prefix;
+ int flags;
+
+ *prefixOut = NULL;
+ while (next != end && *next != ':' && !is_whitespace (*next))
+ ++next;
+ if (*next != ':')
+ return it;
+ /* Ignore if followed by <= ' ' or ')' */
+ ++next;
+ if (*next <= ' ' || *next == ')')
+ return it;
+
+ prefix = talloc_strndup (qparser, it.raw (), next.raw () - it.raw() - 1);
+ flags = GPOINTER_TO_INT (g_hash_table_lookup (qparser->prefixes, prefix));
+ if (!flags) {
+ /* Not a known prefix */
+ talloc_free (prefix);
+ return it;
+ }
+ *prefixOut = prefix;
+ *flagsOut = flags;
+ return next;
+}
+
+static Xapian::Utf8Iterator
+lex_consume_term (const void *ctx, Xapian::Utf8Iterator it, char **termOut)
+{
+ Xapian::Utf8Iterator next (it), end;
+ /* Xapian permits other characters to separate term phrases. For
+ * example, "x#y" is parsed as two separate (non-phrase) terms.
+ * However, because the characters allowed in a term are
+ * context-sensitive, replicating this is very hard. Here we take
+ * a simpler approach where only whitespace and a few operator
+ * characters that are never term characters separate terms. */
+ while (next != end && !strchr ("()\"", *next) && !is_whitespace (*next))
+ ++next;
+ *termOut = talloc_strndup (ctx, it.raw (), next.raw () - it.raw ());
+ return next;
+}
+
+static notmuch_bool_t
+lex_operator (struct _notmuch_lexer_state *s, char *term,
+ const char *op, enum _notmuch_token_type type)
+{
+ size_t oplen = strlen (op);
+
+ if (strcasecmp (term, op) == 0) {
+ lex_emit (s, type, term);
+ return true;
+ }
+
+ /* Check for ADJ or NEAR with argument. Our parsing of this is
+ * slightly incompatible with Xapian, but I believe this to be a
+ * bug in Xapian. Xapian parses "x NEAR/y z" as three term
+ * phrases, "x", "near y", and "z", like we do. However, it
+ * behaves differently if the bad NEAR operator is at the end of
+ * the query, parsing "x NEAR/y" like "x NEAR y". */
+ if ((type == TOK_ADJ || type == TOK_NEAR) &&
+ strncasecmp (term, op, oplen) == 0 &&
+ term[oplen] == '/') {
+ /* Try to parse the distance argument */
+ char *end;
+ int distance = strtol (&term[oplen + 1], &end, 10);
+ if (distance && !*end) {
+ struct _notmuch_token *tok = lex_emit (s, type, term);
+ tok->distance = distance;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static _notmuch_token_t *
+lex (_notmuch_qparser_t *qparser, const char *query)
+{
+ Xapian::Utf8Iterator it (query), next, end;
+ struct _notmuch_lexer_state *s;
+ struct _notmuch_token *tok;
+ char *term;
+ int prefixFlags;
+
+ s = talloc (qparser, struct _notmuch_lexer_state);
+ if (!s)
+ return NULL;
+ s->qparser = qparser;
+ s->head = &tok_end;
+ s->tail = &s->head;
+
+ while (it != end) {
+ unsigned ch;
+ if ((it = lex_skip_ws (it)) == end)
+ break;
+
+ ch = *it;
+ switch (ch) {
+ case '+':
+ case '-':
+ ++it;
+ /* Xapian ignores these unless preceded by whitespace or
+ * an open paren, which has the effect of ignoring all
+ * +'s in "x +++y", "x#+y", and "(x)+y". We don't
+ * bother. */
+
+ /* Ignore if followed by a space or another + or - */
+ if (is_whitespace (*it) || *it == '+' || *it == '-')
+ continue;
+ lex_emit (s, ch == '+' ? TOK_LOVE : TOK_HATE, NULL);
+ continue;
+
+ case '"':
+ it = lex_quoted_phrase(s, it, false);
+ continue;
+
+ case '(':
+ ++it;
+ /* Xapian ignores this unless preceded by whitespace,
+ * parens, +, or -. We don't bother. */
+ lex_emit (s, TOK_BRA, NULL);
+ continue;
+
+ case ')':
+ ++it;
+ lex_emit (s, TOK_KET, NULL);
+ continue;
+ }
+
+ /* Scan for a prefix */
+ next = lex_try_consume_prefix (qparser, it, &term, &prefixFlags);
+ if (term && (*next == '"' || *next == '(' || is_wordchar (*next))) {
+ int literal = prefixFlags & PREFIX_LITERAL;
+ tok = lex_emit (s, TOK_PREFIX, term);
+ tok->prefixFlags = prefixFlags;
+
+ it = next;
+ if (literal && *it == '"') {
+ /* Literal quoted strings keep everything and allow
+ * quote escaping, unlike regular quoted phrases. */
+ it = lex_quoted_phrase (s, it, true);
+ } else if (literal) {
+ /* Xapian uses anything up to the next space or ')'
+ * (because literal prefixes can't be applied to
+ * subqueries). I disagree with Xapian here, since
+ * Xapian will keep the open paren but not the close
+ * paren. Better would be to balance them. */
+ if (*next == '(')
+ ++next;
+ while (next != end && *next > ' ' && *next != ')')
+ ++next;
+ term = talloc_strndup (s, it.raw (),
+ next.raw () - it.raw ());
+ lex_emit (s, TOK_TERMS, term);
+ it = next;
+ }
+ /* For quoted strings and subqueries following non-literal
+ * prefixes and regular terms, we lex as usual. */
+ continue;
+ }
+
+ /* Scan for a term phrase or operator */
+ it = lex_consume_term (s, it, &term);
+
+ /* Check operators */
+ if (lex_operator (s, term, "and", TOK_AND) ||
+ lex_operator (s, term, "not", TOK_NOT) ||
+ lex_operator (s, term, "xor", TOK_XOR) ||
+ lex_operator (s, term, "or", TOK_OR) ||
+ lex_operator (s, term, "adj", TOK_ADJ) ||
+ lex_operator (s, term, "near", TOK_NEAR))
+ continue;
+
+ /* Must be a term */
+ lex_emit (s, TOK_TERMS, term);
+ }
+
+ return s->head;
+}
+
+static void
+add_to_query (const void *ctx, _notmuch_token_t **query,
+ _notmuch_token_type op, _notmuch_token_t *right)
+{
+ if (!*query)
+ *query = right;
+ else if (right) {
+ _notmuch_token_t *tok = _notmuch_token_create (ctx, op, NULL);
+ tok->left = *query;
+ tok->right = right;
+ *query = tok;
+ }
+}
+
+static _notmuch_token_t *
+parse_expr (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok);
+static _notmuch_token_t *
+parse_term (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok);
+
+static _notmuch_token_t *
+parse_prob (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)
+{
+ /* A prob is a sequence of three types of subqueries. Because the
+ * default query operator is AND, loved terms are not treated
+ * specially.
+ * 1) Probabilistic terms (prefixed or not). These are combined
+ * with the default query operator, AND.
+ * 2) Terms with a boolean prefix. All of the terms with the same
+ * prefix are combined with OR. Different prefixes are
+ * combined with AND.
+ * 3) Hate terms. These are combined with OR.
+ * The final IR looks like
+ * (probs AND (FILTER bools)) AND (NOT hates)
+ */
+
+ _notmuch_token_t *probs, *hates, *sub, *q;
+ GHashTable *bools;
+ int done = 0;
+
+ probs = hates = NULL;
+ bools = g_hash_table_new (g_str_hash, g_str_equal);
+ while (!done) {
+ switch ((*tok)->type) {
+ case TOK_KET:
+ if (prec < 10) {
+ /* Unmatched close paren. Ignore it. */
+ *tok = (*tok)->next;
+ break;
+ }
+ /* Fall through */
+ case TOK_AND: case TOK_OR: case TOK_XOR: case TOK_NOT:
+ case TOK_END:
+ /* End of the prob. Might be empty. */
+ done = 1;
+ break;
+
+ case TOK_HATE:
+ *tok = (*tok)->next;
+ sub = parse_expr (qparser, prec + 1, tok);
+ add_to_query (qparser, &hates, TOK_OR, sub);
+ break;
+
+ case TOK_PREFIX:
+ sub = parse_expr (qparser, prec + 1, tok);
+ if (!sub)
+ break;
+ if (sub->prefixFlags & PREFIX_PROB) {
+ add_to_query (qparser, &probs, TOK_AND, sub);
+ } else {
+ _notmuch_token_t *newb, *pre = (_notmuch_token_t*)
+ g_hash_table_lookup (bools, sub->text);
+ if (!pre) {
+ newb = sub;
+ } else {
+ /* OR subqueries with same prefix */
+ newb = _notmuch_token_create (qparser, TOK_OR, NULL);
+ newb->left = pre;
+ newb->right = sub;
+ }
+ g_hash_table_insert (bools, (void*)sub->text, newb);
+ }
+ break;
+
+ case TOK_LOVE:
+ /* Join into the query like any other term, since the
+ * default operator is AND anyway. */
+ *tok = (*tok)->next;
+ /* Fall through */
+ case TOK_BRA:
+ case TOK_TERMS:
+ case TOK_LIT:
+ sub = parse_expr (qparser, prec + 1, tok);
+ add_to_query (qparser, &probs, TOK_AND, sub);
+ break;
+
+ case TOK_ADJ:
+ case TOK_NEAR:
+ case TOK_FILTER:
+ /* XXX Can ADJ or NEAR happen? */
+ INTERNAL_ERROR ("Unexpected token type %d", (*tok)->type);
+ }
+ }
+
+ q = probs;
+ if (g_hash_table_size (bools)) {
+ /* Merge boolean filters */
+ _notmuch_token_t *filter;
+ GList *vals = g_hash_table_get_values (bools), *l;
+ sub = NULL;
+ for (l = vals; l; l = l->next)
+ add_to_query (qparser, &sub, TOK_AND, (_notmuch_token_t *) l->data);
+ g_list_free (vals);
+
+ /* Create filter */
+ filter = _notmuch_token_create (qparser, TOK_FILTER, NULL);
+ filter->left = sub;
+ add_to_query (qparser, &q, TOK_AND, filter);
+ }
+ if (hates) {
+ sub = _notmuch_token_create (qparser, TOK_NOT, NULL);
+ sub->left = hates;
+ add_to_query (qparser, &q, TOK_AND, sub);
+ }
+ g_hash_table_unref (bools);
+ return q;
+}
+
+static _notmuch_token_t *
+parse_term (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)
+{
+ _notmuch_token_t *sub;
+
+ if ((*tok)->type == TOK_END) {
+ /* Arises from things like "x -". Ignore. */
+ return NULL;
+ } else if ((*tok)->type == TOK_PREFIX) {
+ sub = *tok;
+ *tok = (*tok)->next;
+ sub->left = parse_term (qparser, prec, tok);
+ if (!sub->left)
+ return NULL;
+ if (sub->prefixFlags & PREFIX_LITERAL) {
+ /* Convert TOK_TERMS to TOK_LIT */
+ assert (sub->left->type == TOK_TERMS);
+ sub->left->type = TOK_LIT;
+ } else if (sub->left->type == TOK_PREFIX) {
+ sub->left = sub->left->left;
+ }
+ return sub;
+ } else if ((*tok)->type == TOK_BRA) {
+ *tok = (*tok)->next;
+ sub = parse_expr (qparser, prec + 10 - (prec%10), tok);
+ if ((*tok)->type == TOK_KET)
+ *tok = (*tok)->next;
+ return sub;
+ }
+
+ if ((*tok)->type != TOK_TERMS && (*tok)->type != TOK_LIT) {
+ /* Arises from "+AND", "-AND", "prob:AND". We could give up
+ * and return nothing, but it seems nicer to treat the
+ * operator as a term if it came from the original query. */
+ if (!(*tok)->text)
+ return NULL;
+ (*tok)->type = TOK_TERMS;
+ }
+
+ sub = *tok;
+ *tok = (*tok)->next;
+ return sub;
+}
+
+static _notmuch_token_t *
+parse_expr (_notmuch_qparser_t *qparser, int prec, _notmuch_token_t **tok)
+{
+ /* If you squint at the Xapian grammar, it's a precedence grammar
+ * with one strange "prob" level. This implements all but the
+ * prob level and the leaf "term" level.
+ *
+ * prec is (nesting level * 10 + precedence level). Normally we
+ * only care about the precedence level, but the nesting level is
+ * important for recovering from unbalanced parens.
+ */
+ int bprec = prec % 10;
+ if (bprec == 3) {
+ if ((*tok)->type == TOK_NOT) {
+ /* Unary NOT */
+ _notmuch_token_t *root = *tok;
+ *tok = (*tok)->next;
+ root->left = parse_expr (qparser, prec, tok);
+ return root;
+ }
+
+ return parse_prob (qparser, prec, tok);
+ }
+ if (bprec == 5)
+ return parse_term (qparser, prec, tok);
+
+ _notmuch_token_t *left = parse_expr (qparser, prec + 1, tok);
+ while ((bprec == 0 && (*tok)->type == TOK_OR) ||
+ (bprec == 1 && (*tok)->type == TOK_XOR) ||
+ (bprec == 2 && ((*tok)->type == TOK_AND ||
+ (*tok)->type == TOK_NOT)) ||
+ (bprec == 4 && ((*tok)->type == TOK_NEAR ||
+ (*tok)->type == TOK_ADJ))) {
+ _notmuch_token_t *root = *tok;
+ if (root->type == TOK_NOT) {
+ /* Replace "x NOT y" with (AND x (NOT y)) by inserting an
+ * AND operator and leaning on the unary NOT rule. */
+ root = _notmuch_token_create (qparser, TOK_AND, NULL);
+ } else {
+ *tok = (*tok)->next;
+ }
+
+ /* Xapian treats x AND -y as x AND NOT y, which affects
+ * precedence. */
+ if (root->type == TOK_AND && (*tok)->type == TOK_HATE)
+ (*tok)->type = TOK_NOT;
+
+ _notmuch_token_t *right = parse_expr (qparser, prec + 1, tok);
+ if (left && right) {
+ root->left = left;
+ root->right = right;
+ } else {
+ /* Left or right was empty. This may be a syntax error
+ * like an omitted expression, or an empty expression. */
+ root = left ? left : right;
+ }
+ left = root;
+ }
+ return left;
+}
+
+static _notmuch_token_t *
+parse (_notmuch_qparser_t *qparser, _notmuch_token_t *toks)
+{
+ _notmuch_token_t *root = parse_expr (qparser, 0, &toks);
+ if (toks->type != TOK_END)
+ INTERNAL_ERROR ("Token stream not fully consumed: %s",
+ _notmuch_token_show_list (qparser, toks));
+ return root;
+}
+
+static char *
+generate_term (const void *ctx, const char *term, const char *prefix)
+{
+ notmuch_bool_t colon = FALSE;
+ if (isupper (term[0]) && strlen (prefix) > 1)
+ colon = TRUE;
+ return talloc_asprintf (ctx, "%s%s%s", prefix, colon ? ":" : "", term);
+}
+
+static Xapian::Query
+generate_terms (struct _notmuch_generate_state *s, const char *text,
+ const char *prefix)
+{
+ Xapian::TermGenerator tg;
+ Xapian::Document doc;
+ Xapian::TermIterator it, end;
+ Xapian::PositionIterator pit, pend;
+ Xapian::Query *qs, q;
+ unsigned int nterms = 0;
+
+ if (prefix)
+ tg.index_text (text, 1, prefix);
+ else
+ tg.index_text (text);
+ doc = tg.get_document ();
+
+ /* Find the highest positioned term. Positions are 1-based. */
+ end = doc.termlist_end ();
+ for (it = doc.termlist_begin (); it != end; ++it) {
+ pend = it.positionlist_end ();
+ for (pit = it.positionlist_begin (); pit != pend; ++pit) {
+ if (*pit > nterms)
+ nterms = *pit;
+ }
+ }
+ if (nterms == 0)
+ return Xapian::Query ();
+
+ /* Extract terms */
+ qs = new Xapian::Query[nterms];
+ for (it = doc.termlist_begin (); it != end; ++it) {
+ pend = it.positionlist_end ();
+ for (pit = it.positionlist_begin (); pit != pend; ++pit)
+ qs[*pit - 1] = Xapian::Query (*it, 1, s->termpos + *pit - 1);
+ }
+ s->termpos += nterms;
+
+ /* Build query */
+ q = Xapian::Query (Xapian::Query::OP_PHRASE, qs, qs + nterms, nterms);
+ delete [] qs;
+ return q;
+}
+
+static Xapian::Query
+generate (struct _notmuch_generate_state *s, _notmuch_token_t *root)
+{
+ using Xapian::Query;
+ Query l, r;
+ Query::op op;
+
+ if (!root)
+ return Query ();
+
+ /* The tricky part here is that generate is allowed to return a
+ * empty query, indicating that the user's query cannot be
+ * expressed. For example, the term "#" is an empty query. */
+
+ switch (root->type) {
+ case TOK_AND:
+ if (root->left->type == TOK_NOT && root->right->type != TOK_NOT) {
+ _notmuch_token_t *tmp = root->left;
+ root->left = root->right;
+ root->right = tmp;
+ }
+ l = generate (s, root->left);
+ if (l.empty()) {
+ return generate (s, root->right);
+ } else if (root->right->type == TOK_NOT) {
+ r = generate (s, root->right->left);
+ op = Query::OP_AND_NOT;
+ } else if (root->right->type == TOK_FILTER) {
+ r = generate (s, root->right->left);
+ op = Query::OP_FILTER;
+ } else {
+ r = generate (s, root->right);
+ op = Query::OP_AND;
+ }
+ if (r.empty())
+ return l;
+ return Query (op, l, r);
+
+ case TOK_NOT:
+ l = generate (s, root->left);
+ if (l.empty())
+ return l;
+ return Query (Query::OP_AND_NOT, Query ("", 1, 0), l);
+
+ case TOK_FILTER:
+ l = generate(s, root->left);
+ if (l.empty())
+ return l;
+ return Query (Query::OP_SCALE_WEIGHT, l, 0.0);
+
+ case TOK_OR:
+ case TOK_XOR:
+ l = generate (s, root->left);
+ r = generate (s, root->right);
+ if (l.empty())
+ return r;
+ if (r.empty())
+ return l;
+ return Query (root->type == TOK_OR ? Query::OP_OR : Query::OP_XOR,
+ l, r);
+
+ case TOK_ADJ:
+ case TOK_NEAR:
+ {
+ /* XXX This differs from Xapian. Xapian treats ADJ and NEAR
+ * as n-ary operators and constructs the whole list of terms
+ * to be searched within the largest specified window. */
+ Query subs[2];
+ subs[0] = Query (generate (s, root->left));
+ subs[1] = Query (generate (s, root->right));
+ if (subs[0].empty())
+ return subs[1];
+ if (subs[1].empty())
+ return subs[0];
+ return Query (root->type == TOK_ADJ ? Query::OP_PHRASE : Query::OP_NEAR,
+ subs, subs + 2, root->distance + 1);
+ }
+
+ case TOK_PREFIX:
+ return generate (s, root->left);
+
+ case TOK_TERMS:
+ return generate_terms (s, root->text, root->prefix);
+
+ case TOK_LIT:
+ return Query (generate_term (s, root->text, root->prefix));
+
+ case TOK_LOVE:
+ case TOK_HATE:
+ case TOK_BRA:
+ case TOK_KET:
+ case TOK_END:
+ INTERNAL_ERROR ("Illegal token type %s in IR", token_types[root->type]);
+ }
+ /* We leave this outside the switch so the compiler will warn us
+ * if we missed a token type. */
+ INTERNAL_ERROR ("Illegal token type %d in IR", root->type);
+ return Xapian::Query ();
+}
+
+static int
+_notmuch_qparser_destructor (_notmuch_qparser_t *qparser)
+{
+ g_hash_table_unref (qparser->prefixes);
+ return 0;
+}
+
+_notmuch_qparser_t *
+_notmuch_qparser_create (const void *ctx, notmuch_database_t *notmuch)
+{
+ _notmuch_qparser_t *qparser = talloc (ctx, _notmuch_qparser_t);
+ if (!qparser)
+ return NULL;
+ qparser->prefixes = NULL;
+ talloc_set_destructor (qparser, _notmuch_qparser_destructor);
+
+ qparser->notmuch = notmuch;
+ qparser->prefixes = g_hash_table_new (g_str_hash, g_str_equal);
+ qparser->transforms = NULL;
+ qparser->transformsTail = &qparser->transforms;
+
+ return qparser;
+}
+
+void
+_notmuch_qparser_add_prefix (_notmuch_qparser_t *qparser,
+ const char *prefix, notmuch_bool_t literal,
+ notmuch_bool_t boolean)
+{
+ int flags = ((literal ? PREFIX_LITERAL : 0) |
+ (boolean ? PREFIX_BOOL : PREFIX_PROB));
+ g_hash_table_insert (qparser->prefixes, talloc_strdup (qparser, prefix),
+ GINT_TO_POINTER (flags));
+}
+
+void
+_notmuch_qparser_add_transform (_notmuch_qparser_t *qparser,
+ _notmuch_token_t *(*transform) (
+ _notmuch_token_t *ast, void *opaque),
+ void *opaque)
+{
+ struct _notmuch_qparser_transform *t;
+ t = talloc (qparser, struct _notmuch_qparser_transform);
+ t->next = NULL;
+ t->transform = transform;
+ t->opaque = opaque;
+ *qparser->transformsTail = t;
+ qparser->transformsTail = &t->next;
+}
+
+struct _notmuch_transform_prefix_info {
+ char *field, *prefix;
+};
+
+static _notmuch_token_t *
+transform_prefix_rec (_notmuch_token_t *root, char *field, char *prefix,
+ notmuch_bool_t active)
+{
+ if (!root)
+ return NULL;
+ if (root->type == TOK_PREFIX) {
+ active = (strcmp (field, root->text) == 0);
+ } else if (root->type == TOK_TERMS || root->type == TOK_LIT) {
+ if (active)
+ root->prefix = prefix;
+ }
+ transform_prefix_rec (root->left, field, prefix, active);
+ transform_prefix_rec (root->right, field, prefix, active);
+ return root;
+}
+
+static _notmuch_token_t *
+transform_prefix (_notmuch_token_t *root, void *opaque)
+{
+ struct _notmuch_transform_prefix_info *info =
+ (struct _notmuch_transform_prefix_info*)opaque;
+ return transform_prefix_rec (root, info->field, info->prefix, FALSE);
+}
+
+void
+_notmuch_qparser_add_db_prefix (_notmuch_qparser_t *qparser,
+ const char *field, const char *prefix,
+ notmuch_bool_t boolean)
+{
+ struct _notmuch_transform_prefix_info *info;
+ info = talloc (qparser, struct _notmuch_transform_prefix_info);
+ info->field = talloc_strdup (info, field);
+ info->prefix = talloc_strdup (info, prefix);
+ _notmuch_qparser_add_prefix (qparser, field, boolean, boolean);
+ _notmuch_qparser_add_transform (qparser, transform_prefix, info);
+}
+
+_notmuch_token_t *
+_notmuch_qparser_lex (_notmuch_qparser_t *qparser, const char *query)
+{
+ return lex (qparser, query);
+}
+
+_notmuch_token_t *
+_notmuch_qparser_parse (_notmuch_qparser_t *qparser, const char *query)
+{
+ _notmuch_token_t *toks = lex (qparser, query);
+ return parse (qparser, toks);
+}
+
+_notmuch_token_t *
+_notmuch_qparser_transform (_notmuch_qparser_t *qparser, _notmuch_token_t *root)
+{
+ struct _notmuch_qparser_transform *t;
+ for (t = qparser->transforms; t; t = t->next)
+ root = t->transform (root, t->opaque);
+ return root;
+}
+
+Xapian::Query
+_notmuch_qparser_generate (_notmuch_qparser_t *qparser, _notmuch_token_t *root)
+{
+ struct _notmuch_generate_state *s;
+ s = talloc (qparser, struct _notmuch_generate_state);
+ s->qparser = qparser;
+ s->termpos = 1;
+ Xapian::Query query = generate (s, root);
+ talloc_free (s);
+ if (query.empty())
+ /* Return all documents */
+ return Xapian::Query ("", 1, 0);
+ return query;
+}
--
1.7.2.3
More information about the notmuch
mailing list