[PATCH 3/8] Parse wildcard queries.

Austin Clements amdragon at MIT.EDU
Sun Jan 16 00:10:53 PST 2011


This implements support in the lexer and generator for wildcard terms,
expanding them into synonym queries the way Xapian does.  Since this
expansion is performed by the generator, it's easy to take advantage
of in query transforms.

With this, * works anywhere in the query, so we'll no longer need
special case code for '*' queries in query.cc.
---
 TODO                  |    3 --
 lib/notmuch-private.h |    6 +++-
 lib/qparser.cc        |   75 +++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 65 insertions(+), 19 deletions(-)

diff --git a/TODO b/TODO
index 438f7aa..10c8c12 100644
--- a/TODO
+++ b/TODO
@@ -228,9 +228,6 @@ for all messages with the word "to". If we don't provide the first
 behavior, perhaps we should exit on an error when a configured prefix
 is provided with no value?
 
-Support "*" in all cases and not just as a special case. That is, "* "
-should also work, as well as "* and tag:inbox".
-
 Implement a syntax for requesting set-theoertic operations on results
 of multiple searches. For example, I would like to do:
 
diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index a42afd6..eb346ea 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -522,7 +522,8 @@ enum _notmuch_token_type {
      * of TOK_TERMS, with the left child being a TOK_TERMS and the
      * right being another TOK_ADJ/TOK_NEAR.  The final right must be
      * NULL.  Both tokens can also carry distances; the highest
-     * distance in the chain will be used. */
+     * distance in the chain will be used.  The operand terms may not
+     * be prefixed or wildcards. */
     TOK_ADJ, TOK_NEAR,
     /* Unary operators.  These have only a left child.  Xapian::Query
      * has no pure NOT operator, so the generator treats NOT as the
@@ -572,6 +573,9 @@ typedef struct _notmuch_token {
      * generating database terms.  This must be filled in a
      * transformation pass. */
     const char *prefix;
+    /* For TOK_TERMS and TOK_LIT, indicates that this token should
+     * match any terms prefixed with text. */
+    notmuch_bool_t wildcard;
 
     /* Link in the lexer token list. */
     struct _notmuch_token *next;
diff --git a/lib/qparser.cc b/lib/qparser.cc
index 5a6d39b..bd0296a 100644
--- a/lib/qparser.cc
+++ b/lib/qparser.cc
@@ -33,7 +33,8 @@
  * 3) We transform the parse tree, running a sequence of
  *    caller-provided transformation functions over the tree.
  * 4) We generate the Xapian::Query from the transformed IR.  This
- *    step also splits phrase tokens into multiple query terms.
+ *    step also splits phrase tokens into multiple query terms and
+ *    expands wildcard terms.
  *
  * To use the query parser, call _notmuch_qparser_parse to perform
  * steps 1 and 2, _notmuch_qparser_transform to perform step 3, and
@@ -42,10 +43,7 @@
  * Still missing from this implementation:
  * * Stemming - The stemming should probably be marked on TOK_TERMS
  *   tokens.  Ideally, we can just pass this to the term generator.
- * * Wildcard queries - This should be available in the IR so it's
- *   easy to generate wildcard queries in a transformer.
  * * Value ranges in the IR
- * * Queries "" and "*"
  */
 
 /* XXX notmuch currently registers "tag" as an exclusive boolean
@@ -55,7 +53,7 @@
 #include "notmuch-private.h"
 #include "database-private.h"
 
-#include <glib.h>		/* GHashTable */
+#include <glib.h>		/* GHashTable, GPtrArray */
 
 struct _notmuch_qparser {
     notmuch_database_t *notmuch;
@@ -107,7 +105,7 @@ static const char *token_types[] = {
 
 /* The distinguished end token.  This simplifies the parser since it
  * never has to worry about dereferencing next. */
-static _notmuch_token_t tok_end = {TOK_END, NULL, -1, FALSE, NULL,
+static _notmuch_token_t tok_end = {TOK_END, NULL, -1, FALSE, NULL, FALSE,
 				   &tok_end, NULL, NULL};
 
 _notmuch_token_t *
@@ -142,9 +140,11 @@ _notmuch_token_show (const void *ctx, _notmuch_token_t *tok)
 	return talloc_asprintf (ctx, "<bad type %d>", tok->type);
 
     if (tok->type == TOK_TERMS)
-	return talloc_asprintf (ctx, "\"%s\"", tok->text);
+	return talloc_asprintf (ctx, "\"%s\"%s", tok->text,
+				tok->wildcard ? "*" : "");
     else if (tok->type == TOK_LIT)
-	return talloc_asprintf (ctx, "'%s'", tok->text);
+	return talloc_asprintf (ctx, "'%s'%s", tok->text,
+				tok->wildcard ? "*" : "");
     else if (tok->type == TOK_ERROR)
 	return talloc_asprintf (ctx, "ERROR/\"%s\"", tok->text);
 
@@ -348,7 +348,7 @@ lex (const void *ctx, _notmuch_qparser_t *qparser, const char *query)
     struct _notmuch_lex_state *s = &state;
     struct _notmuch_token *tok;
     char *term;
-    int prefixFlags, literal;
+    int prefixFlags, literal, wildcard, n;
 
     while (it != end) {
 	unsigned ch;
@@ -433,7 +433,15 @@ lex (const void *ctx, _notmuch_qparser_t *qparser, const char *query)
 	    continue;
 
 	/* Must be a term */
-	lex_emit (s, TOK_TERMS, term);
+	wildcard = 0;
+	n = strlen(term);
+	if (n && term[n-1] == '*') {
+	    /* Wildcard */
+	    wildcard = 1;
+	    term[n-1] = 0;
+	}
+	tok = lex_emit (s, TOK_TERMS, term);
+	tok->wildcard = wildcard;
     }
 
     return s->head;
@@ -713,8 +721,39 @@ generate_term (struct _notmuch_generate_state *s, const char *term,
 }
 
 static Xapian::Query
+generate_wildcard (struct _notmuch_generate_state *s, const char *term)
+{
+    GPtrArray *subarr;
+    Xapian::Query query, **qs;
+    Xapian::Database *db = s->qparser->notmuch->xapian_db;
+    Xapian::TermIterator i = db->allterms_begin (term),
+	end = db->allterms_end (term);
+
+    subarr = g_ptr_array_new ();
+    for (; i != end; i++)
+	g_ptr_array_add (subarr, new Xapian::Query (*i, 1, s->termpos));
+    /* If the term didn't expand, then return a query over the
+     * unexpanded term, which is guaranteed not to match anything.
+     * We can't simply return an empty query because Xapian treats
+     * those specially. */
+    if (!subarr->len) {
+	g_ptr_array_free (subarr, TRUE);
+	return Xapian::Query (term);
+    }
+
+    s->termpos++;
+    qs = (Xapian::Query**)subarr->pdata;
+    query = Xapian::Query (Xapian::Query::OP_SYNONYM, qs, qs + subarr->len);
+    for (unsigned int i = 0; i < subarr->len; ++i)
+	delete qs[i];
+    g_ptr_array_free (subarr, TRUE);
+    return query;
+}
+
+static Xapian::Query
 generate_terms (struct _notmuch_generate_state *s, const char *text,
-		const char *prefix, int distance, Xapian::Query::op op)
+		const char *prefix, notmuch_bool_t wildcard, int distance,
+		Xapian::Query::op op)
 {
     Xapian::TermGenerator tg;
     Xapian::Document doc;
@@ -740,6 +779,8 @@ generate_terms (struct _notmuch_generate_state *s, const char *text,
     }
     if (nterms == 0)
 	return Xapian::Query ();
+    if (nterms == 1 && wildcard)
+	return generate_wildcard (s, text);
 
     /* Extract terms */
     qs = new Xapian::Query[nterms];
@@ -762,6 +803,7 @@ generate (struct _notmuch_generate_state *s, _notmuch_token_t *root)
     using Xapian::Query;
     Query l, r;
     Query::op op;
+    char *term;
 
     if (!root)
 	return Query ();
@@ -842,7 +884,7 @@ generate (struct _notmuch_generate_state *s, _notmuch_token_t *root)
 	 * phrases, they will be split out and treated like any other
 	 * term in the operand list. */
 	op = root->type == TOK_ADJ ? Query::OP_PHRASE : Query::OP_NEAR;
-	l = generate_terms (s, terms, NULL, dist - 1, op);
+	l = generate_terms (s, terms, NULL, FALSE, dist - 1, op);
 	talloc_free (terms);
 	return l;
     }
@@ -851,11 +893,14 @@ generate (struct _notmuch_generate_state *s, _notmuch_token_t *root)
 	return generate (s, root->left);
 
     case TOK_TERMS:
-	return generate_terms (s, root->text, root->prefix, 0,
-			       Query::OP_PHRASE);
+	return generate_terms (s, root->text, root->prefix, root->wildcard,
+			       0, Query::OP_PHRASE);
 
     case TOK_LIT:
-	return Query (generate_term (s, root->text, root->prefix));
+	term = generate_term (s, root->text, root->prefix);
+	if (root->wildcard)
+	    return generate_wildcard (s, term);
+	return Query (term);
 
     case TOK_ERROR:
 	if (!s->error)
-- 
1.7.2.3



More information about the notmuch mailing list