[PATCH 2/8] Parse NEAR and ADJ operators.

Austin Clements amdragon at MIT.EDU
Sun Jan 16 00:10:52 PST 2011


NEAR and ADJ are treated as n-ary operators where all operands must be
terms, which fits with Xapian's own restrictions on near/adj queries.
This implementation is slightly more lenient than Xapian's in that it
allows phrases (both quoted and implicit) as operands and folds the
phrase terms in as operands to the near/adj operator.
---
 lib/notmuch-private.h |   10 +++++
 lib/qparser.cc        |  103 ++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 107 insertions(+), 6 deletions(-)

diff --git a/lib/notmuch-private.h b/lib/notmuch-private.h
index 06239b9..a42afd6 100644
--- a/lib/notmuch-private.h
+++ b/lib/notmuch-private.h
@@ -518,6 +518,12 @@ enum _notmuch_token_type {
     TOK_LOVE, TOK_HATE, TOK_BRA, TOK_KET,
     /* Binary operators.  These should have left and right children. */
     TOK_AND, TOK_OR, TOK_XOR,
+    /* n-ary operators.  In the AST, these are represented like lists
+     * of TOK_TERMS, with the left child being a TOK_TERMS and the
+     * right being another TOK_ADJ/TOK_NEAR.  The final right must be
+     * NULL.  Both tokens can also carry distances; the highest
+     * distance in the chain will be used. */
+    TOK_ADJ, TOK_NEAR,
     /* Unary operators.  These have only a left child.  Xapian::Query
      * has no pure NOT operator, so the generator treats NOT as the
      * child of an AND specially, and otherwise represents it as
@@ -555,6 +561,10 @@ typedef struct _notmuch_token {
     enum _notmuch_token_type type;
     const char *text;
 
+    /* For TOK_ADJ and TOK_NEAR, this specifies the distance
+     * argument. */
+    int distance;
+
     /* For TOK_PREFIX, the flags of this prefix. */
     int prefixFlags;
 
diff --git a/lib/qparser.cc b/lib/qparser.cc
index b86a445..5a6d39b 100644
--- a/lib/qparser.cc
+++ b/lib/qparser.cc
@@ -40,7 +40,6 @@
  * _notmuch_qparser_generate to perform step 4.
  *
  * Still missing from this implementation:
- * * NEAR/ADJ operators
  * * Stemming - The stemming should probably be marked on TOK_TERMS
  *   tokens.  Ideally, we can just pass this to the term generator.
  * * Wildcard queries - This should be available in the IR so it's
@@ -101,14 +100,14 @@ struct _notmuch_generate_state {
 
 static const char *token_types[] = {
     "LOVE", "HATE", "BRA", "KET",
-    "AND", "OR", "XOR",
+    "AND", "OR", "XOR", "ADJ", "NEAR",
     "NOT", "FILTER", "PREFIX",
     "TERMS", "LIT", "ERROR", "END"
 };
 
 /* The distinguished end token.  This simplifies the parser since it
  * never has to worry about dereferencing next. */
-static _notmuch_token_t tok_end = {TOK_END, NULL, FALSE, NULL,
+static _notmuch_token_t tok_end = {TOK_END, NULL, -1, FALSE, NULL,
 				   &tok_end, NULL, NULL};
 
 _notmuch_token_t *
@@ -118,6 +117,7 @@ _notmuch_token_create_op (const void *ctx, enum _notmuch_token_type type,
     _notmuch_token_t *tok = talloc (ctx, struct _notmuch_token);
     memset (tok, 0, sizeof (*tok));
     tok->type = type;
+    tok->distance = -1;
     tok->left = left;
     tok->right = right;
     return tok;
@@ -135,6 +135,7 @@ _notmuch_token_create_term (const void *ctx, enum _notmuch_token_type type,
 char *
 _notmuch_token_show (const void *ctx, _notmuch_token_t *tok)
 {
+    char dist[32] = "";
     int ispre = tok->type == TOK_PREFIX;
 
     if ((unsigned)tok->type > TOK_END)
@@ -147,9 +148,11 @@ _notmuch_token_show (const void *ctx, _notmuch_token_t *tok)
     else if (tok->type == TOK_ERROR)
 	return talloc_asprintf (ctx, "ERROR/\"%s\"", tok->text);
 
-    return talloc_asprintf (ctx, "%s%s%s",
+    if (tok->distance != -1)
+	sprintf(dist, "/%d", tok->distance);
+    return talloc_asprintf (ctx, "%s%s%s%s",
 			    token_types[tok->type],
-			    ispre ? "/" : "",
+			    dist, ispre ? "/" : "",
 			    ispre ? tok->text : "");
 }
 
@@ -308,10 +311,31 @@ static notmuch_bool_t
 lex_operator (struct _notmuch_lex_state *s, char *term,
 	      const char *op, enum _notmuch_token_type type)
 {
+    size_t oplen = strlen (op);
+
     if (strcasecmp (term, op) == 0) {
 	lex_emit (s, type, term);
 	return true;
     }
+
+    /* Check for ADJ or NEAR with argument.  Our parsing of this is
+     * slightly incompatible with Xapian, but I believe this to be a
+     * bug in Xapian.  Xapian parses "x NEAR/y z" as three term
+     * phrases, "x", "near y", and "z", like we do.  However, it
+     * behaves differently if the bad NEAR operator is at the end of
+     * the query, parsing "x NEAR/y" like "x NEAR y". */
+    if ((type == TOK_ADJ || type == TOK_NEAR) &&
+	strncasecmp (term, op, oplen) == 0 &&
+	term[oplen] == '/') {
+	/* Try to parse the distance argument */
+	char *end;
+	int distance = strtol (&term[oplen + 1], &end, 10);
+	if (distance && !*end) {
+	    struct _notmuch_token *tok = lex_emit (s, type, term);
+	    tok->distance = distance;
+	    return true;
+	}
+    }
     
     return false;
 }
@@ -403,7 +427,9 @@ lex (const void *ctx, _notmuch_qparser_t *qparser, const char *query)
 	if (lex_operator (s, term, "and",  TOK_AND) ||
 	    lex_operator (s, term, "not",  TOK_NOT) ||
 	    lex_operator (s, term, "xor",  TOK_XOR) ||
-	    lex_operator (s, term, "or",   TOK_OR))
+	    lex_operator (s, term, "or",   TOK_OR)  ||
+	    lex_operator (s, term, "adj",  TOK_ADJ) ||
+	    lex_operator (s, term, "near", TOK_NEAR))
 	    continue;
 
 	/* Must be a term */
@@ -495,6 +521,8 @@ parse_prob (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)
 	case TOK_BRA:
 	case TOK_TERMS:
 	case TOK_LIT:
+	case TOK_ADJ:
+	case TOK_NEAR:
 	    sub = parse_expr (s, prec + 1, tok);
 	    add_to_query (s->ctx, &probs, TOK_AND, sub);
 	    break;
@@ -529,6 +557,37 @@ parse_prob (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)
 }
 
 static _notmuch_token_t *
+parse_near (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)
+{
+    _notmuch_token_type first = (*tok)->type, conj = (*tok)->next->type;
+    _notmuch_token_t *root = parse_expr (s, prec + 1, tok);
+    _notmuch_token_t **tail = NULL;
+
+    /* XXX Xapian allows prefixed terms in near/adj. */
+    if (first != TOK_TERMS || !(conj == TOK_NEAR || conj == TOK_ADJ))
+	return root;
+
+    while ((*tok)->type == conj && (*tok)->next->type == TOK_TERMS) {
+	if (!tail) {
+	    /* First operator.  Create the list root. */
+	    _notmuch_token_t *nroot =
+		_notmuch_token_create_op (s->ctx, conj, root, NULL);
+	    root = nroot;
+	    tail = &nroot->right;
+	}
+
+	/* Append the operator and term token to the list */
+	*tail = *tok;
+	*tok = (*tok)->next;
+	(*tail)->left = *tok;
+	*tok = (*tok)->next;
+	tail = &(*tail)->right;
+    }
+
+    return root;
+}
+
+static _notmuch_token_t *
 parse_term (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)
 {
     _notmuch_token_t *sub;
@@ -596,6 +655,8 @@ parse_expr (struct _notmuch_parse_state *s, int prec, _notmuch_token_t **tok)
 	return parse_prob (s, prec, tok);
     }
     if (bprec == 4)
+	return parse_near (s, prec, tok);
+    if (bprec == 5)
 	return parse_term (s, prec, tok);
 
     _notmuch_token_t *left = parse_expr (s, prec + 1, tok);
@@ -756,6 +817,36 @@ generate (struct _notmuch_generate_state *s, _notmuch_token_t *root)
 	return Query (root->type == TOK_OR ? Query::OP_OR : Query::OP_XOR,
 		      l, r);
 
+    case TOK_ADJ:
+    case TOK_NEAR:
+    {
+	_notmuch_token_t *node;
+	int dist = -1;
+	char *terms = talloc_strdup (root, "");
+	/* Concatenate the operands and get the highest distance */
+	for (node = root; node; node = node->right) {
+	    if (node->left->type != TOK_TERMS)
+		INTERNAL_ERROR ("Illegal token in NEAR/ADJ: %s",
+				_notmuch_token_show (s->ctx, node->left));
+	    if (node->left->prefix)
+		INTERNAL_ERROR ("Prefixes not supported in NEAR/ADJ");
+
+	    terms = talloc_asprintf_append (terms, "%s ", node->left->text);
+	    if (node->distance > dist)
+		dist = node->distance;
+	}
+	/* The default distance is 10. */
+	if (dist == -1)
+	    dist = 10;
+	/* Generate a PHRASE or NEAR query.  If there are implicit
+	 * phrases, they will be split out and treated like any other
+	 * term in the operand list. */
+	op = root->type == TOK_ADJ ? Query::OP_PHRASE : Query::OP_NEAR;
+	l = generate_terms (s, terms, NULL, dist - 1, op);
+	talloc_free (terms);
+	return l;
+    }
+
     case TOK_PREFIX:
 	return generate (s, root->left);
 
-- 
1.7.2.3



More information about the notmuch mailing list