[PATCH v2 0/5] Use Xapian query syntax for batch-tag dump/restore

Austin Clements amdragon at MIT.EDU
Tue Dec 25 19:48:38 PST 2012


This series obsoletes

  id:1356415076-5692-1-git-send-email-amdragon at mit.edu

In addition to incorporating all of David's suggestions, this version
reworks the boolean term parsing so that it handles only the subset of
quoting syntax produced by make_boolean_term (which also happens to be
all that the man page describes for the format).  The diff from v1 is
below.

diff --git a/man/man1/notmuch-restore.1 b/man/man1/notmuch-restore.1
index 6bba628..78fef52 100644
--- a/man/man1/notmuch-restore.1
+++ b/man/man1/notmuch-restore.1
@@ -57,10 +57,8 @@ sup calls them).
 The
 .B batch-tag
 dump format is intended to more robust against malformed message-ids
-and tags containing whitespace or non-\fBascii\fR(7) characters.  This
-format hex-escapes all characters those outside of a small character
-set, intended to be suitable for e.g. pathnames in most UNIX-like
-systems.
+and tags containing whitespace or non-\fBascii\fR(7) characters.  See
+\fBnotmuch-dump\fR(1) for details on this format.
 
 .B "notmuch restore"
 updates the maildir flags according to tag changes if the
diff --git a/test/dump-restore b/test/dump-restore
index aecc393..f9ae5b3 100755
--- a/test/dump-restore
+++ b/test/dump-restore
@@ -200,6 +200,8 @@ a
 # the next non-comment line should report an an empty tag error for
 # batch tagging, but not for restore
 + +e -- id:20091117232137.GA7669 at griffis1.net
+# valid id, but warning about missing message
++e id:missing_message_id
 EOF
 
 cat <<EOF > EXPECTED
@@ -211,6 +213,7 @@ Warning: no query string after -- [+c +d --]
 Warning: hex decoding of tag %zz failed [+%zz -- id:whatever]
 Warning: cannot parse query: id:"
 Warning: not an id query: tag:abc
+Warning: cannot apply tags to missing message: missing_message_id
 EOF
 
 test_expect_equal_file EXPECTED OUTPUT
diff --git a/test/random-corpus.c b/test/random-corpus.c
index d0e3e8f..8b7748e 100644
--- a/test/random-corpus.c
+++ b/test/random-corpus.c
@@ -96,9 +96,9 @@ random_utf8_string (void *ctx, size_t char_count)
 	    buf = talloc_realloc (ctx, buf, gchar, buf_size);
 	}
 
-	randomchar = random_unichar ();
-	if (randomchar == '\n')
-	    randomchar = 'x';
+	do {
+	    randomchar = random_unichar ();
+	} while (randomchar == '\n');
 
 	written = g_unichar_to_utf8 (randomchar, buf + offset);
 
diff --git a/util/string-util.c b/util/string-util.c
index eaa6c99..db01b4b 100644
--- a/util/string-util.c
+++ b/util/string-util.c
@@ -43,9 +43,11 @@ make_boolean_term (void *ctx, const char *prefix, const char *term,
     size_t needed = 3;
     int need_quoting = 0;
 
-    /* Do we need quoting? */
+    /* Do we need quoting?  To be paranoid, we quote anything
+     * containing a quote, even though it only matters at the
+     * beginning, and anything containing non-ASCII text. */
     for (in = term; *in && !need_quoting; in++)
-	if (*in <= ' ' || *in == ')' || *in == '"')
+	if (*in <= ' ' || *in == ')' || *in == '"' || (unsigned char)*in > 127)
 	    need_quoting = 1;
 
     if (need_quoting)
@@ -95,21 +97,6 @@ make_boolean_term (void *ctx, const char *prefix, const char *term,
     return 0;
 }
 
-static int
-consume_double_quote (const char **str)
-{
-    if (**str == '"') {
-	++*str;
-	return 1;
-    } else if (strncmp(*str, "\xe2\x80\x9c", 3) == 0 || /* UTF8 0x201c */
-	       strncmp(*str, "\xe2\x80\x9d", 3) == 0) { /* UTF8 0x201d */
-	*str += 3;
-	return 3;
-    } else {
-	return 0;
-    }
-}
-
 int
 parse_boolean_term (void *ctx, const char *str,
 		    char **prefix_out, char **term_out)
@@ -123,28 +110,31 @@ parse_boolean_term (void *ctx, const char *str,
     *prefix_out = talloc_strndup (ctx, str, pos - str);
     ++pos;
 
-    /* Implement Xapian's boolean term de-quoting.  This is a nearly
-     * direct translation of QueryParser::Internal::parse_query. */
-    pos = *term_out = talloc_strdup (ctx, pos);
-    if (consume_double_quote (&pos)) {
-	char *out = talloc_strdup (ctx, pos);
-	pos = *term_out = out;
-	while (1) {
-	    if (! *pos) {
-		/* Premature end of string */
-		goto FAIL;
-	    } else if (*pos == '"') {
-		if (*++pos != '"')
+    /* Implement de-quoting compatible with make_boolean_term. */
+    if (*pos == '"') {
+	char *out = talloc_strdup (ctx, pos + 1);
+	int closed = 0;
+	/* Find the closing quote and un-double doubled internal
+	 * quotes. */
+	for (pos = *term_out = out; *pos; ) {
+	    if (*pos == '"') {
+		++pos;
+		if (*pos != '"') {
+		    /* Found the closing quote. */
+		    closed = 1;
 		    break;
-	    } else if (consume_double_quote (&pos)) {
-		break;
+		}
 	    }
 	    *out++ = *pos++;
 	}
-	if (*pos)
+	/* Did the term terminate without a closing quote or is there
+	 * trailing text after the closing quote? */
+	if (!closed || *pos)
 	    goto FAIL;
 	*out = '\0';
     } else {
+	*term_out = talloc_strdup (ctx, pos);
+	/* Check for text after the boolean term. */
 	while (*pos > ' ' && *pos != ')')
 	    ++pos;
 	if (*pos)
diff --git a/util/string-util.h b/util/string-util.h
index e4e4c42..aff2d65 100644
--- a/util/string-util.h
+++ b/util/string-util.h
@@ -28,9 +28,9 @@ char *strtok_len (char *s, const char *delim, size_t *len);
 int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
 		       char **buf, size_t *len);
 
-/* Parse a boolean term query, returning the prefix in *prefix_out and
- * the term in *term_out.  *prefix_out and *term_out will be talloc'd
- * with context ctx.
+/* Parse a boolean term query produced by make_boolean_term, returning
+ * the prefix in *prefix_out and the term in *term_out.  *prefix_out
+ * and *term_out will be talloc'd with context ctx.
  *
  * Return: 0 on success, non-zero on parse error (including trailing
  * data in str).




More information about the notmuch mailing list