[PATCH v2 0/5] Use Xapian query syntax for batch-tag dump/restore
Austin Clements
amdragon at MIT.EDU
Tue Dec 25 19:48:38 PST 2012
This obsoletes
id:1356415076-5692-1-git-send-email-amdragon at mit.edu
In addition to incorporating all of David's suggestions, this reworks
the boolean term parsing so it only handles the subset of quoting
syntax used by make_boolean_term (which also happens to be all that we
described in the man page for the format). The diff from v1 is below.
diff --git a/man/man1/notmuch-restore.1 b/man/man1/notmuch-restore.1
index 6bba628..78fef52 100644
--- a/man/man1/notmuch-restore.1
+++ b/man/man1/notmuch-restore.1
@@ -57,10 +57,8 @@ sup calls them).
The
.B batch-tag
dump format is intended to more robust against malformed message-ids
-and tags containing whitespace or non-\fBascii\fR(7) characters. This
-format hex-escapes all characters those outside of a small character
-set, intended to be suitable for e.g. pathnames in most UNIX-like
-systems.
+and tags containing whitespace or non-\fBascii\fR(7) characters. See
+\fBnotmuch-dump\fR(1) for details on this format.
.B "notmuch restore"
updates the maildir flags according to tag changes if the
diff --git a/test/dump-restore b/test/dump-restore
index aecc393..f9ae5b3 100755
--- a/test/dump-restore
+++ b/test/dump-restore
@@ -200,6 +200,8 @@ a
# the next non-comment line should report an an empty tag error for
# batch tagging, but not for restore
+ +e -- id:20091117232137.GA7669 at griffis1.net
+# valid id, but warning about missing message
++e id:missing_message_id
EOF
cat <<EOF > EXPECTED
@@ -211,6 +213,7 @@ Warning: no query string after -- [+c +d --]
Warning: hex decoding of tag %zz failed [+%zz -- id:whatever]
Warning: cannot parse query: id:"
Warning: not an id query: tag:abc
+Warning: cannot apply tags to missing message: missing_message_id
EOF
test_expect_equal_file EXPECTED OUTPUT
diff --git a/test/random-corpus.c b/test/random-corpus.c
index d0e3e8f..8b7748e 100644
--- a/test/random-corpus.c
+++ b/test/random-corpus.c
@@ -96,9 +96,9 @@ random_utf8_string (void *ctx, size_t char_count)
buf = talloc_realloc (ctx, buf, gchar, buf_size);
}
- randomchar = random_unichar ();
- if (randomchar == '\n')
- randomchar = 'x';
+ do {
+ randomchar = random_unichar ();
+ } while (randomchar == '\n');
written = g_unichar_to_utf8 (randomchar, buf + offset);
diff --git a/util/string-util.c b/util/string-util.c
index eaa6c99..db01b4b 100644
--- a/util/string-util.c
+++ b/util/string-util.c
@@ -43,9 +43,11 @@ make_boolean_term (void *ctx, const char *prefix, const char *term,
size_t needed = 3;
int need_quoting = 0;
- /* Do we need quoting? */
+ /* Do we need quoting? To be paranoid, we quote anything
+ * containing a quote, even though it only matters at the
+ * beginning, and anything containing non-ASCII text. */
for (in = term; *in && !need_quoting; in++)
- if (*in <= ' ' || *in == ')' || *in == '"')
+ if (*in <= ' ' || *in == ')' || *in == '"' || (unsigned char)*in > 127)
need_quoting = 1;
if (need_quoting)
@@ -95,21 +97,6 @@ make_boolean_term (void *ctx, const char *prefix, const char *term,
return 0;
}
-static int
-consume_double_quote (const char **str)
-{
- if (**str == '"') {
- ++*str;
- return 1;
- } else if (strncmp(*str, "\xe2\x80\x9c", 3) == 0 || /* UTF8 0x201c */
- strncmp(*str, "\xe2\x80\x9d", 3) == 0) { /* UTF8 0x201d */
- *str += 3;
- return 3;
- } else {
- return 0;
- }
-}
-
int
parse_boolean_term (void *ctx, const char *str,
char **prefix_out, char **term_out)
@@ -123,28 +110,31 @@ parse_boolean_term (void *ctx, const char *str,
*prefix_out = talloc_strndup (ctx, str, pos - str);
++pos;
- /* Implement Xapian's boolean term de-quoting. This is a nearly
- * direct translation of QueryParser::Internal::parse_query. */
- pos = *term_out = talloc_strdup (ctx, pos);
- if (consume_double_quote (&pos)) {
- char *out = talloc_strdup (ctx, pos);
- pos = *term_out = out;
- while (1) {
- if (! *pos) {
- /* Premature end of string */
- goto FAIL;
- } else if (*pos == '"') {
- if (*++pos != '"')
+ /* Implement de-quoting compatible with make_boolean_term. */
+ if (*pos == '"') {
+ char *out = talloc_strdup (ctx, pos + 1);
+ int closed = 0;
+ /* Find the closing quote and un-double doubled internal
+ * quotes. */
+ for (pos = *term_out = out; *pos; ) {
+ if (*pos == '"') {
+ ++pos;
+ if (*pos != '"') {
+ /* Found the closing quote. */
+ closed = 1;
break;
- } else if (consume_double_quote (&pos)) {
- break;
+ }
}
*out++ = *pos++;
}
- if (*pos)
+ /* Did the term terminate without a closing quote or is there
+ * trailing text after the closing quote? */
+ if (!closed || *pos)
goto FAIL;
*out = '\0';
} else {
+ *term_out = talloc_strdup (ctx, pos);
+ /* Check for text after the boolean term. */
while (*pos > ' ' && *pos != ')')
++pos;
if (*pos)
diff --git a/util/string-util.h b/util/string-util.h
index e4e4c42..aff2d65 100644
--- a/util/string-util.h
+++ b/util/string-util.h
@@ -28,9 +28,9 @@ char *strtok_len (char *s, const char *delim, size_t *len);
int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
char **buf, size_t *len);
-/* Parse a boolean term query, returning the prefix in *prefix_out and
- * the term in *term_out. *prefix_out and *term_out will be talloc'd
- * with context ctx.
+/* Parse a boolean term query produced by make_boolean_term, returning
+ * the prefix in *prefix_out and the term in *term_out. *prefix_out
+ * and *term_out will be talloc'd with context ctx.
*
* Return: 0 on success, non-zero on parse error (including trailing
* data in str).
More information about the notmuch
mailing list