[PATCH v4 1/5] util: Factor out boolean term quoting routine
Austin Clements
amdragon at MIT.EDU
Thu Jan 3 23:26:53 PST 2013
Quoth Jani Nikula on Jan 03 at 5:48 pm:
> On Mon, 31 Dec 2012, Austin Clements <amdragon at MIT.EDU> wrote:
> > From: Austin Clements <amdragon at MIT.EDU>
> >
> > This is now a generic boolean term quoting function. It performs
> > minimal quoting to produce user-friendly queries.
> >
> > This could live in tag-util as well, but it is really nothing specific
> > to tags (although the conventions are specific to Xapian).
> >
> > The API is changed from "caller-allocates" to "readline-like". The
> > scan for max tag length is pushed down into the quoting routine.
> > Furthermore, this now combines the term prefix with the quoted term;
> > arguably this is just as easy to do in the caller, but this will
> > nicely parallel the boolean term parsing function to be introduced
> > shortly.
> >
> > This is an amalgamation of code written by David Bremner and myself.
> > ---
> > notmuch-tag.c | 48 ++++++++++++---------------------------
> > util/string-util.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++
> > util/string-util.h | 14 ++++++++++++
> > 3 files changed, 92 insertions(+), 34 deletions(-)
> >
> > diff --git a/notmuch-tag.c b/notmuch-tag.c
> > index 88d559b..fc9d43a 100644
> > --- a/notmuch-tag.c
> > +++ b/notmuch-tag.c
> > @@ -19,6 +19,7 @@
> > */
> >
> > #include "notmuch-client.h"
> > +#include "string-util.h"
> >
> > static volatile sig_atomic_t interrupted;
> >
> > @@ -35,25 +36,6 @@ handle_sigint (unused (int sig))
> > interrupted = 1;
> > }
> >
> > -static char *
> > -_escape_tag (char *buf, const char *tag)
> > -{
> > - const char *in = tag;
> > - char *out = buf;
> > -
> > - /* Boolean terms surrounded by double quotes can contain any
> > - * character. Double quotes are quoted by doubling them. */
> > - *out++ = '"';
> > - while (*in) {
> > - if (*in == '"')
> > - *out++ = '"';
> > - *out++ = *in++;
> > - }
> > - *out++ = '"';
> > - *out = 0;
> > - return buf;
> > -}
> > -
> > typedef struct {
> > const char *tag;
> > notmuch_bool_t remove;
> > @@ -71,25 +53,16 @@ _optimize_tag_query (void *ctx, const char *orig_query_string,
> > * parenthesize and the exclusion part of the query must not use
> > * the '-' operator (though the NOT operator is fine). */
> >
> > - char *escaped, *query_string;
> > + char *escaped = NULL;
> > + size_t escaped_len = 0;
> > + char *query_string;
> > const char *join = "";
> > - int i;
> > - unsigned int max_tag_len = 0;
> > + size_t i;
> >
> > /* Don't optimize if there are no tag changes. */
> > if (tag_ops[0].tag == NULL)
> > return talloc_strdup (ctx, orig_query_string);
> >
> > - /* Allocate a buffer for escaping tags. This is large enough to
> > - * hold a fully escaped tag with every character doubled plus
> > - * enclosing quotes and a NUL. */
> > - for (i = 0; tag_ops[i].tag; i++)
> > - if (strlen (tag_ops[i].tag) > max_tag_len)
> > - max_tag_len = strlen (tag_ops[i].tag);
> > - escaped = talloc_array (ctx, char, max_tag_len * 2 + 3);
> > - if (! escaped)
> > - return NULL;
> > -
> > /* Build the new query string */
> > if (strcmp (orig_query_string, "*") == 0)
> > query_string = talloc_strdup (ctx, "(");
> > @@ -97,10 +70,17 @@ _optimize_tag_query (void *ctx, const char *orig_query_string,
> > query_string = talloc_asprintf (ctx, "( %s ) and (", orig_query_string);
> >
> > for (i = 0; tag_ops[i].tag && query_string; i++) {
> > + /* XXX in case of OOM, query_string will be deallocated when
> > + * ctx is, which might be at shutdown */
> > + if (make_boolean_term (ctx,
> > + "tag", tag_ops[i].tag,
> > + &escaped, &escaped_len))
> > + return NULL;
> > +
> > query_string = talloc_asprintf_append_buffer (
> > - query_string, "%s%stag:%s", join,
> > + query_string, "%s%s%s", join,
> > tag_ops[i].remove ? "" : "not ",
> > - _escape_tag (escaped, tag_ops[i].tag));
> > + escaped);
> > join = " or ";
> > }
> >
> > diff --git a/util/string-util.c b/util/string-util.c
> > index 44f8cd3..e4bea21 100644
> > --- a/util/string-util.c
> > +++ b/util/string-util.c
> > @@ -20,6 +20,7 @@
> >
> >
> > #include "string-util.h"
> > +#include "talloc.h"
> >
> > char *
> > strtok_len (char *s, const char *delim, size_t *len)
> > @@ -32,3 +33,66 @@ strtok_len (char *s, const char *delim, size_t *len)
> >
> > return *len ? s : NULL;
> > }
> > +
> > +int
> > +make_boolean_term (void *ctx, const char *prefix, const char *term,
> > + char **buf, size_t *len)
> > +{
> > + const char *in;
> > + char *out;
> > + size_t needed = 3;
> > + int need_quoting = 0;
> > +
> > + /* Do we need quoting? To be paranoid, we quote anything
> > + * containing a quote, even though it only matters at the
> > + * beginning, and anything containing non-ASCII text. */
> > + for (in = term; *in && !need_quoting; in++)
> > + if (*in <= ' ' || *in == ')' || *in == '"' || (unsigned char)*in > 127)
>
> Should that be *in >= 127?
Nope. Character 127 is fine (and ASCII). Technically, the only
non-ASCII characters that require quoting are U+201C and U+201D (the
left and right curly double quotation marks), but rather than decoding
UTF-8 to find those characters, it's much easier to just quote if
there are any non-ASCII UTF-8 bytes. (Even more technically, we would
be in real trouble if a tag contained 8-bit bytes but wasn't valid
UTF-8; however, I think this would be the least of our worries.)
> Otherwise LGTM.
>
> Jani.
>
> > + need_quoting = 1;
> > +
> > + if (need_quoting)
> > + for (in = term; *in; in++)
> > + needed += (*in == '"') ? 2 : 1;
> > + else
> > + needed = strlen (term) + 1;
> > +
> > + /* Reserve space for the prefix */
> > + if (prefix)
> > + needed += strlen (prefix) + 1;
> > +
> > + if ((*buf == NULL) || (needed > *len)) {
> > + *len = 2 * needed;
> > + *buf = talloc_realloc (ctx, *buf, char, *len);
> > + }
> > +
> > + if (! *buf)
> > + return 1;
> > +
> > + out = *buf;
> > +
> > + /* Copy in the prefix */
> > + if (prefix) {
> > + strcpy (out, prefix);
> > + out += strlen (prefix);
> > + *out++ = ':';
> > + }
> > +
> > + if (! need_quoting) {
> > + strcpy (out, term);
> > + return 0;
> > + }
> > +
> > + /* Quote term by enclosing it in double quotes and doubling any
> > + * internal double quotes. */
> > + *out++ = '"';
> > + in = term;
> > + while (*in) {
> > + if (*in == '"')
> > + *out++ = '"';
> > + *out++ = *in++;
> > + }
> > + *out++ = '"';
> > + *out = '\0';
> > +
> > + return 0;
> > +}
> > diff --git a/util/string-util.h b/util/string-util.h
> > index ac7676c..b8844a3 100644
> > --- a/util/string-util.h
> > +++ b/util/string-util.h
> > @@ -19,4 +19,18 @@
> >
> > char *strtok_len (char *s, const char *delim, size_t *len);
> >
> > +/* Construct a boolean term query with the specified prefix (e.g.,
> > + * "id") and search term, quoting term as necessary. Specifically, if
> > + * term contains any non-printable ASCII characters, non-ASCII
> > + * characters, close parenthesis or double quotes, it will be enclosed
> > + * in double quotes and any internal double quotes will be doubled
> > + * (e.g. a"b -> "a""b"). The result will be a valid notmuch query and
> > + * can be parsed by parse_boolean_term.
> > + *
> > + * Output is into buf; it may be talloc_realloced.
> > + * Return: 0 on success, non-zero on memory allocation failure.
> > + */
> > +int make_boolean_term (void *talloc_ctx, const char *prefix, const char *term,
> > + char **buf, size_t *len);
> > +
> > #endif
--
Austin Clements MIT/'06/PhD/CSAIL
amdragon at mit.edu http://web.mit.edu/amdragon
Somewhere in the dream we call reality you will find me,
searching for the reality we call dreams.
More information about the notmuch
mailing list