[Patch v2 01/17] hex-escape: (en|de)code strings to/from restricted character set

Fri Nov 30 13:43:38 PST 2012

On Sat, 24 Nov 2012, david at tethera.net wrote:
> From: David Bremner <bremner at debian.org>
>
> The character set is chosen to be suitable for pathnames, and the same
> as that used by contrib/nmbug
>
> [With additions by Jani Nikula]

So it must be good. ;)

Just a couple of nitpicks below.

BR,
Jani.

> ---
>  util/Makefile.local |    2 +-
>  util/hex-escape.c   |  168 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  util/hex-escape.h   |   41 +++++++++++++
>  3 files changed, 210 insertions(+), 1 deletion(-)
>  create mode 100644 util/hex-escape.c
>  create mode 100644 util/hex-escape.h
>
> diff --git a/util/Makefile.local b/util/Makefile.local
> index c7cae61..3ca623e 100644
> --- a/util/Makefile.local
> +++ b/util/Makefile.local
> @@ -3,7 +3,7 @@
>  dir := util
>  extra_cflags += -I$(srcdir)/$(dir)
>  
> -libutil_c_srcs := $(dir)/xutil.c $(dir)/error_util.c
> +libutil_c_srcs := $(dir)/xutil.c $(dir)/error_util.c $(dir)/hex-escape.c
>  
>  libutil_modules := $(libutil_c_srcs:.c=.o)
>  
> diff --git a/util/hex-escape.c b/util/hex-escape.c
> new file mode 100644
> index 0000000..d8905d0
> --- /dev/null
> +++ b/util/hex-escape.c
> @@ -0,0 +1,168 @@
> +/* hex-escape.c -  Manage encoding and decoding of byte strings into path names
> + *
> + * Copyright (c) 2011 David Bremner
> + *
> + * This program is free software: you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation, either version 3 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program.  If not, see http://www.gnu.org/licenses/ .
> + *
> + * Author: David Bremner <david at tethera.net>
> + */
> +
> +#include <assert.h>
> +#include <string.h>
> +#include <talloc.h>
> +#include <ctype.h>
> +#include "error_util.h"
> +#include "hex-escape.h"
> +
> +static const size_t default_buf_size = 1024;
> +
> +static const char *output_charset =
> +    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+-_@=.:,";
> +
> +static const char escape_char = '%';
> +
> +static int
> +is_output (char c)
> +{
> +    return (strchr (output_charset, c) != NULL);
> +}
> +
> +static int
> +maybe_realloc (void *ctx, size_t needed, char **out, size_t *out_size)
> +{
> +    if (*out_size < needed) {
> +
> +	if (*out == NULL)
> +	    *out = talloc_size (ctx, needed);
> +	else
> +	    *out = talloc_realloc (ctx, *out, char, needed);
> +
> +	if (*out == NULL)
> +	    return 0;
> +
> +	*out_size = needed;
> +    }
> +    return 1;
> +}
> +
> +hex_status_t
> +hex_encode (void *ctx, const char *in, char **out, size_t *out_size)
> +{
> +
> +    const unsigned char *p;

The casts to unsigned char * below bother me. Perhaps this should be
just const char *, with the only cast being in the sprintf?

> +    char *q;
> +
> +    size_t escape_count = 0;
> +    size_t len = 0;
> +    size_t needed;
> +
> +    assert (ctx); assert (in); assert (out); assert (out_size);
> +
> +    for (p = (unsigned char *) in; *p; p++) {
> +	escape_count += (!is_output (*p));
> +	len++;
> +    }
> +
> +    needed = len + escape_count * 2 + 1;

I wonder if it would be clearer if escape_count and len were ditched,
and the for loop just did:

	needed += is_output (*p) ? 1 : 3;

and another needed++ after the loop for NUL. And maybe s/needed/len/
after that.

> +
> +    if (*out == NULL)
> +	*out_size = 0;
> +
> +    if (!maybe_realloc (ctx, needed, out, out_size))
> +	return HEX_OUT_OF_MEMORY;
> +
> +    q = *out;
> +    p = (unsigned char *) in;
> +
> +    while (*p) {
> +	if (is_output (*p)) {
> +	    *q++ = *p++;
> +	} else {
> +	    sprintf (q, "%%%02x", *p++);
> +	    q += 3;
> +	}
> +    }
> +
> +    *q = '\0';
> +    return HEX_SUCCESS;
> +}
> +
> +/* Hex decode 'in' to 'out'.
> + *
> + * This must succeed for in == out to support hex_decode_inplace().
> + */
> +static hex_status_t
> +hex_decode_internal (const char *in, unsigned char *out)
> +{
> +    char buf[3];
> +
> +    while (*in) {
> +	if (*in == escape_char) {
> +	    char *endp;
> +
> +	    /* This also handles unexpected end-of-string. */
> +	    if (!isxdigit ((unsigned char) in[1]) ||
> +		!isxdigit ((unsigned char) in[2]))
> +		return HEX_SYNTAX_ERROR;
> +
> +	    buf[0] = in[1];
> +	    buf[1] = in[2];
> +	    buf[2] = '\0';
> +
> +	    *out = strtoul (buf, &endp, 16);
> +
> +	    if (endp != buf + 2)
> +		return HEX_SYNTAX_ERROR;
> +
> +	    in += 3;
> +	    out++;
> +	} else {
> +	    *out++ = *in++;
> +	}
> +    }
> +
> +    *out = '\0';
> +
> +    return HEX_SUCCESS;
> +}
> +
> +hex_status_t
> +hex_decode_inplace (char *s)
> +{
> +    /* A decoded string is never longer than the encoded one, so it is
> +     * safe to decode a string onto itself. */
> +    return hex_decode_internal (s, (unsigned char *) s);
> +}
> +
> +hex_status_t
> +hex_decode (void *ctx, const char *in, char **out, size_t * out_size)
> +{
> +    const char *p;
> +    size_t escape_count = 0;
> +    size_t needed = 0;
> +
> +    assert (ctx); assert (in); assert (out); assert (out_size);
> +
> +    size_t len = strlen (in);
> +
> +    for (p = in; *p; p++)
> +	escape_count += (*p == escape_char);
> +
> +    needed = len - escape_count * 2 + 1;

Same as above for counting the needed size. It would also save scanning
the input string twice (strlen and for loop).

> +
> +    if (!maybe_realloc (ctx, needed, out, out_size))
> +	return HEX_OUT_OF_MEMORY;
> +
> +    return hex_decode_internal (in, (unsigned char *) *out);
> +}
> diff --git a/util/hex-escape.h b/util/hex-escape.h
> new file mode 100644
> index 0000000..5182042
> --- /dev/null
> +++ b/util/hex-escape.h
> @@ -0,0 +1,41 @@
> +#ifndef _HEX_ESCAPE_H
> +#define _HEX_ESCAPE_H
> +
> +typedef enum hex_status {
> +    HEX_SUCCESS = 0,
> +    HEX_SYNTAX_ERROR,
> +    HEX_OUT_OF_MEMORY
> +} hex_status_t;
> +
> +/*
> + * The API for hex_encode() and hex_decode() is modelled on that for
> + * getline.
> + *
> + * If 'out' points to a NULL pointer a char array of the appropriate
> + * size is allocated using talloc, and out_size is updated.
> + *
> + * If 'out' points to a non-NULL pointer, it assumed to describe an
> + * existing char array, with the size given in *out_size.  This array
> + * may be resized by talloc_realloc if needed; in this case *out_size
> + * will also be updated.
> + *
> + * Note that it is an error to pass a NULL pointer for any parameter
> + * of these routines.
> + */
> +
> +hex_status_t
> +hex_encode (void *talloc_ctx, const char *in, char **out,
> +            size_t *out_size);
> +
> +hex_status_t
> +hex_decode (void *talloc_ctx, const char *in, char **out,
> +            size_t *out_size);
> +
> +/*
> + * Non-allocating hex decode to decode 's' in-place. The length of the
> + * result is always equal to or shorter than the length of the
> + * original.
> + */
> +hex_status_t
> +hex_decode_inplace (char *s);
> +#endif
> -- 
> 1.7.10.4
>
> _______________________________________________
> notmuch mailing list
> notmuch at notmuchmail.org
> http://notmuchmail.org/mailman/listinfo/notmuch