[Patch v6 1/6] dump: support gzipped and atomic output

Austin Clements amdragon at MIT.EDU
Fri Apr 4 15:05:47 PDT 2014


Quoth David Bremner on Apr 03 at  4:41 pm:
> The main goal is to support gzipped output for future internal
> calls (e.g. from notmuch-new) to notmuch_database_dump.
> 
> The additional dependency is not very heavy since xapian already pulls
> in zlib.
> 
> We want the dump to be "atomic", in the sense that after running the
> dump file is either present and complete, or not present.  This avoids
> certain classes of mishaps involving overwriting a good backup with a
> bad or partial one.
> ---
>  INSTALL                   | 20 ++++++++--
>  Makefile.local            |  2 +-
>  configure                 | 28 ++++++++++++--
>  doc/man1/notmuch-dump.rst |  3 ++
>  notmuch-client.h          |  4 +-
>  notmuch-dump.c            | 95 +++++++++++++++++++++++++++++++++++++----------
>  test/T240-dump-restore.sh | 12 ++++++
>  7 files changed, 136 insertions(+), 28 deletions(-)
> 
> diff --git a/INSTALL b/INSTALL
> index 690b0ef..b543c50 100644
> --- a/INSTALL
> +++ b/INSTALL
> @@ -20,8 +20,8 @@ configure stage.
>  
>  Dependencies
>  ------------
> -Notmuch depends on three libraries: Xapian, GMime 2.4 or 2.6, and
> -Talloc which are each described below:
> +Notmuch depends on four libraries: Xapian, GMime 2.4 or 2.6,
> +Talloc, and zlib which are each described below:
>  
>  	Xapian
>  	------
> @@ -60,6 +60,18 @@ Talloc which are each described below:
>  
>  	Talloc is available from http://talloc.samba.org/
>  
> +	zlib
> +	----
> +
> +	zlib is an extremely popular compression library. It is used
> +	by Xapian, so if you installed that you will already have
> +	zlib. You may need to install the zlib headers separately.
> +
> +	Notmuch needs the transparent write feature of zlib introduced
> +	in version 1.2.5.2 (Dec. 2011).
> +
> +	zlib is available from http://zlib.net
> +
>  Building Documentation
>  ----------------------
>  
> @@ -79,11 +91,11 @@ dependencies with a simple simple command line. For example:
>  
>    For Debian and similar:
>  
> -        sudo apt-get install libxapian-dev libgmime-2.6-dev libtalloc-dev python-sphinx
> +        sudo apt-get install libxapian-dev libgmime-2.6-dev libtalloc-dev zlib1g-dev python-sphinx
>  
>    For Fedora and similar:
>  
> -	sudo yum install xapian-core-devel gmime-devel libtalloc-devel python-sphinx
> +	sudo yum install xapian-core-devel gmime-devel libtalloc-devel zlib-devel python-sphinx
>  
>  On other systems, a similar command can be used, but the details of
>  the package names may be different.
> diff --git a/Makefile.local b/Makefile.local
> index cb7b106..e5a20a7 100644
> --- a/Makefile.local
> +++ b/Makefile.local
> @@ -41,7 +41,7 @@ PV_FILE=bindings/python/notmuch/version.py
>  # Smash together user's values with our extra values
>  FINAL_CFLAGS = -DNOTMUCH_VERSION=$(VERSION) $(CPPFLAGS) $(CFLAGS) $(WARN_CFLAGS) $(extra_cflags) $(CONFIGURE_CFLAGS)
>  FINAL_CXXFLAGS = $(CPPFLAGS) $(CXXFLAGS) $(WARN_CXXFLAGS) $(extra_cflags) $(extra_cxxflags) $(CONFIGURE_CXXFLAGS)
> -FINAL_NOTMUCH_LDFLAGS = $(LDFLAGS) -Lutil -lutil -Llib -lnotmuch $(AS_NEEDED_LDFLAGS) $(GMIME_LDFLAGS) $(TALLOC_LDFLAGS)
> +FINAL_NOTMUCH_LDFLAGS = $(LDFLAGS) -Lutil -lutil -Llib -lnotmuch $(AS_NEEDED_LDFLAGS) $(GMIME_LDFLAGS) $(TALLOC_LDFLAGS) $(ZLIB_LDFLAGS)
>  FINAL_NOTMUCH_LINKER = CC
>  ifneq ($(LINKER_RESOLVES_LIBRARY_DEPENDENCIES),1)
>  FINAL_NOTMUCH_LDFLAGS += $(CONFIGURE_LDFLAGS)
> diff --git a/configure b/configure
> index 1d430b9..1d624f7 100755
> --- a/configure
> +++ b/configure
> @@ -340,6 +340,18 @@ else
>      errors=$((errors + 1))
>  fi
>  
> +printf "Checking for zlib (>= 1.2.5.2)... "
> +have_zlib=0
> +if pkg-config --atleast-version=1.2.5.2 zlib; then
> +    printf "Yes.\n"
> +    have_zlib=1
> +    zlib_cflags=$(pkg-config --cflags zlib)
> +    zlib_ldflags=$(pkg-config --libs zlib)
> +else
> +    printf "No.\n"
> +    errors=$((errors + 1))
> +fi
> +
>  printf "Checking for talloc development files... "
>  if pkg-config --exists talloc; then
>      printf "Yes.\n"
> @@ -496,6 +508,11 @@ EOF
>  	echo "	Xapian library (including development files such as headers)"
>  	echo "	http://xapian.org/"
>      fi
> +    if [ $have_zlib -eq 0 ]; then
> +	echo "	zlib library (including development files such as headers)"
> +	echo "	http://zlib.net/"
> +	echo
> +    fi
>      if [ $have_gmime -eq 0 ]; then
>  	echo "	Either GMime 2.4 library" $GMIME_24_VERSION_CTR "or GMime 2.6 library" $GMIME_26_VERSION_CTR
>  	echo "	(including development files such as headers)"
> @@ -519,11 +536,11 @@ case a simple command will install everything you need. For example:
>  
>  On Debian and similar systems:
>  
> -	sudo apt-get install libxapian-dev libgmime-2.6-dev libtalloc-dev
> +	sudo apt-get install libxapian-dev libgmime-2.6-dev libtalloc-dev zlib1g-dev
>  
>  Or on Fedora and similar systems:
>  
> -	sudo yum install xapian-core-devel gmime-devel libtalloc-devel
> +	sudo yum install xapian-core-devel gmime-devel libtalloc-devel zlib-devel
>  
>  On other systems, similar commands can be used, but the details of the
>  package names may be different.
> @@ -844,6 +861,10 @@ XAPIAN_LDFLAGS = ${xapian_ldflags}
>  GMIME_CFLAGS = ${gmime_cflags}
>  GMIME_LDFLAGS = ${gmime_ldflags}
>  
> +# Flags needed to compile and link against zlib
> +ZLIB_CFLAGS = ${zlib_cflags}
> +ZLIB_LDFLAGS = ${zlib_ldflags}
> +
>  # Flags needed to compile and link against talloc
>  TALLOC_CFLAGS = ${talloc_cflags}
>  TALLOC_LDFLAGS = ${talloc_ldflags}
> @@ -882,6 +903,7 @@ CONFIGURE_CFLAGS = -DHAVE_GETLINE=\$(HAVE_GETLINE) \$(GMIME_CFLAGS)      \\
>  		   -DUTIL_BYTE_ORDER=\$(UTIL_BYTE_ORDER)
>  
>  CONFIGURE_CXXFLAGS = -DHAVE_GETLINE=\$(HAVE_GETLINE) \$(GMIME_CFLAGS)    \\
> +		     \$(ZLIB_CFLAGS)					 \\
>  		     \$(TALLOC_CFLAGS) -DHAVE_VALGRIND=\$(HAVE_VALGRIND) \\
>  		     \$(VALGRIND_CFLAGS) \$(XAPIAN_CXXFLAGS)             \\
>  		     -DHAVE_STRCASESTR=\$(HAVE_STRCASESTR)               \\
> @@ -892,5 +914,5 @@ CONFIGURE_CXXFLAGS = -DHAVE_GETLINE=\$(HAVE_GETLINE) \$(GMIME_CFLAGS)    \\
>  		     -DHAVE_XAPIAN_COMPACT=\$(HAVE_XAPIAN_COMPACT)       \\
>  		     -DUTIL_BYTE_ORDER=\$(UTIL_BYTE_ORDER)
>  
> -CONFIGURE_LDFLAGS =  \$(GMIME_LDFLAGS) \$(TALLOC_LDFLAGS) \$(XAPIAN_LDFLAGS)
> +CONFIGURE_LDFLAGS =  \$(GMIME_LDFLAGS) \$(TALLOC_LDFLAGS) \$(ZLIB_LDFLAGS) \$(XAPIAN_LDFLAGS)
>  EOF
> diff --git a/doc/man1/notmuch-dump.rst b/doc/man1/notmuch-dump.rst
> index 17d1da5..d94cb4f 100644
> --- a/doc/man1/notmuch-dump.rst
> +++ b/doc/man1/notmuch-dump.rst
> @@ -19,6 +19,9 @@ recreated from the messages themselves. The output of notmuch dump is
>  therefore the only critical thing to backup (and much more friendly to
>  incremental backup than the native database files.)
>  
> +``--gzip``
> +    Compress the output in a format compatible with **gzip(1)**.
> +
>  ``--format=(sup|batch-tag)``
>      Notmuch restore supports two plain text dump formats, both with one
>      message-id per line, followed by a list of tags.
> diff --git a/notmuch-client.h b/notmuch-client.h
> index d110648..e1efbe0 100644
> --- a/notmuch-client.h
> +++ b/notmuch-client.h
> @@ -450,7 +450,9 @@ typedef enum dump_formats {
>  int
>  notmuch_database_dump (notmuch_database_t *notmuch,
>  		       const char *output_file_name,
> -		       const char *query_str, dump_format_t output_format);
> +		       const char *query_str,
> +		       dump_format_t output_format,
> +		       notmuch_bool_t gzip_output);
>  
>  #include "command-line-arguments.h"
>  #endif
> diff --git a/notmuch-dump.c b/notmuch-dump.c
> index 21702d7..2a7252a 100644
> --- a/notmuch-dump.c
> +++ b/notmuch-dump.c
> @@ -21,9 +21,11 @@
>  #include "notmuch-client.h"
>  #include "hex-escape.h"
>  #include "string-util.h"
> +#include <zlib.h>
> +
>  
>  static int
> -database_dump_file (notmuch_database_t *notmuch, FILE *output,
> +database_dump_file (notmuch_database_t *notmuch, gzFile output,
>  		    const char *query_str, int output_format)
>  {
>      notmuch_query_t *query;
> @@ -69,7 +71,7 @@ database_dump_file (notmuch_database_t *notmuch, FILE *output,
>  	}
>  
>  	if (output_format == DUMP_FORMAT_SUP) {
> -	    fprintf (output, "%s (", message_id);
> +	    gzprintf (output, "%s (", message_id);
>  	}
>  
>  	for (tags = notmuch_message_get_tags (message);
> @@ -78,12 +80,12 @@ database_dump_file (notmuch_database_t *notmuch, FILE *output,
>  	    const char *tag_str = notmuch_tags_get (tags);
>  
>  	    if (! first)
> -		fputs (" ", output);
> +		gzputs (output, " ");
>  
>  	    first = 0;
>  
>  	    if (output_format == DUMP_FORMAT_SUP) {
> -		fputs (tag_str, output);
> +		gzputs (output, tag_str);
>  	    } else {
>  		if (hex_encode (notmuch, tag_str,
>  				&buffer, &buffer_size) != HEX_SUCCESS) {
> @@ -91,12 +93,12 @@ database_dump_file (notmuch_database_t *notmuch, FILE *output,
>  			     tag_str);
>  		    return EXIT_FAILURE;
>  		}
> -		fprintf (output, "+%s", buffer);
> +		gzprintf (output, "+%s", buffer);
>  	    }
>  	}
>  
>  	if (output_format == DUMP_FORMAT_SUP) {
> -	    fputs (")\n", output);
> +	    gzputs (output, ")\n");
>  	} else {
>  	    if (make_boolean_term (notmuch, "id", message_id,
>  				   &buffer, &buffer_size)) {
> @@ -104,7 +106,7 @@ database_dump_file (notmuch_database_t *notmuch, FILE *output,
>  			     message_id, strerror (errno));
>  		    return EXIT_FAILURE;
>  	    }
> -	    fprintf (output, " -- %s\n", buffer);
> +	    gzprintf (output, " -- %s\n", buffer);
>  	}
>  
>  	notmuch_message_destroy (message);
> @@ -121,24 +123,77 @@ database_dump_file (notmuch_database_t *notmuch, FILE *output,
>  int
>  notmuch_database_dump (notmuch_database_t *notmuch,
>  		       const char *output_file_name,
> -		       const char *query_str, dump_format_t output_format)
> +		       const char *query_str,
> +		       dump_format_t output_format,
> +		       notmuch_bool_t gzip_output)
>  {
> -    FILE *output = stdout;
> -    int ret;
> +    gzFile output;

This should to gzclose_w output on all error paths.  The easiest way
to do that it probably to
  gzFile output = NULL;
here and ...

> +    const char *mode = gzip_output ? "w9" : "wT";
> +    const char *name_for_error = output_file_name ? output_file_name : "stdout";
> +
> +    char *tempname = NULL;
> +    int outfd = -1;
> +
> +    int ret = -1;
>  
>      if (output_file_name) {
> -	output = fopen (output_file_name, "w");
> -	if (output == NULL) {
> -	    fprintf (stderr, "Error opening %s for writing: %s\n",
> -		     output_file_name, strerror (errno));
> -	    return EXIT_FAILURE;
> -	}
> +	tempname = talloc_asprintf (notmuch, "%s.XXXXXX", output_file_name);
> +	outfd = mkstemp (tempname);
> +    } else {
> +	outfd = dup (STDOUT_FILENO);
> +    }
> +
> +    if (outfd < 0) {
> +	fprintf (stderr, "Bad output file %s\n", name_for_error);
> +	goto DONE;
> +    }
> +
> +    output = gzdopen (outfd, mode);
> +
> +    if (output == NULL) {
> +	fprintf (stderr, "Error opening %s for (gzip) writing: %s\n",
> +		 name_for_error, strerror (errno));
> +	if (close (outfd))
> +	    fprintf (stderr, "Error closing %s during shutdown: %s\n",
> +		 name_for_error, strerror (errno));
> +	goto DONE;
>      }
>  
>      ret = database_dump_file (notmuch, output, query_str, output_format);
> +    if (ret) goto DONE;
>  
> -    if (output != stdout)
> -	fclose (output);
> +    ret = gzflush (output, Z_FINISH);
> +    if (ret) {
> +	fprintf (stderr, "Error flushing output: %s\n", gzerror (output, NULL));
> +	goto DONE;
> +    }
> +
> +    if (output_file_name) {
> +	ret = fdatasync (outfd);
> +	if (ret) {
> +	    fprintf (stderr, "Error syncing %s to disk: %s\n",
> +		     name_for_error, strerror (errno));
> +	    goto DONE;
> +	}
> +    }
> +
> +    if (gzclose_w (output) != Z_OK) {
> +	ret = EXIT_FAILURE;
> +	goto DONE;
> +    }

... output = NULL; here, and ...

> +
> +    if (output_file_name) {
> +	ret = rename (tempname, output_file_name);
> +	if (ret) {
> +	    fprintf (stderr, "Error renaming %s to %s: %s\n",
> +		     tempname, output_file_name, strerror (errno));
> +	    goto DONE;
> +	}
> +
> +    }
> + DONE:

...
if (output)
  gzclose_w (output);
here.

> +    if (ret != EXIT_SUCCESS && output_file_name)
> +	(void) unlink (tempname);
>  
>      return ret;
>  }
> @@ -158,6 +213,7 @@ notmuch_dump_command (notmuch_config_t *config, int argc, char *argv[])
>      int opt_index;
>  
>      int output_format = DUMP_FORMAT_BATCH_TAG;
> +    notmuch_bool_t gzip_output = 0;
>  
>      notmuch_opt_desc_t options[] = {
>  	{ NOTMUCH_OPT_KEYWORD, &output_format, "format", 'f',
> @@ -165,6 +221,7 @@ notmuch_dump_command (notmuch_config_t *config, int argc, char *argv[])
>  				  { "batch-tag", DUMP_FORMAT_BATCH_TAG },
>  				  { 0, 0 } } },
>  	{ NOTMUCH_OPT_STRING, &output_file_name, "output", 'o', 0  },
> +	{ NOTMUCH_OPT_BOOLEAN, &gzip_output, "gzip", 'z', 0 },
>  	{ 0, 0, 0, 0, 0 }
>      };
>  
> @@ -181,7 +238,7 @@ notmuch_dump_command (notmuch_config_t *config, int argc, char *argv[])
>      }
>  
>      ret = notmuch_database_dump (notmuch, output_file_name, query_str,
> -				 output_format);
> +				 output_format, gzip_output);
>  
>      notmuch_database_destroy (notmuch);
>  
> diff --git a/test/T240-dump-restore.sh b/test/T240-dump-restore.sh
> index 0004438..d79aca8 100755
> --- a/test/T240-dump-restore.sh
> +++ b/test/T240-dump-restore.sh
> @@ -68,6 +68,18 @@ test_begin_subtest "dump --output=outfile --"
>  notmuch dump --output=dump-1-arg-dash.actual --
>  test_expect_equal_file dump.expected dump-1-arg-dash.actual
>  
> +# gzipped output
> +
> +test_begin_subtest "dump --gzip"
> +notmuch dump --gzip > dump-gzip.gz
> +gunzip dump-gzip.gz
> +test_expect_equal_file dump.expected dump-gzip
> +
> +test_begin_subtest "dump --gzip --output=outfile"
> +notmuch dump --gzip --output=dump-gzip-outfile.gz
> +gunzip dump-gzip-outfile.gz
> +test_expect_equal_file dump.expected dump-gzip-outfile
> +
>  # Note, we assume all messages from cworth have a message-id
>  # containing cworth.org
>  


More information about the notmuch mailing list