summary refs log tree commit diff
path: root/examples
diff options
context:
space:
mode:
author Sergey Poznyakoff <gray@gnu.org.ua> 2014-07-07 15:43:37 +0300
committer Sergey Poznyakoff <gray@gnu.org.ua> 2014-07-07 15:43:37 +0300
commit 24eb3d6df145f2e301a2484551a395edfd9aa8e0 (patch)
tree 998252ae00b90f59df849cf1133d822f09585455 /examples
parent ebfd2897f2230f527d1bfdaa507674268dc513e6 (diff)
download mailutils-24eb3d6df145f2e301a2484551a395edfd9aa8e0.tar.gz
mailutils-24eb3d6df145f2e301a2484551a395edfd9aa8e0.tar.bz2
New example: mboxidx
* examples/mboxidx.c: New file.
* examples/Makefile.am: Add mboxidx.
Diffstat (limited to 'examples')
-rw-r--r-- examples/mboxidx.c 532
1 files changed, 532 insertions, 0 deletions
diff --git a/examples/mboxidx.c b/examples/mboxidx.c
new file mode 100644
index 000000000..b06f68ecf
--- /dev/null
+++ b/examples/mboxidx.c
@@ -0,0 +1,532 @@
+/* Convert mailbox to a Sphinx XML input.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+
+ GNU Mailutils is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GNU Mailutils is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GNU Mailutils. If not, see <http://www.gnu.org/licenses/>. */
+
+/* This program takes a mailbox in arbitrary format as its only argument
+ and produces on standard output an XML stream suitable for use with
+ Sphinx for indexing the mailbox content. Example usage in sphinx.conf:
+
+ source mbox_source
+ {
+ type = xmlpipe2
+ xmlpipe_command = /usr/local/bin/mboxidx /var/spool/archive/mbox
+ }
+
+ index mbox_idx
+ {
+ source = mbox_source
+ docinfo = extern
+ charset_type = utf-8
+ ...
+ }
+*/
+
+#include <config.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <mailutils/mailutils.h>
+#include <fnmatch.h>
+
+/* Glob patterns for the content types whose bodies are indexed.  Lookups
+   go through matchstr, so entries are fnmatch patterns. */
+mu_list_t typelist;
+/* List of struct field, in output order: one XML element per entry is
+   produced for every indexed message. */
+mu_list_t fldlist;
+/* Non-zero: diagnostics go to stderr.  Cleared by -F, which selects
+   syslog instead. */
+int log_to_stderr = 1;
+/* Charset the output is converted to (-c). */
+char *output_charset = "utf-8";
+/* Non-zero (-N): emit the XML header and footer only if at least one
+   message is actually indexed. */
+int no_out_if_empty = 0;
+
+/* Usage summary printed by help.  Each element is one output line; help
+   supplies the line terminator, so no element may end in a newline. */
+char *docstr[] = {
+ "usage: mboxidx [OPTIONS] MBOX",
+ "converts MBOX into a xmlpipe2 stream for Sphinx indexing engine",
+ "OPTIONS are:",
+ "",
+ " -c CHARSET set output charset",
+ " -d SPEC set debug verbosity",
+ " -f FIELD index header FIELD",
+ " -F FACILITY set syslog facility",
+ " -L TAG set syslog tag",
+ " -N no output if nothing to index",
+ " -S N skip first N messages",
+ " -s FILE save and read status of the prior run from FILE",
+ " -t GLOB index message body if its content type matches GLOB",
+ " -h display this help summary",
+ NULL
+};
+
+/* Print the usage summary and terminate the program with exit status C.
+   The text is written to stdout when C is zero and to stderr otherwise. */
+void
+help (int c)
+{
+  FILE *out = stdout;
+  char **line;
+
+  if (c)
+    out = stderr;
+
+  for (line = docstr; *line; line++)
+    fprintf (out, "%s\n", *line);
+
+  exit (c);
+}
+
+
+/* Comparator installed on typelist.  ITEM is a glob pattern stored in the
+   list, DATA is the string being looked up.  Returns the fnmatch result:
+   0 when DATA matches the pattern. */
+static int
+matchstr (void const *item, void const *data)
+{
+  return fnmatch ((char const *) item, (char const *) data, 0);
+}
+
+/* One element of the output document. */
+struct field
+{
+  char *name;   /* XML element name; for header fields also the name of
+                   the header to read. */
+  int ishdr;    /* Non-zero: the value is a message header.  Zero: the
+                   value is the message body. */
+};
+
+/* Append a new field called NAME to fldlist.  NAME is copied, so the
+   caller keeps ownership of its argument.  ISHDR is stored as is.
+   A failure to extend the list is fatal. */
+static void
+fldadd (char *name, int ishdr)
+{
+  struct field *f = mu_alloc (sizeof (*f));
+
+  f->name = mu_strdup (name);
+  f->ishdr = ishdr;
+  /* Silently losing a field would corrupt the schema, so check the result
+     the same way the list creation in main is checked. */
+  MU_ASSERT (mu_list_append (fldlist, f));
+}
+
+/* Fetch the unfolded value of header NAME from HDR into *VALUE and strip
+   trailing whitespace from it.  Returns 0 on success; otherwise returns
+   the status of mu_header_aget_value_unfold and leaves *VALUE untrimmed. */
+static int
+get_hdr_value (mu_header_t hdr, const char *name, char **value)
+{
+  int rc = mu_header_aget_value_unfold (hdr, name, value);
+
+  if (rc != 0)
+    return rc;
+
+  mu_rtrim_class (*value, MU_CTYPE_SPACE);
+  return 0;
+}
+
+
+/* Store in *VALUE a string naming the content transfer encoding declared
+   in HDR, or a copy of "7bit" if the header cannot be read.  Always
+   returns 0. */
+static int
+get_content_encoding (mu_header_t hdr, char **value)
+{
+  char *enc = NULL;
+
+  if (get_hdr_value (hdr, MU_HEADER_CONTENT_TRANSFER_ENCODING, &enc) != 0)
+    enc = mu_strdup ("7bit"); /* Default. */
+
+  *value = enc;
+  return 0;
+}
+
+/* Wrap the stream *STR in an "iconv" filter that converts from
+   INPUT_CHARSET to the global output_charset.
+
+   On success *STR is the filter stream, the local reference to the
+   stream it wraps is dropped, and 0 is returned.  On failure the status
+   of mu_filter_create_args is returned and the wrapped stream is not
+   unreferenced. */
+static int
+set_charset_filter (mu_stream_t * str, char *input_charset)
+{
+  int rc;
+  /* Four arguments plus the terminating NULL.  The array used to be one
+     element short, so storing the terminator wrote past its end. */
+  char const *args[5];
+  mu_stream_t b_stream = *str;
+
+  args[0] = "iconv";
+  args[1] = input_charset;
+  args[2] = output_charset;
+  args[3] = "copy-octal";
+  args[4] = NULL;
+
+  rc = mu_filter_create_args (str, b_stream,
+                              "iconv",
+                              4, args, MU_FILTER_DECODE, MU_STREAM_READ);
+
+  if (rc == 0)
+    mu_stream_unref (b_stream);
+  return rc;
+}
+
+
+/* Write the indexable text of MSG to mu_strout.
+
+   The content type is taken from the Content-Type header ("text/plain"
+   if the header cannot be read); a "charset=" parameter, if present,
+   names the input charset.  Then:
+
+     - a multipart message is handled by recursing into each of its parts;
+     - a message/rfc822 part is unencapsulated and handled recursively;
+     - a body whose type matches an entry of typelist is decoded from its
+       transfer encoding, converted to output_charset, XML-encoded and
+       copied to the output;
+     - anything else produces no output.
+
+   If DELIM is non-zero, a newline is printed before the body text. */
+static void
+formatbody (mu_message_t msg, int delim)
+{
+  int rc;
+  char *type;
+  char *input_charset = NULL;
+  int ismime = 0;
+  mu_message_t msgpart;
+  mu_header_t hdr = NULL;
+
+  mu_message_get_header (msg, &hdr);
+  if (get_hdr_value (hdr, MU_HEADER_CONTENT_TYPE, &type))
+    type = mu_strdup ("text/plain");
+  else
+    {
+      struct mu_wordsplit ws;
+
+      /* From here on TYPE is all lowercase, which the comparisons below
+         rely upon. */
+      mu_strlower (type);
+
+      ws.ws_delim = ";";
+      if (mu_wordsplit (type, &ws,
+                        MU_WRDSF_DELIM | MU_WRDSF_WS
+                        | MU_WRDSF_NOVAR | MU_WRDSF_NOCMD))
+        {
+          mu_error ("can't split content type \"%s\": %s",
+                    type, mu_wordsplit_strerror (&ws));
+        }
+      else
+        {
+          size_t i;
+
+          /* Take over the first word as the type proper.  The full header
+             value is no longer needed and must be released here.  If the
+             value contained no words at all, keep it (an empty string)
+             rather than ending up with a null TYPE. */
+          if (ws.ws_wordc > 0)
+            {
+              free (type);
+              type = ws.ws_wordv[0];
+              ws.ws_wordv[0] = NULL;
+            }
+
+          for (i = 1; i < ws.ws_wordc; i++)
+            {
+              if (strncasecmp (ws.ws_wordv[i], "charset=", 8) == 0)
+                {
+                  char *cs = ws.ws_wordv[i] + 8;
+                  size_t len = strlen (cs);
+
+                  /* The value may be a quoted string (charset="utf-8").
+                     Drop the enclosing quotes, if they are still there,
+                     so that the bare charset name reaches iconv. */
+                  if (len >= 2 && cs[0] == '"' && cs[len - 1] == '"')
+                    {
+                      cs[len - 1] = 0;
+                      cs++;
+                    }
+                  input_charset = mu_strdup (cs);
+                  break;
+                }
+            }
+          mu_wordsplit_free (&ws);
+        }
+    }
+
+  mu_message_is_multipart (msg, &ismime);
+  if (ismime)
+    {
+      size_t i, nparts;
+
+      rc = mu_message_get_num_parts (msg, &nparts);
+      if (rc)
+        mu_diag_funcall (MU_DIAG_ERROR, "mu_message_get_num_parts", NULL, rc);
+      else
+        for (i = 1; i <= nparts; i++)
+          {
+            if (mu_message_get_part (msg, i, &msgpart) == 0)
+              formatbody (msgpart, 1);
+          }
+    }
+  /* Exact comparison: limiting it to strlen (type) characters would make
+     every prefix of "message/rfc822", the empty string included, match. */
+  else if (strcmp (type, "message/rfc822") == 0)
+    {
+      rc = mu_message_unencapsulate (msg, &msgpart, NULL);
+      if (rc)
+        mu_diag_funcall (MU_DIAG_ERROR, "mu_message_unencapsulate", NULL, rc);
+      else
+        formatbody (msgpart, 1);
+    }
+  else if (mu_list_locate (typelist, type, NULL) == 0)
+    {
+      char *encoding;
+      mu_body_t body = NULL;
+      mu_stream_t b_stream = NULL;
+      mu_stream_t d_stream = NULL;
+      mu_stream_t stream = NULL;
+
+      get_content_encoding (hdr, &encoding);
+
+      mu_message_get_body (msg, &body);
+      mu_body_get_streamref (body, &b_stream);
+      /* Undo the transfer encoding.  If no decoder can be created, read
+         the body as is. */
+      if (mu_filter_create (&d_stream, b_stream, encoding,
+                            MU_FILTER_DECODE, MU_STREAM_READ) == 0)
+        {
+          mu_stream_unref (b_stream);
+          stream = d_stream;
+        }
+      else
+        stream = b_stream;
+
+      /* Convert to the output charset.  Without a declared charset, or if
+         the declared one cannot be used, treat the input as US-ASCII. */
+      if (!input_charset || set_charset_filter (&stream, input_charset))
+        set_charset_filter (&stream, "US-ASCII");
+
+      /* Finally make the text safe for inclusion in XML. */
+      b_stream = stream;
+      if (mu_filter_create (&stream, b_stream, "xml",
+                            MU_FILTER_ENCODE, MU_STREAM_READ) == 0)
+        mu_stream_unref (b_stream);
+
+      if (delim)
+        mu_printf ("\n");
+      mu_stream_copy (mu_strout, stream, 0, NULL);
+
+      mu_stream_unref (stream);
+      free (encoding);
+    }
+  free (input_charset);
+  free (type);
+}
+
+/* Write the NUL-terminated string TEXT to mu_strout, replacing the XML
+   markup characters '&', '<' and '>' with the corresponding entities. */
+static void
+xml_write_escaped (char const *text)
+{
+  while (*text)
+    {
+      /* Copy the longest run that needs no escaping in one go. */
+      size_t n = strcspn (text, "&<>");
+
+      if (n)
+        mu_stream_write (mu_strout, text, n, NULL);
+      text += n;
+
+      switch (*text)
+        {
+        case '&':
+          mu_stream_write (mu_strout, "&amp;", 5, NULL);
+          text++;
+          break;
+        case '<':
+          mu_stream_write (mu_strout, "&lt;", 4, NULL);
+          text++;
+          break;
+        case '>':
+          mu_stream_write (mu_strout, "&gt;", 4, NULL);
+          text++;
+          break;
+        default:
+          /* End of string. */
+          break;
+        }
+    }
+}
+
+/* mu_list_foreach callback that outputs one field of a document.
+   ITEM is a struct field, DATA is the message being indexed.
+
+   The field is written as an element named after the field.  For a
+   header field the content is the unfolded header value, RFC 2047-decoded
+   into output_charset when possible and written undecoded otherwise; a
+   missing header yields an empty element.  For a non-header field the
+   content is produced by formatbody.  Always returns 0. */
+static int
+fldfmt (void *item, void *data)
+{
+  struct field *f = item;
+  mu_message_t msg = data;
+  int rc;
+
+  mu_printf (" <%s>", f->name);
+  if (f->ishdr)
+    {
+      mu_header_t hdr;
+      char *hfield;
+
+      mu_message_get_header (msg, &hdr);
+      rc = mu_header_aget_value_unfold (hdr, f->name, &hfield);
+      if (rc == 0)
+        {
+          char *tmp;
+
+          /* Header text is arbitrary user data, so it is escaped before
+             being placed inside the element; a stray '<' or '&' would
+             otherwise make the whole stream ill-formed. */
+          rc = mu_rfc2047_decode (output_charset, hfield, &tmp);
+          if (rc)
+            xml_write_escaped (hfield);
+          else
+            {
+              xml_write_escaped (tmp);
+              free (tmp);
+            }
+          /* The value was allocated for us and is ours to release. */
+          free (hfield);
+        }
+    }
+  else
+    formatbody (msg, 0);
+
+  mu_printf ("</%s>\n", f->name);
+  return 0;
+}
+
+/* mu_list_foreach callback that declares one field in the schema.
+   ITEM is a struct field; DATA is not used.  Always returns 0. */
+static int
+fldout (void *item, void *data)
+{
+  struct field const *fld = item;
+
+  mu_printf (" <sphinx:field name=\"%s\"/>\n", fld->name);
+  return 0;
+}
+
+/* Print the opening part of the output: the XML declaration, the start
+   of the document set and a schema listing every entry of fldlist. */
+void
+xmlpipe2_header ()
+{
+  mu_printf ("<?xml version=\"1.0\"?>\n"
+             "<sphinx:docset>\n"
+             " <sphinx:schema>\n");
+  mu_list_foreach (fldlist, fldout, NULL);
+  mu_printf (" </sphinx:schema>\n");
+}
+
+/* Print the closing tag of the document set. */
+void
+xmlpipe2_footer ()
+{
+  mu_printf ("</sphinx:docset>\n");
+}
+
+/* Set from -S or from the state file.  main adds one to it before the
+   scan; from then on it is the number of the first message to index. */
+size_t skip_count;
+/* Number of MU_EVT_MESSAGE_ADD events seen so far, which is also the
+   number of the message currently being processed. */
+size_t nmesg;
+
+/* Read the message count saved by a previous run from the file NAME and
+   store it in skip_count.
+
+   The file must contain a string of decimal digits, optionally followed
+   by a newline.  A missing file is not an error: skip_count is then left
+   untouched.  Any other problem (the file cannot be opened, the number
+   does not fit in size_t, or the digits are followed by something other
+   than a newline) is reported and terminates the program with status 1. */
+void
+read_state_file (const char *name)
+{
+  FILE *fp;
+  size_t count = 0;
+  int c;
+
+  /* Open the file right away instead of probing it with access first:
+     the probe adds nothing and the file could change in between. */
+  fp = fopen (name, "r");
+  if (!fp)
+    {
+      if (errno == ENOENT)
+        return;
+      mu_error ("can't open state file \"%s\" for reading: %s",
+                name, mu_strerror (errno));
+      exit (1);
+    }
+
+  while ((c = fgetc (fp)) != EOF && c >= '0' && c <= '9')
+    {
+      size_t digit = c - '0';
+
+      /* Test before multiplying.  Comparing the wrapped result with the
+         old value is not reliable, since a product that wraps around can
+         still be larger than its operand. */
+      if (count > ((size_t) -1 - digit) / 10)
+        {
+          mu_error ("%s: message number too big", name);
+          exit (1);
+        }
+      count = count * 10 + digit;
+    }
+  fclose (fp);
+
+  if (c != EOF && c != '\n')
+    {
+      mu_error ("%s: malformed state file", name);
+      exit (1);
+    }
+
+  skip_count = count;
+}
+
+/* Save the current value of nmesg in the file NAME, as a decimal number
+   followed by a newline, replacing any previous content.  If the file
+   cannot be opened or written, report the error and terminate the
+   program with status 1. */
+void
+write_state_file (const char *name)
+{
+  FILE *fp;
+  int ec = 0;
+
+  fp = fopen (name, "w");
+  if (!fp)
+    {
+      mu_error ("can't open state file \"%s\" for writing: %s",
+                name, mu_strerror (errno));
+      exit (1);
+    }
+
+  if (fprintf (fp, "%lu\n", (unsigned long) nmesg) < 0)
+    ec = errno;
+
+  /* Buffered data is written out on close, so a failure here means the
+     state was not saved either.  Keep the first error code seen. */
+  if (fclose (fp) != 0 && ec == 0)
+    ec = errno;
+
+  if (ec)
+    {
+      mu_error ("can't write state file \"%s\": %s",
+                name, mu_strerror (ec));
+      exit (1);
+    }
+}
+
+/* Observer callback attached to the mailbox.
+
+   For MU_EVT_MESSAGE_ADD the message counter is advanced; once it has
+   reached skip_count, the corresponding message is fetched from the
+   mailbox that owns the observer O and written out as a document
+   holding every field of fldlist.  With no_out_if_empty set, the XML
+   header is printed just before the first such document.  All other
+   event types are ignored.  DATA and ACTION_DATA are not used.
+   Always returns 0. */
+static int
+action (mu_observer_t o, size_t type, void *data, void *action_data)
+{
+  mu_mailbox_t mbox = mu_observer_get_owner (o);
+  mu_message_t msg = NULL;
+
+  if (type != MU_EVT_MESSAGE_ADD)
+    return 0;
+
+  nmesg++;
+  if (nmesg < skip_count)
+    return 0;
+
+  /* The header has been held back until there is something to index. */
+  if (no_out_if_empty && nmesg == skip_count)
+    xmlpipe2_header ();
+
+  MU_ASSERT (mu_mailbox_get_message (mbox, nmesg, &msg));
+  mu_printf (" <sphinx:document id=\"%lu\">\n", (unsigned long) nmesg);
+  mu_list_foreach (fldlist, fldfmt, msg);
+  mu_printf (" </sphinx:document>\n");
+  return 0;
+}
+
+/* Entry point.  Parses the options described in docstr, opens the
+   mailbox named by the single non-option argument for reading, and scans
+   it with an observer attached, so that action is invoked for each
+   MU_EVT_MESSAGE_ADD event and prints the documents.  The XML header and
+   footer are printed around the scan (with -N, only if some message was
+   indexed).  If -s was given, the saved message count is read before the
+   scan and the new one is written after it.  Returns EXIT_SUCCESS; fatal
+   errors terminate the program from within. */
+int
+main (int argc, char **argv)
+{
+  int rc;
+  mu_mailbox_t mbox;
+  mu_observer_t observer;
+  mu_observable_t observable;
+  size_t total;
+  char *p;
+  char *state_file = NULL;
+
+  mu_set_program_name (argv[0]);
+  mu_register_all_mbox_formats ();
+  mu_stdstream_setup (MU_STDSTREAM_RESET_NONE);
+
+  MU_ASSERT (mu_list_create (&fldlist));
+
+  MU_ASSERT (mu_list_create (&typelist));
+  mu_list_set_comparator (typelist, matchstr);
+  /* Text parts are always indexed; -t adds further patterns. */
+  mu_list_append (typelist, mu_strdup ("text/*"));
+
+  while ((rc = getopt (argc, argv, "c:d:f:F:hL:NS:s:t:")) != EOF)
+    {
+      switch (rc)
+        {
+        case 'c':
+          output_charset = optarg;
+          break;
+        case 'd':
+          mu_debug_parse_spec (optarg);
+          break;
+        case 'F':
+          if (mu_string_to_syslog_facility (optarg, &mu_log_facility))
+            {
+              mu_error ("unknown facility: %s", optarg);
+              exit (1);
+            }
+          log_to_stderr = 0;
+          break;
+        case 'f':
+          fldadd (optarg, 1);
+          break;
+        case 'L':
+          mu_log_tag = optarg;
+          break;
+        case 'N':
+          no_out_if_empty = 1;
+          break;
+        case 'S':
+          /* Accept a plain, in-range decimal number only.  On its own,
+             strtoul would also take an empty string, leading blanks or
+             a minus sign, and would silently clamp huge values. */
+          errno = 0;
+          skip_count = strtoul (optarg, &p, 10);
+          if (optarg[0] < '0' || optarg[0] > '9' || *p || errno == ERANGE)
+            {
+              mu_error ("invalid number: %s", optarg);
+              exit (1);
+            }
+          break;
+        case 's':
+          state_file = optarg;
+          break;
+        case 't':
+          /* Patterns are stored in lowercase. */
+          p = mu_strdup (optarg);
+          mu_strlower (p);
+          mu_list_append (typelist, p);
+          break;
+        case 'h':
+          help (0);
+          /* help does not return; the break only makes that explicit. */
+          break;
+        default:
+          exit (1);
+        }
+    }
+  /* The message body is always the last field of a document. */
+  fldadd ("content", 0);
+
+  /* A saved state takes precedence over -S. */
+  if (state_file)
+    read_state_file (state_file);
+
+  /* Turn the number of messages to skip into the number of the first
+     message to process. */
+  skip_count++;
+
+  argc -= optind;
+  argv += optind;
+
+  if (argc != 1)
+    help (1);
+
+  if (log_to_stderr)
+    mu_stdstream_strerr_setup (MU_STRERR_STDERR);
+  else
+    {
+      if (!mu_log_tag)
+        mu_log_tag = (char *) mu_program_name;
+      mu_stdstream_strerr_setup (MU_STRERR_SYSLOG);
+    }
+
+  rc = mu_mailbox_create_default (&mbox, argv[0]);
+  if (rc)
+    {
+      mu_error ("mu_mailbox_create: %s", mu_strerror (rc));
+      exit (EXIT_FAILURE);
+    }
+
+  rc = mu_mailbox_open (mbox, MU_STREAM_READ);
+  if (rc)
+    {
+      mu_error ("mu_mailbox_open: %s", mu_strerror (rc));
+      exit (EXIT_FAILURE);
+    }
+
+  /* Without a working observer nothing would be printed at all, so a
+     failure in any of the steps that produce or attach it is fatal. */
+  MU_ASSERT (mu_observer_create (&observer, mbox));
+  mu_observer_set_action (observer, action, mbox);
+  mu_observer_set_action_data (observer, NULL, mbox);
+  MU_ASSERT (mu_mailbox_get_observable (mbox, &observable));
+  MU_ASSERT (mu_observable_attach (observable, MU_EVT_MESSAGE_ADD, observer));
+
+  if (!no_out_if_empty)
+    xmlpipe2_header ();
+  MU_ASSERT (mu_mailbox_scan (mbox, 1, &total));
+  /* With -N the header was printed by action, and only if some message
+     number reached skip_count; print the footer under the same
+     condition. */
+  if (!no_out_if_empty || nmesg >= skip_count)
+    xmlpipe2_footer ();
+  mu_mailbox_close (mbox);
+  mu_mailbox_destroy (&mbox);
+
+  if (state_file)
+    write_state_file (state_file);
+
+  return EXIT_SUCCESS;
+}

Return to:

Send suggestions and report system problems to the System administrator.