aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergey Poznyakoff <gray@gnu.org.ua>2007-10-03 15:43:02 +0000
committerSergey Poznyakoff <gray@gnu.org.ua>2007-10-03 15:43:02 +0000
commit8cca58c9aad75d3c5a2faa3bd39a9479798254be (patch)
tree64745dd2780d15d28432596895ef7573b53c02eb
parent36b9df2b2727530faaa6865c32c08d9396c3ac67 (diff)
downloadswis-8cca58c9aad75d3c5a2faa3bd39a9479798254be.tar.gz
swis-8cca58c9aad75d3c5a2faa3bd39a9479798254be.tar.bz2
* src/word-split.c: New file.
* src/Makefile.am: Add word-split
* src/swis.h: Include errno.h
* src/html-strip.l: Minor fixes.

git-svn-id: file:///svnroot/swis/trunk@9 05ba3e8d-823b-0410-8fb2-de0ee4edb5ba
-rw-r--r--ChangeLog5
-rw-r--r--src/Makefile.am3
-rw-r--r--src/html-strip.l9
-rw-r--r--src/swis.h2
-rw-r--r--src/word-split.c192
5 files changed, 206 insertions, 5 deletions
diff --git a/ChangeLog b/ChangeLog
index e4cce24..7832f3e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,10 @@
2007-10-03 Sergey Poznyakoff <gray@gnu.org.ua>
+ * src/word-split.c: New file.
+ * src/Makefile.am: Add word-split
+ * src/swis.h: Include errno.h
+ * src/html-strip.l: Minor fixes.
+
* README-hacking: New file
* README: Initial edit.
diff --git a/src/Makefile.am b/src/Makefile.am
index e141b6b..1086aa7 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -14,8 +14,9 @@
# You should have received a copy of the GNU General Public License
# along with SWIS. If not, see <http://www.gnu.org/licenses/>.
-bin_PROGRAMS=html-strip
+bin_PROGRAMS=html-strip word-split
html_strip_SOURCES=html-strip.l
+word_split_SOURCES=word-split.c
INCLUDES=-I${top_srcdir}/gnu -I../gnu
LIBS=../gnu/libgnu.a
diff --git a/src/html-strip.l b/src/html-strip.l
index bc7b9f8..b42f618 100644
--- a/src/html-strip.l
+++ b/src/html-strip.l
@@ -60,7 +60,8 @@ parse_content_type()
memmove (buf, start, p - start + 1);
if (strcasecmp (buf, "utf-8"))
{
- error(0,0,"enabling conversion %s->%s\n", buf,"utf-8");
+ if (yy_flex_debug)
+ fprintf (stderr, "enabling conversion %s->%s\n", buf,"utf-8");
cd = iconv_open ("UTF-8", buf);
if (cd == INVALID_ICONV_CD)
error (0, errno, "cannot convert from %s", buf);
@@ -98,7 +99,7 @@ convert_output ()
error (1, errno, "write error");
errno = saved_errno;
}
- if (rc == INVALID_ICONV_CD)
+ if (rc == (size_t) -1)
{
if (errno == EILSEQ)
error (1, 0, "cannot convert \"%.*s\"", idx, inbuf);
@@ -245,7 +246,7 @@ usage ()
printf ("Usage: html-strip [OPTIONS] [FILES...]\n");
printf ("Strip off HTML tags from input files and convert them to UTF-8\n");
printf ("\nOptions are:\n");
- printf (" -d, --debug output debugging info 1\n");
+ printf (" -d, --debug output debugging info\n");
printf (" -o, --output=FILE direct output to FILE instead of stdout\n");
printf ("\n");
printf (" -h, --help print this help list\n");
@@ -303,7 +304,7 @@ main (int argc, char **argv)
open_input ();
}
- while (yylex())
+ while (yylex ())
;
exit (0);
}
diff --git a/src/swis.h b/src/swis.h
index d4ad34b..9adb00c 100644
--- a/src/swis.h
+++ b/src/swis.h
@@ -21,6 +21,8 @@
#include <getopt.h>
#include <string.h>
#include <ctype.h>
+#include <errno.h>
+#include <locale.h>
#include "error.h"
#include "mbchar.h"
#include "mbswidth.h"
diff --git a/src/word-split.c b/src/word-split.c
new file mode 100644
index 0000000..59a05b7
--- /dev/null
+++ b/src/word-split.c
@@ -0,0 +1,192 @@
+/* This file is part of SWIS
+ Copyright (C) 2007 Sergey Poznyakoff
+
+ SWIS is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ SWIS is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with SWIS. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "swis.h"
+
+enum { /* option codes for long options that have no single-letter form */
+ PROGNAME_OPTION, /* first enumerator, so its value is 0; handled by `case PROGNAME_OPTION' in main */
+};
+
+struct option options[] = { /* long option table passed to getopt_long in main */
+ { "progname", required_argument, NULL, PROGNAME_OPTION }, /* overrides program_name */
+ { "debug", no_argument, NULL, 'd' }, /* accepted, but main only prints a warning */
+ { "help", no_argument, NULL, 'h' }, /* prints usage () and exits */
+ { "version", no_argument, NULL, 'v' }, /* prints version info and exits */
+ { "output", required_argument, NULL, 'o' }, /* names the output file */
+ { NULL } /* all-zero entry terminating the table */
+};
+
+DECL_COPYRIGHT; /* NOTE(review): macro is not defined in this file; presumably it comes from swis.h and declares the copyright data used by version_etc -- confirm */
+
+/* Print the usage summary for word-split on standard output.  */
+void
+usage ()
+{
+  /* Help text, one output line per entry.  The --debug option is left
+     out of the list: main accepts it but only warns that it is not yet
+     supported.  */
+  static const char *const help_text[] = {
+    "Usage: word-split [OPTIONS] [FILES...]\n",
+    "Split UTF-8 input into words\n",
+    "\nOptions are:\n",
+    " -o, --output=FILE direct output to FILE instead of stdout\n",
+    "\n",
+    " -h, --help print this help list\n",
+    " -v, --version print program version and exit\n",
+    "\n",
+    NULL
+  };
+  const char *const *line;
+
+  for (line = help_text; *line; line++)
+    fputs (*line, stdout);
+
+  printf ("Report bugs to <%s>\n", PACKAGE_BUGREPORT);
+}
+
+char **input_file; /* remaining input file names (set from argv in main); advanced by open_input */
+FILE *input; /* stream currently being read by word_split; main falls back to stdin */
+FILE *output; /* stream words are written to; main falls back to stdout */
+
+/* Open the next input file named in the input_file list.
+
+   If input_file is set and points at a name, that name is consumed and
+   `input' is made to refer to it; the name "-" selects standard input.
+   A stream left open by a previous call is closed first (unless it is
+   stdin), so a long list of files does not accumulate open streams.
+   Failure to open a file is reported through error () with status 1.
+
+   Return 0 if a new input was selected, 1 if no names are left.  */
+int
+open_input ()
+{
+  char *name;
+
+  if (!input_file || !*input_file)
+    return 1;
+
+  /* Release the previous file before replacing the stream pointer;
+     stdin is left alone because "-" may select it again later.  */
+  if (input && input != stdin)
+    fclose (input);
+
+  name = *input_file++;
+  if (name[0] == '-' && name[1] == 0)
+    input = stdin;
+  else
+    {
+      input = fopen (name, "r");
+      if (!input)
+        error (1, errno, "cannot open input file %s", name);
+    }
+  return 0;
+}
+
+/* Nonzero while nothing has been written since the last newline, i.e.
+   the current output line has nothing to terminate.  It starts at 1 so
+   that leading delimiters do not produce an empty line.  */
+static int after_delim_output = 1;
+
+/* Copy words from the current `input' stream to `output', one per line.
+
+   ASCII alphanumerics and multibyte UTF-8 sequences are treated as word
+   constituents and copied verbatim.  Any other ASCII byte is a
+   delimiter; a run of delimiters yields a single newline.  A word still
+   pending at end of file is terminated too, so the last word of a file
+   gets its newline and is not glued to the first word of the next file.
+
+   Return nonzero if open_input () selected another input file (call
+   again to process it), zero when no input is left.  */
+int
+word_split ()
+{
+  /* fgetc returns an int; keeping the value in an int makes the EOF
+     comparison exact instead of relying on EOF converting to unsigned.  */
+  int wc;
+
+  while ((wc = fgetc (input)) != EOF)
+    {
+      if (wc < 0x80)
+        {
+          if (!isalnum (wc))
+            {
+              /* Delimiter: close the pending word, if there is one.  */
+              if (!after_delim_output)
+                {
+                  fputc ('\n', output);
+                  after_delim_output = 1;
+                }
+              continue;
+            }
+          fputc (wc, output);
+        }
+      else
+        {
+          int count;
+
+          /* The lead byte selects the length of the sequence.  */
+          if (0xc2 <= wc && wc <= 0xdf)
+            count = 2;
+          else if (0xe0 <= wc && wc <= 0xef)
+            count = 3;
+          else if (0xf0 <= wc && wc <= 0xf4)
+            count = 4;
+          else
+            {
+              /* FIXME: don't know what to do */
+              error (0, 0, "unknown UTF-8 char: %x", (unsigned) wc);
+              fputc (wc, output);
+              /* The byte went to the output, so the current line now
+                 needs a terminator like any other word byte.  */
+              after_delim_output = 0;
+              continue;
+            }
+
+          fputc (wc, output);
+          /* NOTE(review): the trailing bytes are copied without checking
+             that they are continuation bytes (10xxxxxx).  */
+          while (--count)
+            {
+              if ((wc = fgetc (input)) == EOF)
+                {
+                  error (1, 0, "unexpected end of file");
+                  break;
+                }
+              fputc (wc, output);
+            }
+        }
+      after_delim_output = 0;
+    }
+
+  /* End of this input: terminate a word that ran up to EOF.  */
+  if (!after_delim_output)
+    {
+      fputc ('\n', output);
+      after_delim_output = 1;
+    }
+
+  return !open_input ();
+}
+
+/* Entry point of word-split.
+
+   Parses the command line, selects the input (the files named after
+   the options, or stdin) and the output (the -o argument, or stdout),
+   then runs word_split () until every input has been processed.
+
+   Exits with status 0 on success and 1 on an invalid option, a file
+   that cannot be opened, or a failed write to the output.  */
+int
+main (int argc, char **argv)
+{
+  int c;
+
+  program_name = argv[0];
+
+  /* getopt_long returns -1 once all options have been consumed.  */
+  while ((c = getopt_long (argc, argv, "dho:v", options, NULL)) != -1)
+    {
+      switch (c)
+        {
+        case 'd':
+          error (0, 0, "warning: the --debug option is not yet supported");
+          /* FIXME */
+          break;
+
+        case PROGNAME_OPTION:
+          program_name = optarg;
+          break;
+
+        case 'h':
+          usage ();
+          exit (0);
+
+        case 'o':
+          output = fopen (optarg, "w");
+          if (!output)
+            error (1, errno, "cannot open output file %s", optarg);
+          break;
+
+        case 'v':
+          version_etc (stdout, "word-split", PACKAGE_NAME, VERSION,
+                       PACKAGE_AUTHOR, NULL);
+          exit (0);
+
+        default:
+          exit (1);
+        }
+    }
+
+  argc -= optind;
+  argv += optind;
+
+  if (argc)
+    {
+      input_file = argv;
+      open_input ();
+    }
+
+  if (!input)
+    input = stdin;
+  if (!output)
+    output = stdout;
+
+  while (word_split ())
+    ;
+
+  /* Detect failed writes (e.g. a full disk) instead of reporting
+     success: check the stream's error flag first, then the final
+     flush performed by fclose.  */
+  if (ferror (output))
+    error (1, 0, "write error");
+  if (fclose (output) != 0)
+    error (1, errno, "write error");
+
+  exit (0);
+}
+
+
+
+

Return to:

Send suggestions and report system problems to the System administrator.