diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2007-10-03 15:43:02 +0000 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2007-10-03 15:43:02 +0000 |
commit | 8cca58c9aad75d3c5a2faa3bd39a9479798254be (patch) | |
tree | 64745dd2780d15d28432596895ef7573b53c02eb | |
parent | 36b9df2b2727530faaa6865c32c08d9396c3ac67 (diff) | |
download | swis-8cca58c9aad75d3c5a2faa3bd39a9479798254be.tar.gz swis-8cca58c9aad75d3c5a2faa3bd39a9479798254be.tar.bz2 |
* src/word-split.c: New file.
* src/Makefile.am: Add word-split
* src/swis.h: Include errno.h
* src/html-strip.l: Minor fixes.
git-svn-id: file:///svnroot/swis/trunk@9 05ba3e8d-823b-0410-8fb2-de0ee4edb5ba
-rw-r--r-- | ChangeLog | 5 | ||||
-rw-r--r-- | src/Makefile.am | 3 | ||||
-rw-r--r-- | src/html-strip.l | 9 | ||||
-rw-r--r-- | src/swis.h | 2 | ||||
-rw-r--r-- | src/word-split.c | 192 |
5 files changed, 206 insertions, 5 deletions
@@ -1,5 +1,10 @@ 2007-10-03 Sergey Poznyakoff <gray@gnu.org.ua> + * src/word-split.c: New file. + * src/Makefile.am: Add word-split + * src/swis.h: Include errno.h + * src/html-strip.l: Minor fixes. + * README-hacking: New file * README: Initial edit. diff --git a/src/Makefile.am b/src/Makefile.am index e141b6b..1086aa7 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -14,8 +14,9 @@ # You should have received a copy of the GNU General Public License # along with SWIS. If not, see <http://www.gnu.org/licenses/>. -bin_PROGRAMS=html-strip +bin_PROGRAMS=html-strip word-split html_strip_SOURCES=html-strip.l +word_split_SOURCES=word-split.c INCLUDES=-I${top_srcdir}/gnu -I../gnu LIBS=../gnu/libgnu.a diff --git a/src/html-strip.l b/src/html-strip.l index bc7b9f8..b42f618 100644 --- a/src/html-strip.l +++ b/src/html-strip.l @@ -60,7 +60,8 @@ parse_content_type() memmove (buf, start, p - start + 1); if (strcasecmp (buf, "utf-8")) { - error(0,0,"enabling conversion %s->%s\n", buf,"utf-8"); + if (yy_flex_debug) + fprintf (stderr, "enabling conversion %s->%s\n", buf,"utf-8"); cd = iconv_open ("UTF-8", buf); if (cd == INVALID_ICONV_CD) error (0, errno, "cannot convert from %s", buf); @@ -98,7 +99,7 @@ convert_output () error (1, errno, "write error"); errno = saved_errno; } - if (rc == INVALID_ICONV_CD) + if (rc == (size_t) -1) { if (errno == EILSEQ) error (1, 0, "cannot convert \"%.*s\"", idx, inbuf); @@ -245,7 +246,7 @@ usage () printf ("Usage: html-strip [OPTIONS] [FILES...]\n"); printf ("Strip off HTML tags from input files and convert them to UTF-8\n"); printf ("\nOptions are:\n"); - printf (" -d, --debug output debugging info 1\n"); + printf (" -d, --debug output debugging info\n"); printf (" -o, --output=FILE direct output to FILE instead of stdout\n"); printf ("\n"); printf (" -h, --help print this help list\n"); @@ -303,7 +304,7 @@ main (int argc, char **argv) open_input (); } - while (yylex()) + while (yylex ()) ; exit (0); } @@ -21,6 +21,8 @@ #include <getopt.h> 
#include <string.h>
 #include <ctype.h>
+#include <errno.h>
+#include <locale.h>
 #include "error.h"
 #include "mbchar.h"
 #include "mbswidth.h"
diff --git a/src/word-split.c b/src/word-split.c
new file mode 100644
index 0000000..59a05b7
--- /dev/null
+++ b/src/word-split.c
@@ -0,0 +1,234 @@
+/* This file is part of SWIS
+   Copyright (C) 2007 Sergey Poznyakoff
+
+   SWIS is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   SWIS is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with SWIS. If not, see <http://www.gnu.org/licenses/>. */
+
+#include "swis.h"
+
+/* Value getopt_long returns for --progname, which has no short form.  */
+enum {
+  PROGNAME_OPTION,
+};
+
+/* Long options accepted by word-split.  */
+struct option options[] = {
+  { "progname", required_argument, NULL, PROGNAME_OPTION },
+  { "debug", no_argument, NULL, 'd' },
+  { "help", no_argument, NULL, 'h' },
+  { "version", no_argument, NULL, 'v' },
+  { "output", required_argument, NULL, 'o' },
+  { NULL }
+};
+
+DECL_COPYRIGHT;
+
+/* Print the usage summary on stdout.  */
+void
+usage ()
+{
+  printf ("Usage: word-split [OPTIONS] [FILES...]\n");
+  printf ("Split UTF-8 input into words\n");
+  printf ("\nOptions are:\n");
+  /* printf (" -d, --debug output debugging info\n"); */
+  printf (" -o, --output=FILE direct output to FILE instead of stdout\n");
+  printf ("\n");
+  printf (" -h, --help print this help list\n");
+  printf (" -v, --version print program version and exit\n");
+  printf ("\n");
+  printf ("Report bugs to <%s>\n", PACKAGE_BUGREPORT);
+}
+
+/* Remaining input file names, or NULL if none were given.  */
+char **input_file;
+/* Stream currently being read.  */
+FILE *input;
+/* Stream the words are written to.  */
+FILE *output;
+
+/* Make the next file named in input_file the current input; the name
+   "-" selects stdin.  A previously opened input file is closed first
+   so that processing many files does not exhaust file descriptors.
+   Return 0 if a new input was selected, 1 if no names remain.  Exit
+   with an error message if the file cannot be opened.  */
+int
+open_input ()
+{
+  if (input_file && *input_file)
+    {
+      char *name = *input_file++;
+
+      if (input && input != stdin)
+        fclose (input);
+      if (name[0] == '-' && name[1] == 0)
+        input = stdin;
+      else
+        {
+          input = fopen (name, "r");
+          if (!input)
+            error (1, errno, "cannot open input file %s", name);
+        }
+      return 0;
+    }
+  return 1;
+}
+
+/* Nonzero if the last thing written to output was a word terminator, or
+   if nothing has been written yet.  Used to emit a single newline for a
+   run of delimiters.  */
+static int after_delim_output = 1;
+
+/* Copy the current input to output, writing each word on a line of its
+   own.  A word is a run of ASCII alphanumerics and multibyte UTF-8
+   sequences; any other ASCII byte is a delimiter.  Bytes that cannot
+   start a UTF-8 sequence are reported and copied through.  At end of
+   file a pending word is terminated, so that the last word of one file
+   is not glued to the first word of the next.  Return nonzero if another
+   input file was opened and the caller should call again, 0 otherwise.  */
+int
+word_split ()
+{
+  /* int, not unsigned: fgetc returns int so that EOF is distinguishable
+     from every byte value.  */
+  int wc;
+
+  while ((wc = fgetc (input)) != EOF)
+    {
+      if (wc < 0x80)
+        {
+          if (isalnum (wc))
+            {
+              fputc (wc, output);
+            }
+          else
+            {
+              if (!after_delim_output)
+                {
+                  fputc ('\n', output);
+                  after_delim_output = 1;
+                }
+              continue;
+            }
+        }
+      else
+        {
+          /* Sequence length implied by the lead byte.  */
+          int count;
+
+          if (0xc2 <= wc && wc <= 0xdf)
+            count = 2;
+          else if (0xe0 <= wc && wc <= 0xef)
+            count = 3;
+          else if (0xf0 <= wc && wc <= 0xf4)
+            count = 4;
+          else
+            {
+              /* FIXME: don't know what to do */
+              error (0, 0, "unknown UTF-8 char: %x", (unsigned) wc);
+              fputc (wc, output);
+              continue;
+            }
+
+          fputc (wc, output);
+          /* Copy the remaining bytes of the sequence verbatim.  */
+          while (--count)
+            {
+              if ((wc = fgetc (input)) == EOF)
+                {
+                  error (1, 0, "unexpected end of file");
+                  break;
+                }
+              fputc (wc, output);
+            }
+        }
+      after_delim_output = 0;
+    }
+
+  /* End of input also ends a word.  */
+  if (!after_delim_output)
+    {
+      fputc ('\n', output);
+      after_delim_output = 1;
+    }
+  return !open_input ();
+}
+
+/* Parse the command line, set up the input and output streams and split
+   every input into words.  */
+int
+main (int argc, char **argv)
+{
+  int c;
+
+  program_name = argv[0];
+
+  /* getopt_long signals the end of the options with -1.  */
+  while ((c = getopt_long (argc, argv, "dho:v", options, NULL)) != -1)
+    {
+      switch (c)
+        {
+        case 'd':
+          error (0, 0, "warning: the --debug option is not yet supported");
+          /* FIXME */
+          break;
+
+        case PROGNAME_OPTION:
+          program_name = optarg;
+          break;
+
+        case 'h':
+          usage ();
+          exit (0);
+
+        case 'o':
+          output = fopen (optarg, "w");
+          if (!output)
+            error (1, errno, "cannot open output file %s", optarg);
+          break;
+
+        case 'v':
+          version_etc (stdout, "word-split", PACKAGE_NAME, VERSION,
+                       PACKAGE_AUTHOR, NULL);
+          exit (0);
+
+        default:
+          exit (1);
+        }
+    }
+
+  argc -= optind;
+  argv += optind;
+
+  if (argc)
+    {
+      input_file = argv;
+      open_input ();
+    }
+
+  if (!input)
+    input = stdin;
+  if (!output)
+    output = stdout;
+
+  while (word_split ())
+    ;
+
+  /* Write errors on a buffered stream may only show up when it is
+     flushed, so check before reporting success.  */
+  if (ferror (output))
+    error (1, 0, "write error");
+  if (fclose (output) != 0)
+    error (1, errno, "write error");
+
+  exit (0);
+}