aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergey Poznyakoff <gray@gnu.org.ua>2007-10-06 10:41:27 +0000
committerSergey Poznyakoff <gray@gnu.org.ua>2007-10-06 10:41:27 +0000
commit91bd0620c0216e548550d5f27a4ad89702b3acc3 (patch)
treefd971d6ae8771a321757530b69999f1d4772d104
parente3a58e9f3f662091c9b9eae44612196fce8392ea (diff)
downloadswis-91bd0620c0216e548550d5f27a4ad89702b3acc3.tar.gz
swis-91bd0620c0216e548550d5f27a4ad89702b3acc3.tar.bz2
* src/word-split.c (usage): add --min-length.
* src/Makefile.am (bin_PROGRAMS): Rename to libexec_PROGRAMS. (bin_PROGRAMS): new program `swis' (EXTRA_DIST): Add swis.in db.mysql swis.conf (swis): New goal * src/db.mysql: New file * src/swis.conf: New file * src/swis.in: New file git-svn-id: file:///svnroot/swis/trunk@26 05ba3e8d-823b-0410-8fb2-de0ee4edb5ba
-rw-r--r--src/Makefile.am12
-rw-r--r--src/db.mysql42
-rw-r--r--src/swis.conf18
-rw-r--r--src/swis.in256
-rw-r--r--src/word-split.c1
5 files changed, 327 insertions, 2 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index ae3d2bb..a34a2f2 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -16,8 +16,9 @@
noinst_LIBRARIES=libswis.a
MYSQL_BACKEND=mysql-backend
-bin_PROGRAMS=html-strip word-split @BUILD_BACKENDS@
+libexec_PROGRAMS=html-strip word-split @BUILD_BACKENDS@
EXTRA_PROGRAMS=mysql-backend
+bin_PROGRAMS=swis
html_strip_SOURCES=html-strip.l
@@ -34,5 +35,12 @@ libswis_a_SOURCES=\
INCLUDES=-I${top_srcdir}/gnu -I../gnu
LDADD=./libswis.a ../gnu/libgnu.a $(LIBICONV)
-EXTRA_DIST=swis.h
+EXTRA_DIST=swis.h swis.in db.mysql swis.conf
AM_LFLAGS=-d
+
+# Generate the swis script from swis.in, substituting the installation
+# directories for the =SYSCONFDIR= and =LIBEXECDIR= markers.  The output is
+# written to a temporary file and replaces swis only when it differs.
+# The explicit if/else keeps a failed sed from falling through to mv, and
+# removes the temporary file when nothing has changed.
+swis: ${top_srcdir}/src/swis.in
+	sed -e 's|This file is part of SWIS|&; Do not edit, it is generated automatically|' \
+	    -e 's|=SYSCONFDIR=|${sysconfdir}|g;s|=LIBEXECDIR=|${libexecdir}|g' \
+	    ${top_srcdir}/src/swis.in > $@-t && \
+	if cmp $@-t swis >/dev/null 2>&1; then rm -f $@-t; else mv $@-t swis; fi
+
diff --git a/src/db.mysql b/src/db.mysql
new file mode 100644
index 0000000..c73ccbf
--- /dev/null
+++ b/src/db.mysql
@@ -0,0 +1,42 @@
+# This file is part of SWIS
+# Copyright (C) 2007 Sergey Poznyakoff
+#
+# SWIS is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# SWIS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with SWIS. If not, see <http://www.gnu.org/licenses/>.
+
+SET NAMES utf8;
+# IF NOT EXISTS lets this script be re-run against an existing database,
+# in line with the DROP TABLE IF EXISTS statements below.
+CREATE DATABASE IF NOT EXISTS swis CHARACTER SET utf8 COLLATE utf8_general_ci;
+USE swis;
+
+# Table `url`: an auto-incremented id for each stored URL string.
+DROP TABLE IF EXISTS `url`;
+CREATE TABLE `url` (
+ `id` int(32) NOT NULL auto_increment,
+ `url` varchar (1024),
+ UNIQUE(`id`),
+ INDEX(`url`(1024))
+);
+
+# Table `words`: (word, url_id) pairs; url_id presumably refers to `url`.`id`
+# (no foreign key is declared -- confirm against the backend).
+DROP TABLE IF EXISTS `words`;
+CREATE TABLE `words` (
+ `word` varchar (128),
+ `url_id` int(32) NOT NULL,
+ INDEX(`word`),
+ INDEX(`url_id`)
+);
+
+# Table `text`: a text blob per url_id; url_id presumably refers to
+# `url`.`id` as well.
+DROP TABLE IF EXISTS `text`;
+CREATE TABLE `text` (
+ `text` blob (4096),
+ `url_id` int(32) NOT NULL,
+ INDEX(`url_id`)
+);
diff --git a/src/swis.conf b/src/swis.conf
new file mode 100644
index 0000000..0d360aa
--- /dev/null
+++ b/src/swis.conf
@@ -0,0 +1,18 @@
+directory ~/src/ellinika/html
+base-url http://ellinika.gnu.org.ua
+file-pattern *.html
+
+badwords-from badwords.en
+badwords-from badwords.pl
+badwords-from badwords.uk
+badwords-from badwords.ru
+
+min-length 2
+
+exclude '[0-9]+'
+#exclude-from exfile
+
+full-text yes
+
+backend mysql -uroot --socket=/tmp/mysql.sock --cleanup
+password-file $HOME/.sqlpass
diff --git a/src/swis.in b/src/swis.in
new file mode 100644
index 0000000..494d754
--- /dev/null
+++ b/src/swis.in
@@ -0,0 +1,256 @@
+#! /bin/sh
+# This file is part of SWIS
+# Copyright (C) 2007 Sergey Poznyakoff
+#
+# SWIS is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# SWIS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with SWIS. If not, see <http://www.gnu.org/licenses/>.
+
+CONFIG==SYSCONFDIR=/swis.conf
+PATH==LIBEXECDIR=:$PATH
+DEBUG=
+
+# error MESSAGE...
+# Print the arguments on standard error, prefixed with the program name.
+error() {
+    echo "$0: $*" >&2
+}
+
+die() {
+ code=$1
+ shift
+ error $*
+ exit $code
+}
+
+# absname arg
+absname() {
+ case $1 in
+ /*) echo $1;;
+ *) dir=`expr "$CONFIG" : '\(.*\)/.*'`
+ if [ -n "$dir" ]; then
+ echo $dir/$1
+ else
+ echo $1
+ fi
+ esac
+}
+
+# dirscan [dir [url]]
+dirscan() {
+ FNARG=
+ WSARG=$DEBUG
+ BACKEND=
+ PASSFILE=
+ DIR=
+ BASE_URL=
+
+ if [ -r $CONFIG ]; then
+ LINE=0
+ while read keyword arg rest
+ do
+ LINE=$(($LINE + 1))
+ case $keyword in
+ \#*) continue;;
+ "") continue;;
+ directory)
+ DIR=$arg;;
+ file-pattern)
+ FNARG="$FNARG -name '$arg'"
+ ;;
+ find-option)
+ FNARG="$FNARG $arg $rest"
+ ;;
+ base-url)
+ BASE_URL=$arg
+ ;;
+ badwords-from)
+ WSARG="$WSARG --badword=`absname $arg`"
+ for arg in $rest
+ do
+ WSARG="$WSARG --badword=`absname $arg`"
+ done
+ ;;
+
+ min-length)
+ WSARG="$WSARG --min-length=$arg"
+ ;;
+ exclude)
+ WSARG="$WSARG --exclude=$arg"
+ ;;
+ exclude-from)
+ WSARG="$WSARG --exclude-from=`absname $arg`"
+ for arg in $rest
+ do
+ WSARG="$WSARG --exclude-from=`absname $arg`"
+ done
+ ;;
+ full-text)
+ if [ $arg = yes ]; then
+ WSARG="$WSARG --full-text"
+ fi
+ ;;
+ backend)
+ BACKEND="${arg}-backend $rest"
+ ;;
+ password-file)
+ eval PASSFILE=$arg;;
+ *) die 1 "$CONFIG:$LINE: unknown keyword"
+ esac
+ done < $CONFIG
+ else
+ error "warning: configuration file $CONFIG does not exist or is unreadable"
+ fi
+
+ if [ -z "$FNARG" ]; then
+ FNARG="-name '*.html'"
+ fi
+
+ if [ -n "$PASSFILE" ]; then
+ if [ -n "$BACKEND" ]; then
+ BACKEND="$BACKEND --pass-from-file='$PASSFILE'"
+ fi
+ fi
+
+ if [ -z "$BACKEND" ]; then
+ BACKEND=cat
+ fi
+
+ # Process arguments
+ [ -n "$1" ] && DIR=$1
+ [ -n "$2" ] && BASE_URL=$2
+
+ case $DIR in
+ "") die 1 "Source directory not given"
+ ;;
+ */) ;;
+ *) DIR=$DIR/
+ esac
+
+ case $BASE_URL in
+ "") die 1 "Base URL not given"
+ ;;
+ */) ;;
+ *) BASE_URL=$BASE_URL/
+ esac
+
+ eval find $DIR -type f $FNARG |
+ html-strip -t -T - $DEBUG |
+ eval word-split -t -u $WSARG |
+ sed "s,> $DIR,> $BASE_URL," |
+ eval $BACKEND $DEBUG
+}
+
+# Scratch directory used by webscan; its name includes the process ID of
+# the running shell.
+TMPDIR=/tmp/swis.$$
+
+# cleanup
+# Remove the scratch directory together with its contents.
+cleanup() {
+    rm -r -f $TMPDIR
+}
+
+# webscan url
+# Download URL recursively with wget into the scratch directory $TMPDIR,
+# index the downloaded files with dirscan using URL as the base URL, then
+# remove the scratch directory.  Exits with status 1 if the scratch
+# directory cannot be created.
+webscan() {
+    # die takes the exit code as its first argument.
+    mkdir $TMPDIR || die 1 "cannot create temporary directory"
+    # Remove the scratch directory on HUP, INT, PIPE and TERM as well.
+    trap cleanup 1 2 13 15
+    # The URL is quoted: it may contain `?' or `*', which the shell would
+    # otherwise treat as glob characters.
+    wget -q -r -nH -P$TMPDIR "$1"
+    dirscan $TMPDIR "$1"
+    cleanup
+}
+
+###############################################################################
+# Main
+###############################################################################
+
+# usage
+# Write the help summary to standard output, followed by the last line
+# printed by `html-strip --help'.
+usage() {
+    cat <<EOT
+Usage: swis [OPTIONS] COMMAND [args]
+Index given URL or directory.
+
+Options are:
+ -d, --debug increase debugging level
+ -c, --config=FILE read configuration from FILE
+ -h, --help show this help summary
+ -v, --version print program version and exit
+
+Commands are:
+ web URL index given URL
+ dir [DIR [URL]] index directory DIR
+
+EOT
+    # Display bug reporting address:
+    html-strip --help | tail -n 1
+}
+
+longopt() {
+ case ${1##--} in
+ h|he|hel|help)
+ usage
+ exit 0
+ ;;
+ d|de|deb|debu|debug)
+ [ -z "$DEBUG" ] && DEBUG=-
+ DEBUG="${DEBUG}d"
+ ;;
+ v|ve|ver|vers|versi|versio|version)
+ html-strip --version | sed "s|html-strip|$0|g"
+ exit 0
+ ;;
+ c|co|conf|confi|config)
+ CONFIG=$2
+ ;;
+ *)
+ error "unknown long option: --$1"
+ exit 1
+ esac
+}
+
+while getopts "c:dhv-:" OPTION
+do
+ case $OPTION in
+ c) longopt config $OPTARG;;
+ h) longopt help;;
+ d) longopt debug;;
+ v) longopt version;;
+ -) case $OPTARG in
+ *=*) arg=${OPTARG##*=}
+ OPTARG=${OPTARG%%=*}
+ shift $(($OPTIND - 1))
+ set -- --$OPTARG $arg $*
+ OPTIND=2
+ esac
+ case ${OPTARG} in
+ c|co|conf|confi|config)
+ shift $(($OPTIND - 1))
+ longopt $OPTARG $1
+ shift
+ OPTIND=0
+ ;;
+ *)
+ longopt $OPTARG
+ ;;
+ esac
+ ;;
+ esac
+done
+
+# Drop the parameters consumed by option parsing; what remains is
+# COMMAND [args].
+shift $(($OPTIND - 1))
+
+if [ $# -eq 0 ]; then
+    error "not enough arguments"
+    exit 1
+fi
+
+# Dispatch on the command name, handing the remaining arguments over
+# unchanged.
+case $1 in
+dir) shift; dirscan "$@";;
+web) shift; webscan "$@";;
+*)   error "unrecognized command $1"
+     # An unknown command is a usage error; do not report success.
+     exit 1
+esac
+
+exit 0
+# End
+
diff --git a/src/word-split.c b/src/word-split.c
index 05c73fe..60cefcc 100644
--- a/src/word-split.c
+++ b/src/word-split.c
@@ -51,6 +51,7 @@ usage ()
printf (" -o, --output=FILE direct output to FILE instead of stdout\n");
printf (" --exclude=REGEX exclude words matching REGEX\n");
printf (" --exclude-from=FILE read list of regexps to exclude from FILE\n");
+ printf (" -m, --min-length=NUMBER set minimal word length\n");
printf (" -t, --tag preserve file name tags\n");
printf (" -T, --from-file=FILE read input file names from FILE\n");
printf (" -0, --null -T reads null-terminated names\n");

Return to:

Send suggestions and report system problems to the System administrator.