diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2007-10-06 10:41:27 +0000 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2007-10-06 10:41:27 +0000 |
commit | 91bd0620c0216e548550d5f27a4ad89702b3acc3 (patch) | |
tree | fd971d6ae8771a321757530b69999f1d4772d104 | |
parent | e3a58e9f3f662091c9b9eae44612196fce8392ea (diff) | |
download | swis-91bd0620c0216e548550d5f27a4ad89702b3acc3.tar.gz swis-91bd0620c0216e548550d5f27a4ad89702b3acc3.tar.bz2 |
* src/word-split.c (usage): add --min-length.
* src/Makefile.am (bin_PROGRAMS): Rename to libexec_PROGRAMS.
(bin_PROGRAMS): new program `swis'
(EXTRA_DIST): Add swis.in db.mysql swis.conf
(swis): New goal
* src/db.mysql: New file
* src/swis.conf: New file
* src/swis.in: New file
git-svn-id: file:///svnroot/swis/trunk@26 05ba3e8d-823b-0410-8fb2-de0ee4edb5ba
-rw-r--r-- | src/Makefile.am | 12 | ||||
-rw-r--r-- | src/db.mysql | 42 | ||||
-rw-r--r-- | src/swis.conf | 18 | ||||
-rw-r--r-- | src/swis.in | 256 | ||||
-rw-r--r-- | src/word-split.c | 1 |
5 files changed, 327 insertions, 2 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index ae3d2bb..a34a2f2 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -16,8 +16,9 @@ noinst_LIBRARIES=libswis.a
 
 MYSQL_BACKEND=mysql-backend
 
-bin_PROGRAMS=html-strip word-split @BUILD_BACKENDS@
+libexec_PROGRAMS=html-strip word-split @BUILD_BACKENDS@
 EXTRA_PROGRAMS=mysql-backend
+bin_PROGRAMS=swis
 
 html_strip_SOURCES=html-strip.l
 
@@ -34,5 +35,14 @@ libswis_a_SOURCES=\
 
 INCLUDES=-I${top_srcdir}/gnu -I../gnu
 LDADD=./libswis.a ../gnu/libgnu.a $(LIBICONV)
-EXTRA_DIST=swis.h
+EXTRA_DIST=swis.h swis.in db.mysql swis.conf
 AM_LFLAGS=-d
+
+# Generate swis from swis.in, filling in the installation directories.  sed is
+# a recipe line of its own, so that its failure stops make before mv can run.
+swis: ${top_srcdir}/src/swis.in
+	sed -e 's|This file is part of SWIS|&; Do not edit, it is generated automatically|'\
+	    -e 's|=SYSCONFDIR=|${sysconfdir}|g;s|=LIBEXECDIR=|${libexecdir}|g' \
+	    ${top_srcdir}/src/swis.in > $@-t
+	cmp $@-t swis >/dev/null 2>&1 || mv $@-t swis
+
diff --git a/src/db.mysql b/src/db.mysql
new file mode 100644
index 0000000..c73ccbf
--- /dev/null
+++ b/src/db.mysql
@@ -0,0 +1,42 @@
+# This file is part of SWIS
+# Copyright (C) 2007 Sergey Poznyakoff
+#
+# SWIS is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# SWIS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with SWIS. If not, see <http://www.gnu.org/licenses/>.
+
+SET NAMES utf8;
+CREATE DATABASE IF NOT EXISTS swis CHARACTER SET utf8 COLLATE utf8_general_ci;
+USE swis;
+
+DROP TABLE IF EXISTS `url`;
+CREATE TABLE `url` (            # one row per indexed URL
+  `id` int(32) NOT NULL auto_increment,
+  `url` varchar (1024),
+  PRIMARY KEY (`id`),
+  INDEX(`url`(255))             # utf8 prefix kept under the 767-byte key limit
+);
+
+DROP TABLE IF EXISTS `words`;
+CREATE TABLE `words` (          # (word, url_id) pairs; url_id presumably is url.id
+  `word` varchar (128),
+  `url_id` int(32) NOT NULL,
+  INDEX(`word`),
+  INDEX(`url_id`)
+);
+
+DROP TABLE IF EXISTS `text`;
+CREATE TABLE `text` (           # a blob of text per url_id
+  `text` blob (4096),
+  `url_id` int(32) NOT NULL,
+  INDEX(`url_id`)
+);
diff --git a/src/swis.conf b/src/swis.conf
new file mode 100644
index 0000000..0d360aa
--- /dev/null
+++ b/src/swis.conf
@@ -0,0 +1,18 @@
+directory ~/src/ellinika/html
+base-url http://ellinika.gnu.org.ua
+file-pattern *.html
+
+badwords-from badwords.en
+badwords-from badwords.pl
+badwords-from badwords.uk
+badwords-from badwords.ru
+
+min-length 2
+
+exclude '[0-9]+'
+#exclude-from exfile
+
+full-text yes
+
+backend mysql -uroot --socket=/tmp/mysql.sock --cleanup
+password-file $HOME/.sqlpass
diff --git a/src/swis.in b/src/swis.in
new file mode 100644
index 0000000..494d754
--- /dev/null
+++ b/src/swis.in
@@ -0,0 +1,296 @@
+#! /bin/sh
+# This file is part of SWIS
+# Copyright (C) 2007 Sergey Poznyakoff
+#
+# SWIS is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 3, or (at your option)
+# any later version.
+#
+# SWIS is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with SWIS. If not, see <http://www.gnu.org/licenses/>.
+
+# Default configuration file and helper program search path.  The SYSCONFDIR
+# and LIBEXECDIR placeholders (delimited by equals signs) are filled in by sed
+# when src/Makefile.am generates swis from swis.in.
+CONFIG==SYSCONFDIR=/swis.conf
+PATH==LIBEXECDIR=:$PATH
+# Debugging option handed to the helper programs: empty, -d, -dd, and so on.
+DEBUG=
+
+# error message...
+# Print MESSAGE on standard error, prefixed with the program name.
+error() {
+    echo >&2 "$0: $*"
+}
+
+# die code message...
+# Print MESSAGE using error, then exit with status CODE.
+die() {
+    code=$1
+    shift
+    error "$@"
+    exit $code
+}
+
+# absname arg
+# Print ARG unchanged if it is an absolute file name.  Otherwise print it
+# relative to the directory part of $CONFIG, when $CONFIG has one.
+absname() {
+    case $1 in
+    /*) echo "$1";;
+    *)  dir=`expr "$CONFIG" : '\(.*\)/.*'`
+        if [ -n "$dir" ]; then
+            echo "$dir/$1"
+        else
+            echo "$1"
+        fi
+    esac
+}
+
+# dirscan [dir [url]]
+# Index the files found under DIR and report them under URL.  Settings come
+# from $CONFIG; DIR and URL, when given, override its directory and base-url
+# statements.
+dirscan() {
+    FNARG=          # find expressions selecting the files to index
+    WSARG=$DEBUG    # options for word-split
+    BACKEND=        # command that ends the pipeline
+    PASSFILE=       # argument of the password-file statement
+    DIR=
+    BASE_URL=
+
+    if [ -r "$CONFIG" ]; then
+        LINE=0
+        while read keyword arg rest
+        do
+            LINE=$(($LINE + 1))
+            case $keyword in
+            \#*) continue;;
+            "")  continue;;
+            directory)
+                DIR=$arg;;
+            file-pattern)
+                FNARG="$FNARG -name '$arg'"
+                ;;
+            find-option)
+                FNARG="$FNARG $arg $rest"
+                ;;
+            base-url)
+                BASE_URL=$arg
+                ;;
+            badwords-from)
+                WSARG="$WSARG --badword=`absname $arg`"
+                for arg in $rest
+                do
+                    WSARG="$WSARG --badword=`absname $arg`"
+                done
+                ;;
+
+            min-length)
+                WSARG="$WSARG --min-length=$arg"
+                ;;
+            exclude)
+                WSARG="$WSARG --exclude=$arg"
+                ;;
+            exclude-from)
+                WSARG="$WSARG --exclude-from=`absname $arg`"
+                for arg in $rest
+                do
+                    WSARG="$WSARG --exclude-from=`absname $arg`"
+                done
+                ;;
+            full-text)
+                # Quoted so that a statement without an argument is not a
+                # syntax error in test.
+                if [ "$arg" = yes ]; then
+                    WSARG="$WSARG --full-text"
+                fi
+                ;;
+            backend)
+                BACKEND="${arg}-backend $rest"
+                ;;
+            password-file)
+                # eval expands shell constructs such as $HOME in the file
+                # name, so the configuration file must be trusted.
+                eval PASSFILE=$arg;;
+            *)  die 1 "$CONFIG:$LINE: unknown keyword"
+            esac
+        done < "$CONFIG"
+    else
+        error "warning: configuration file $CONFIG does not exist or is unreadable"
+    fi
+
+    if [ -z "$FNARG" ]; then
+        FNARG="-name '*.html'"
+    fi
+
+    if [ -n "$PASSFILE" ]; then
+        if [ -n "$BACKEND" ]; then
+            BACKEND="$BACKEND --pass-from-file='$PASSFILE'"
+        fi
+    fi
+
+    if [ -z "$BACKEND" ]; then
+        BACKEND=cat
+    fi
+
+    # Process arguments
+    [ -n "$1" ] && DIR=$1
+    [ -n "$2" ] && BASE_URL=$2
+
+    case $DIR in
+    "") die 1 "Source directory not given"
+        ;;
+    */) ;;
+    *)  DIR=$DIR/
+    esac
+
+    case $BASE_URL in
+    "") die 1 "Base URL not given"
+        ;;
+    */) ;;
+    *)  BASE_URL=$BASE_URL/
+    esac
+
+    # FNARG, WSARG and BACKEND carry their own quoting, hence the evals.
+    eval find $DIR -type f $FNARG |
+     html-strip -t -T - $DEBUG |
+     eval word-split -t -u $WSARG |
+     sed "s,> $DIR,> $BASE_URL," |
+     eval $BACKEND $DEBUG
+}
+
+# Scratch directory that webscan downloads into.
+TMPDIR=/tmp/swis.$$
+# Remove the scratch directory.
+cleanup() {
+    rm -rf $TMPDIR
+}
+
+# webscan url
+# Mirror URL into the scratch directory with wget, then index the copy.
+webscan() {
+    [ -n "$1" ] || die 1 "URL not given"
+    mkdir $TMPDIR || die 1 "cannot create temporary directory"
+    # Remove the scratch directory however the script ends.  Signals 1, 2,
+    # 13 and 15 (HUP, INT, PIPE, TERM) are turned into an exit, so that the
+    # scan does not carry on after an interrupt.
+    trap cleanup 0
+    trap 'exit 1' 1 2 13 15
+    wget -q -r -nH -P$TMPDIR $1
+    dirscan $TMPDIR $1
+    cleanup
+}
+
+###############################################################################
+# Main
+###############################################################################
+
+# usage
+# Print the help summary, followed by the last line of html-strip --help.
+usage() {
+    cat - <<EOT
+Usage: swis [OPTIONS] COMMAND [args]
+Index given URL or directory.
+
+Options are:
+ -d, --debug increase debugging level
+ -c, --config=FILE read configuration from FILE
+ -h, --help show this help summary
+ -v, --version print program version and exit
+
+Commands are:
+ web URL index given URL
+ dir [DIR [URL]] index directory DIR
+
+EOT
+    # Display bug reporting address:
+    html-strip --help | tail -n 1
+}
+
+# longopt name [arg]
+# Process the long option NAME; a leading -- is stripped before matching.
+# The accepted spellings are the prefixes listed in the patterns below.
+# ARG is the option argument and is used by config only.
+longopt() {
+    case ${1##--} in
+    h|he|hel|help)
+        usage
+        exit 0
+        ;;
+    d|de|deb|debu|debug)
+        [ -z "$DEBUG" ] && DEBUG=-
+        DEBUG="${DEBUG}d"
+        ;;
+    v|ve|ver|vers|versi|versio|version)
+        html-strip --version | sed "s|html-strip|$0|g"
+        exit 0
+        ;;
+    c|co|con|conf|confi|config)
+        CONFIG=$2
+        ;;
+    *)
+        error "unknown long option: --$1"
+        exit 1
+    esac
+}
+
+# Long options are parsed by declaring - as a short option that takes an
+# argument: for --NAME getopts returns - with OPTARG set to NAME, and for
+# --NAME=VALUE with OPTARG set to NAME=VALUE.
+while getopts "c:dhv-:" OPTION
+do
+    case $OPTION in
+    c)  longopt config "$OPTARG";;
+    h)  longopt help;;
+    d)  longopt debug;;
+    v)  longopt version;;
+    -)  case $OPTARG in
+        *=*)
+            # Rewrite --NAME=VALUE as the two words --NAME VALUE.  VALUE is
+            # everything after the first equals sign, so it may contain more.
+            arg=${OPTARG#*=}
+            OPTARG=${OPTARG%%=*}
+            shift $(($OPTIND - 1))
+            set -- "--$OPTARG" "$arg" "$@"
+            OPTIND=2
+        esac
+        case ${OPTARG} in
+        c|co|con|conf|confi|config)
+            # Consume the option and its argument, then restart getopts on
+            # the remaining words (POSIX defines 1 as the reset value).
+            shift $(($OPTIND - 1))
+            longopt "$OPTARG" "$1"
+            shift
+            OPTIND=1
+            ;;
+        *)
+            longopt "$OPTARG"
+            ;;
+        esac
+        ;;
+    esac
+done
+
+shift $(($OPTIND - 1))
+
+if [ $# -eq 0 ]; then
+    error "not enough arguments"
+    exit 1
+fi
+
+case $1 in
+dir) shift; dirscan "$@";;
+web) shift; webscan "$@";;
+*)   die 1 "unrecognized command $1"
+esac
+
+exit 0
+# End
+
diff --git a/src/word-split.c b/src/word-split.c index 05c73fe..60cefcc 100644 --- a/src/word-split.c +++ b/src/word-split.c @@ -51,6 +51,7 @@ usage () printf (" -o, --output=FILE direct output to FILE instead of stdout\n"); printf (" --exclude=REGEX exclude words matching REGEX\n"); printf (" --exclude-from=FILE read list of regexps to exclude from FILE\n"); + printf (" -m, --min-length=NUMBER set minimal word length\n"); printf (" -t, --tag preserve file name tags\n"); printf (" -T, --from-file=FILE read input file names from FILE\n"); printf (" -0, --null -T reads null-terminated names\n"); |