From 24abd7afb6530620c4a80c14fdce96eead280ef7 Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Thu, 8 May 2014 14:26:32 +0300 Subject: Use Sphinx for searches. * sphinx/index.php: New file. * sphinx/xmlindex: New file. * xml/lingua.conf.in: Use own search instead of Google. (install-srch): New macro. * xml/pl/ellinika.xml: Call (install-srch). Build search page template. * xml/ru/ellinika.xml: Likewise. * xml/uk/ellinika.xml: Likewise. * po/pl.po: Update. * po/ru.po: Update. * po/uk.po: Update. --- po/pl.po | 10 +- po/ru.po | 14 +-- po/uk.po | 14 +-- sphinx/index.php | 158 ++++++++++++++++++++++++++++ sphinx/xmlindex | 296 ++++++++++++++++++++++++++++++++++++++++++++++++++++ xml/lingua.conf.in | 22 ++-- xml/pl/ellinika.xml | 9 ++ xml/ru/ellinika.xml | 9 ++ xml/uk/ellinika.xml | 9 ++ 9 files changed, 512 insertions(+), 29 deletions(-) create mode 100644 sphinx/index.php create mode 100755 sphinx/xmlindex diff --git a/po/pl.po b/po/pl.po index f5b9957..7aa9079 100644 --- a/po/pl.po +++ b/po/pl.po @@ -7,14 +7,14 @@ msgid "" msgstr "" "Project-Id-Version: Ellinika 1.0\n" "Report-Msgid-Bugs-To: gray@gnu.org\n" -"POT-Creation-Date: 2011-06-20 13:02+0300\n" +"POT-Creation-Date: 2014-05-04 10:23+0300\n" "PO-Revision-Date: 2011-06-20 13:04+0300\n" "Last-Translator: Sergey Poznyakoff \n" "Language-Team: Polish \n" +"Language: pl\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -"Language: pl\n" "Plural-Forms: nplurals=3; plural=n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 " "|| n%100>=20) ? 1 : 2;\n" @@ -119,7 +119,7 @@ msgstr "" msgid "Μη έγκυρη είσοδος" msgstr "Niepoprawne wejście" -#: src/cgi-bin/conj.scm4:334 +#: src/cgi-bin/conj.scm4:336 msgid "" "Στην λέξη εισαγωγής δεν υπάρχει τόνος. Μήπος θέλατε να κλίσετε ένα απ'αυτά " "τα ρήματα:" @@ -127,12 +127,12 @@ msgstr "" "W tym słowie brak akcentów. 
Czy chodziło Ci o jeden z następujących\n" "rzeczowników:" -#: src/cgi-bin/conj.scm4:367 +#: src/cgi-bin/conj.scm4:369 msgid "" "Αυτή η λέξη δεν είναι ρήμα στο πρώτο ενικό πρόσωπο της οριστικής του " "ενεστώτα." msgstr "Podane słowo nie jest rzeczownikiem w formie 1.os. liczby pojedynczej." -#: src/cgi-bin/conj.scm4:387 +#: src/cgi-bin/conj.scm4:389 msgid "Το θέμα αυτού του χρόνου δεν επιβεβαιώνεται από τη βάση δεδοµένων" msgstr "" diff --git a/po/ru.po b/po/ru.po index d6517cd..be9d470 100644 --- a/po/ru.po +++ b/po/ru.po @@ -7,16 +7,16 @@ msgid "" msgstr "" "Project-Id-Version: Ellinika 1.0\n" "Report-Msgid-Bugs-To: gray@gnu.org\n" -"POT-Creation-Date: 2011-06-20 13:02+0300\n" +"POT-Creation-Date: 2014-05-04 10:23+0300\n" "PO-Revision-Date: 2006-03-17 21:42+0200\n" "Last-Translator: Sergey Poznyakoff , 2004.\n" "Language-Team: Russian \n" +"Language: ru\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -"Language: ru\n" -"Plural-Forms: nplurals=3; plural=n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%" -"10<=4 && (n%100<10 || n%100>=20) ? 1 : 2;\n" +"Plural-Forms: nplurals=3; plural=n%10==1 && n%100!=11 ? 0 : n%10>=2 && n" +"%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2;\n" "\n" #: src/cgi-bin/dict.scm4:40 src/cgi-bin/conj.scm4:43 @@ -122,18 +122,18 @@ msgstr "" msgid "Μη έγκυρη είσοδος" msgstr "" -#: src/cgi-bin/conj.scm4:334 +#: src/cgi-bin/conj.scm4:336 msgid "" "Στην λέξη εισαγωγής δεν υπάρχει τόνος. Μήπος θέλατε να κλίσετε ένα απ'αυτά " "τα ρήματα:" msgstr "" -#: src/cgi-bin/conj.scm4:367 +#: src/cgi-bin/conj.scm4:369 msgid "" "Αυτή η λέξη δεν είναι ρήμα στο πρώτο ενικό πρόσωπο της οριστικής του " "ενεστώτα." 
msgstr "" -#: src/cgi-bin/conj.scm4:387 +#: src/cgi-bin/conj.scm4:389 msgid "Το θέμα αυτού του χρόνου δεν επιβεβαιώνεται από τη βάση δεδοµένων" msgstr "" diff --git a/po/uk.po b/po/uk.po index 3bd8d08..3f31d4d 100644 --- a/po/uk.po +++ b/po/uk.po @@ -7,16 +7,16 @@ msgid "" msgstr "" "Project-Id-Version: Ellinika 1.0\n" "Report-Msgid-Bugs-To: gray@gnu.org\n" -"POT-Creation-Date: 2011-06-20 13:02+0300\n" +"POT-Creation-Date: 2014-05-04 10:23+0300\n" "PO-Revision-Date: 2006-03-24 19:25+0200\n" "Last-Translator: Sergey Poznyakoff \n" "Language-Team: Ukrainian \n" +"Language: uk\n" "MIME-Version: 1.0\n" "Content-Type: text/plain; charset=UTF-8\n" "Content-Transfer-Encoding: 8bit\n" -"Language: uk\n" -"Plural-Forms: nplurals=3; plural=(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%" -"10<=4 && (n%100<10 || n%100>=20) ? 1 : 2);\n" +"Plural-Forms: nplurals=3; plural=(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n" +"%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2);\n" #: src/cgi-bin/dict.scm4:40 src/cgi-bin/conj.scm4:43 msgid "ΣΦΆΛΜΑ: σύνδεση με το λέξικο απέτυχε." @@ -121,18 +121,18 @@ msgstr "" msgid "Μη έγκυρη είσοδος" msgstr "" -#: src/cgi-bin/conj.scm4:334 +#: src/cgi-bin/conj.scm4:336 msgid "" "Στην λέξη εισαγωγής δεν υπάρχει τόνος. Μήπος θέλατε να κλίσετε ένα απ'αυτά " "τα ρήματα:" msgstr "" -#: src/cgi-bin/conj.scm4:367 +#: src/cgi-bin/conj.scm4:369 msgid "" "Αυτή η λέξη δεν είναι ρήμα στο πρώτο ενικό πρόσωπο της οριστικής του " "ενεστώτα." msgstr "" -#: src/cgi-bin/conj.scm4:387 +#: src/cgi-bin/conj.scm4:389 msgid "Το θέμα αυτού του χρόνου δεν επιβεβαιώνεται από τη βάση δεδοµένων" msgstr "" diff --git a/sphinx/index.php b/sphinx/index.php new file mode 100644 index 0000000..dbe5014 --- /dev/null +++ b/sphinx/index.php @@ -0,0 +1,158 @@ +. 
+*/ + +require ("sphinxapi.php"); + +function read_config($file) { + global $config; + + $fp = fopen($file, "r"); + if (!$fp) + die("can't open config file $file"); + while (!feof($fp)) { + $line = fgets($fp); + $line = preg_replace('/(^\s+)|(\s+$)|(#.*)/', '', $line); + if ($line == '') continue; + if (preg_match('/(.+?)\s*=\s*(.*)/', $line, $m)) { + $config[$m[1]] = $m[2]; + } + } + fclose($fp); +} + +$cfg = getenv('ELLINIKA_INDEX_CONFIG'); +if (isset($cfg)) + read_config($cfg); +else + die("configuration file not set"); + +$base_url = $config['base-url']; +$base_dir = $config['base-directory']; +$index = $config['index']; + +if (isset($config['id-directory'])) { + $iddir = $config['id-directory']; +} else { + $iddir = "$base_dir/sphinx/var"; +} + +if ($_REQUEST['l']) { + $index = preg_replace(array('/\$l(?=[^\w])/', + '/\${l}/'), + array($_REQUEST['l'], + $_REQUEST['l']), $index); + $htmldir = $base_dir.'/'.$_REQUEST['l']; +} else { + header("Location: $base_url"); + exit(0); +} + +function print_page_title($id, $cl, $q) { + global $iddir; + global $index; + + $dom = new DOMDocument; + if (!$dom->loadHTMLFile("$iddir/$id")) { + return "
Can't load file $iddir/$id
\n"; + } + $items = $dom->getElementsByTagName('title'); + $r = ""; + if ($items->length) { + $href = readlink("$iddir/$id"); + if (preg_match('/.*\/(.+?\/.+?\.html)/', $href, $matches)) { + $href = "/".$matches[1]; + } + $r .= "".$items->item(0)->nodeValue."\n"; + } + + $xpath = new DOMXPath($dom); + $results = $xpath->query("//div[@class='content-inner']"); + $a = array(); + if ($results) { + for ($i = 0; $i < $results->length; $i++) { + $a[] = $results->item($i)->nodeValue; + } + } + + $r .= '

'; + foreach ($cl->BuildExcerpts($a, $index, $q, + array('limit' => 256)) as + $line) { + $r .= "$line"; + } + $r .= '

'; + return $r; +} + +function dosearch($q) { + global $index; + $ret = ""; + + $ret .= "

'".$q."'

"; + $cl = new SphinxClient(); + #$cl->SetServer ( $host, $port ); + $cl->SetArrayResult(false); + $cl->SetFieldWeights(array('title' => 5, + 'content' => 1)); + $cl->SetSortMode(SPH_SORT_RELEVANCE); + $result = $cl->Query($q, $index); +// print "\n"; + if ($result === false) { + $ret .= "
Query failed: " . $cl->GetLastError() . ".
\n"; + } else { + $ret .= "
\n"; + $ret .= "
    \n"; + foreach ($result['matches'] as $id => $val) { + $ret .= "
  1. "; + $ret .= print_page_title($id, $cl, $q); + $ret .= "
  2. "; + } + $ret .= "
\n"; + $ret .= "
\n"; + } + return $ret; +} + + +$fp = fopen("$htmldir/search.html", "r"); +if (!$fp) die(); + +while (!feof($fp)) { + $line = preg_replace_callback('/(@@result@@|@@args@@)/', + function ($matches) { + if ($matches[1] == '@@result@@') { + if ($_REQUEST['q']) { + return dosearch($_REQUEST['q']); + } else { + return '
'. + 'No query supplied'. + '
'; + } + } else if ($matches[1] == '@@args@@') { + if ($_REQUEST['q']) { + return "&q=".$_REQUEST['q']; + } + } + }, fgets($fp)); + print $line; +} +fclose($fp); + +?> diff --git a/sphinx/xmlindex b/sphinx/xmlindex new file mode 100755 index 0000000..f3fe76d --- /dev/null +++ b/sphinx/xmlindex @@ -0,0 +1,296 @@ +#! /usr/bin/perl +# Xmlpipe2 indexer for Ellinika +# Copyright (C) 2014 Sergey Poznyakoff +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +use v5.14; +use Sphinx::XML::Pipe2; +use File::Find; +use File::Basename; +use GDBM_File; +use HTML::TreeBuilder; +use HTML::TreeBuilder::XPath; +use Getopt::Long qw(:config gnu_getopt no_ignore_case); +use Pod::Usage; +use Pod::Man; +use Cwd; + +my $script; # This script name. +my $base_dir; # Base directory (roughly corresponds to DocumentRoot). +my $iddir; # ID directory contains symlinks to actual HTML pages. + # Each link is named by the page Sphinx ID. +my $idtabname; # Name of the GDBM file which maps the HTML file name + # to its index. +my @exclude; # Array of file names to be excluded from indexing. + +my %ignore; # @exclude is converted to this hash befor use. + +# Read and parse the configuration file. 
+sub read_config_file($) { + my $config_file = shift; +# print STDERR "reading $config_file\n"; + open(my $fd, "<", $config_file) or die("cannot open $config_file: $!"); + while (<$fd>) { + chomp; + s/^\s+//; + s/\s+$//; + s/\s+=\s+/=/; + s/#.*//; + next if ($_ eq ""); + unshift(@ARGV, "--$_"); + } + close($fd); +} + +# ############################################################################ +# Parse configuration and command line. Initialize global variables. +# ############################################################################ + +($script = $0) =~ s/.*\///; + +## Read configuration +my $config_file; +if (-e "./.xmlindex.conf") { + $config_file = "./.xmlindex.conf"; +} elsif (-e "$ENV{HOME}/.xmlindex.conf") { + $config_file = "$ENV{HOME}/.xmlindex.conf"; +} elsif ($ENV{'ELLINIKA_INDEX_CONFIG'}) { + $config_file = $ENV{'ELLINIKA_INDEX_CONFIG'}; +} elsif ($ENV{'XMLINDEX_CONF'}) { + $config_file = $ENV{'XMLINDEX_CONF'}; +} + +read_config_file($config_file) if defined($config_file); + +my $help; +my $man; +GetOptions("help" => \$man, + "h" => \$help, + "config|c=s" => sub { read_config_file($_[1]); }, + "base-directory=s" => \$base_dir, + "id-directory=s" => \$iddir, + "id-db=s" => \$idtabname, + "exclude=s" => \@exclude, + "clear-exclude" => sub { @exclude = (); }, + "base-url=s" => sub { }, # ignored + "index=s" => sub { }, # ignored + ) or exit(1); + +pod2usage(-message => "$script: index ellinika HTML pages", + -exitstatus => 0) if $help; +pod2usage(-exitstatus => 0, -verbose => 2) if $man; + +die "no directories to index" if ($#ARGV == -1); + +# Set up defaults. +$base_dir = getcwd unless defined($base_dir); +$iddir = "$base_dir/sphinx/var" unless defined($iddir); +$idtabname = "id.db" unless defined($idtabname); +$idtabname = "$iddir/$idtabname" unless ($idtabname =~ /^\//); + +%ignore = map { $_ => 1 } @exclude; + +# ############################################################################ +# Set up ID mapping. 
+# ############################################################################ + +my %idtab; +tie %idtab, 'GDBM_File', $idtabname, &GDBM_WRCREAT, 0640; + +$idtab{DEFAULT} = 1 unless defined($idtab{DEFAULT}); + +# ############################################################################ +# Scan directories, create and output resulting XML. +# ############################################################################ + +binmode STDIN, ":encoding(utf8)"; +binmode STDOUT, ":encoding(utf8)"; + +my $p = Sphinx::XML::Pipe2->new; + +$p->attr('size', 'int'); +$p->attr('type', 'str2ordinal'); +$p->field('content'); +$p->field('title'); + +find( sub { + my $file = $_; + my $size; + if (-f -r $file && ($size = -s $file) && + $size > 0 && + $file =~ /\.(html|txt|rtf)?$/i && + !defined($ignore{basename($File::Find::name)})) { + $idtab{$File::Find::name} = $idtab{DEFAULT}++ + unless (defined($idtab{$File::Find::name})); + my $id = $idtab{$File::Find::name}; + symlink ($File::Find::name, "$iddir/$id") unless (-l "$iddir/$id"); + + my $tree = HTML::TreeBuilder::XPath->new(); + open(my $fd, "<:encoding(utf8)", $File::Find::name) or + die "Can't open $File::Find::name: $!"; + $tree->parse_file($fd); + close($fd); + + foreach my $elt ($tree->findnodes('//div[@class="bottom-navbar"]')) { + $elt->delete; + } + + my $title = $tree->findvalue('//title'); + my $content = $tree->findvalue('//div[@class="content-inner"]'); + $tree->delete; + + $p->add($id, # document id + $size, # attributes in declaration order, i.e. 'size' + lc($1), # 'type' + $content, + $title); + } +}, map { $_ =~ /^\// ? $_ : "$base_dir/$_"; } @ARGV); + +print $p->xml; +__END__ +=head1 xmlindex + +xmlindex - index generated ellinika HTML pages + +=head1 SYNOPSIS + +xmlindex B<--base-directory=>I B<--id-directory=>I B<--id-db=>I + B<--exclude=>I B [B...] + +xmlindex B<--help>|B<-h> + +=head1 DESCRIPTION + +This utility is intended to create B indexes for the B +sites. 
It scans the supplied directories for the Ellinika HTML files and +produces on the standard output an XML document in B<xmlpipe2> format. + +The sample usage in the B<sphinx.conf> file is: + + source ellinika_pl { + type = xmlpipe2; + xmlpipe_command = /srv/ellinika/sphinx/xmlindex \ + --config=/srv/ellinika/etc/index.conf pl + } + +=head1 OPTIONS + +=over 4 + +=item B<-c>, B<--config=>I<FILE> + +Read configuration from I<FILE>. See the section B<CONFIGURATION> for +a discussion of its format. + +=item B<--base-directory=>I<DIR> + +Sets the base directory. Any arguments that do not begin with a +slash are looked up in that directory. Unless ID directory is set using +the B<--id-directory> option, it defaults to B<I<DIR>/sphinx/var>. + +Default is current working directory. + +=item B<--id-directory=>I<DIR> + +Sets the ID directory. The contents of this directory are used to translate +document names to their IDs and vice-versa. The first part of the task is +done using the GDBM file B<id.db> located in this directory. The second +part (translating IDs to document names) is done by creating symbolic links +for each indexed document. Each such link is located in this directory and +is named by the document ID. + +Default is B<I<base-directory>/sphinx/var>. + +=item B<--id-db=>I<NAME> + +Define name of the ID database file. Unless I<NAME> begins with a slash, +the value of I<id-directory> is prepended to it. + +Default is B<id.db>. + +=item B<--exclude=>I<FILE> + +Do not index I<FILE>. This option can be used multiple times. + +=item B<--clear-exclude> + +Clears the exclusion list built so far. + +=item B<--base-url=>I<URL> + +Base URL of the site. This option is ignored by B<xmlindex>. It is used +by B<index.php>. + +=item B<--index=>I<PATTERN> + +Pattern of the index file. This option is ignored by B<xmlindex>. It is used +by B<index.php>. + +=back + +=head1 CONFIGURATION + +Along with the command line, program options can be retrieved from a +configuration file. 
It is looked up in one of the following locations: + +=over 4 + +=item B B<.xmlindex.conf> + +=item B B<~/.xmlindex.conf> + +=item B The file named by B<$ELLINIKA_INDEX_CONFIG> environment variable, if it is set. + +=item B The file named by B<$XMLINDEX_CONF> environment variable, if it is set. + +=back + +The first file found is read, then the command line is processed (it can +force another configuration file to be read, by using the B<--config> +command line option). Each subsequent source of options overrides the +previous one, excepting the B<exclude> option, which is cumulative. The +B<--clear-exclude> option can be used to clear up the previously built +exclusion list. + +The configuration file has the usual UNIX configuration format. Empty +lines and UNIX comments are ignored. Each non-empty line is either an +option name, or option assignment, i.e. B<name>=B<value>, with any amount of +optional whitespace around the equals sign. Valid option names are +the same as long command line options, but without the leading B<-->. + +For example: + + # xmlindex settings: + base-directory = /srv/websites/ellinika-dev + exclude = NMZ.body.html + exclude = dict.html + exclude = htdig.html + exclude = map.html + exclude = nea.html + exclude = nomatch.html + exclude = conj.html + exclude = search.html + # Index.php settings. + base-url = http://ellinika-dev.gnu.org.ua + index = ellinika_dev_${l}_idx1 + +=head1 AUTHOR + +Sergey Poznyakoff <gray@gnu.org> + +=cut + + diff --git a/xml/lingua.conf.in b/xml/lingua.conf.in index 66e62cb..d736bcb 100644 --- a/xml/lingua.conf.in +++ b/xml/lingua.conf.in @@ -11,22 +11,19 @@ - +
- - - - + ~A + +
- ]]> + + ]]>
2004-2011 Sergey Poznyakoff (install-nea) (install-conj) + (install-srch) #f @@ -116,6 +117,14 @@ Copyright 2004-2011 Sergey Poznyakoff + + diff --git a/xml/ru/ellinika.xml b/xml/ru/ellinika.xml index d9707df..2a0068d 100644 --- a/xml/ru/ellinika.xml +++ b/xml/ru/ellinika.xml @@ -38,6 +38,7 @@ Copyright 2004-2011 Sergey Poznyakoff (install-nea) + (install-srch) #f @@ -115,6 +116,14 @@ Copyright 2004-2011 Sergey Poznyakoff + + diff --git a/xml/uk/ellinika.xml b/xml/uk/ellinika.xml index 96a9cf2..61945d9 100644 --- a/xml/uk/ellinika.xml +++ b/xml/uk/ellinika.xml @@ -38,6 +38,7 @@ Copyright 2004-2011 Sergey Poznyakoff (install-nea) + (install-srch) #f @@ -115,6 +116,14 @@ Copyright 2004-2011 Sergey Poznyakoff + + -- cgit v1.2.1