diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2014-05-08 14:26:32 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2014-05-08 14:26:32 +0300 |
commit | 24abd7afb6530620c4a80c14fdce96eead280ef7 (patch) | |
tree | e24267cc7747d94d9a38c424bdce7190a7c10673 | |
parent | 750e7ae3bafe9eec88887eb22b7250719028f6d6 (diff) | |
download | ellinika-24abd7afb6530620c4a80c14fdce96eead280ef7.tar.gz ellinika-24abd7afb6530620c4a80c14fdce96eead280ef7.tar.bz2 |
Use Sphinx for searches.
* sphinx/index.php: New file.
* sphinx/xmlindex: New file.
* xml/lingua.conf.in: Use own search instead of Google.
(install-srch): New macro.
* xml/pl/ellinika.xml: Call (install-srch). Build search page template.
* xml/ru/ellinika.xml: Likewise.
* xml/uk/ellinika.xml: Likewise.
* po/pl.po: Update.
* po/ru.po: Update.
* po/uk.po: Update.
-rw-r--r-- | po/pl.po | 10 | ||||
-rw-r--r-- | po/ru.po | 14 | ||||
-rw-r--r-- | po/uk.po | 14 | ||||
-rw-r--r-- | sphinx/index.php | 158 | ||||
-rwxr-xr-x | sphinx/xmlindex | 296 | ||||
-rw-r--r-- | xml/lingua.conf.in | 22 | ||||
-rw-r--r-- | xml/pl/ellinika.xml | 9 | ||||
-rw-r--r-- | xml/ru/ellinika.xml | 9 | ||||
-rw-r--r-- | xml/uk/ellinika.xml | 9 |
9 files changed, 512 insertions, 29 deletions
@@ -9,3 +9,3 @@ msgstr "" "Report-Msgid-Bugs-To: gray@gnu.org\n" -"POT-Creation-Date: 2011-06-20 13:02+0300\n" +"POT-Creation-Date: 2014-05-04 10:23+0300\n" "PO-Revision-Date: 2011-06-20 13:04+0300\n" @@ -13,2 +13,3 @@ msgstr "" "Language-Team: Polish <translation-team-pl@lists.sourceforge.net>\n" +"Language: pl\n" "MIME-Version: 1.0\n" @@ -16,3 +17,2 @@ msgstr "" "Content-Transfer-Encoding: 8bit\n" -"Language: pl\n" "Plural-Forms: nplurals=3; plural=n==1 ? 0 : n%10>=2 && n%10<=4 && (n%100<10 " @@ -121,3 +121,3 @@ msgstr "Niepoprawne wejście" -#: src/cgi-bin/conj.scm4:334 +#: src/cgi-bin/conj.scm4:336 msgid "" @@ -129,3 +129,3 @@ msgstr "" -#: src/cgi-bin/conj.scm4:367 +#: src/cgi-bin/conj.scm4:369 msgid "" @@ -135,3 +135,3 @@ msgstr "Podane słowo nie jest rzeczownikiem w formie 1.os. liczby pojedynczej." -#: src/cgi-bin/conj.scm4:387 +#: src/cgi-bin/conj.scm4:389 msgid "Το θέμα αυτού του χρόνου δεν επιβεβαιώνεται από τη βάση δεδοµένων" @@ -9,3 +9,3 @@ msgstr "" "Report-Msgid-Bugs-To: gray@gnu.org\n" -"POT-Creation-Date: 2011-06-20 13:02+0300\n" +"POT-Creation-Date: 2014-05-04 10:23+0300\n" "PO-Revision-Date: 2006-03-17 21:42+0200\n" @@ -13,2 +13,3 @@ msgstr "" "Language-Team: Russian <ru@li.org>\n" +"Language: ru\n" "MIME-Version: 1.0\n" @@ -16,5 +17,4 @@ msgstr "" "Content-Transfer-Encoding: 8bit\n" -"Language: ru\n" -"Plural-Forms: nplurals=3; plural=n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%" -"10<=4 && (n%100<10 || n%100>=20) ? 1 : 2;\n" +"Plural-Forms: nplurals=3; plural=n%10==1 && n%100!=11 ? 0 : n%10>=2 && n" +"%10<=4 && (n%100<10 || n%100>=20) ? 
1 : 2;\n" "\n" @@ -124,3 +124,3 @@ msgstr "" -#: src/cgi-bin/conj.scm4:334 +#: src/cgi-bin/conj.scm4:336 msgid "" @@ -130,3 +130,3 @@ msgstr "" -#: src/cgi-bin/conj.scm4:367 +#: src/cgi-bin/conj.scm4:369 msgid "" @@ -136,3 +136,3 @@ msgstr "" -#: src/cgi-bin/conj.scm4:387 +#: src/cgi-bin/conj.scm4:389 msgid "Το θέμα αυτού του χρόνου δεν επιβεβαιώνεται από τη βάση δεδοµένων" @@ -9,3 +9,3 @@ msgstr "" "Report-Msgid-Bugs-To: gray@gnu.org\n" -"POT-Creation-Date: 2011-06-20 13:02+0300\n" +"POT-Creation-Date: 2014-05-04 10:23+0300\n" "PO-Revision-Date: 2006-03-24 19:25+0200\n" @@ -13,2 +13,3 @@ msgstr "" "Language-Team: Ukrainian <translation-team-uk@lists.sourceforge.net>\n" +"Language: uk\n" "MIME-Version: 1.0\n" @@ -16,5 +17,4 @@ msgstr "" "Content-Transfer-Encoding: 8bit\n" -"Language: uk\n" -"Plural-Forms: nplurals=3; plural=(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%" -"10<=4 && (n%100<10 || n%100>=20) ? 1 : 2);\n" +"Plural-Forms: nplurals=3; plural=(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n" +"%10<=4 && (n%100<10 || n%100>=20) ? 1 : 2);\n" @@ -123,3 +123,3 @@ msgstr "" -#: src/cgi-bin/conj.scm4:334 +#: src/cgi-bin/conj.scm4:336 msgid "" @@ -129,3 +129,3 @@ msgstr "" -#: src/cgi-bin/conj.scm4:367 +#: src/cgi-bin/conj.scm4:369 msgid "" @@ -135,3 +135,3 @@ msgstr "" -#: src/cgi-bin/conj.scm4:387 +#: src/cgi-bin/conj.scm4:389 msgid "Το θέμα αυτού του χρόνου δεν επιβεβαιώνεται από τη βάση δεδοµένων" diff --git a/sphinx/index.php b/sphinx/index.php new file mode 100644 index 0000000..dbe5014 --- /dev/null +++ b/sphinx/index.php @@ -0,0 +1,158 @@ +<?php +/* Web search for Ellinika + Copyright (C) 2014 Sergey Poznyakoff + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +require ("sphinxapi.php"); + +function read_config($file) { + global $config; + + $fp = fopen($file, "r"); + if (!$fp) + die("can't open config file $file"); + while (!feof($fp)) { + $line = fgets($fp); + $line = preg_replace('/(^\s+)|(\s+$)|(#.*)/', '', $line); + if ($line == '') continue; + if (preg_match('/(.+?)\s*=\s*(.*)/', $line, $m)) { + $config[$m[1]] = $m[2]; + } + } + fclose($fp); +} + +$cfg = getenv('ELLINIKA_INDEX_CONFIG'); +if (isset($cfg)) + read_config($cfg); +else + die("configuration file not set"); + +$base_url = $config['base-url']; +$base_dir = $config['base-directory']; +$index = $config['index']; + +if (isset($config['id-directory'])) { + $iddir = $config['id-directory']; +} else { + $iddir = "$base_dir/sphinx/var"; +} + +if ($_REQUEST['l']) { + $index = preg_replace(array('/\$l(?=[^\w])/', + '/\${l}/'), + array($_REQUEST['l'], + $_REQUEST['l']), $index); + $htmldir = $base_dir.'/'.$_REQUEST['l']; +} else { + header("Location: $base_url"); + exit(0); +} + +function print_page_title($id, $cl, $q) { + global $iddir; + global $index; + + $dom = new DOMDocument; + if (!$dom->loadHTMLFile("$iddir/$id")) { + return "<div class=\"err\">Can't load file $iddir/$id</div>\n"; + } + $items = $dom->getElementsByTagName('title'); + $r = ""; + if ($items->length) { + $href = readlink("$iddir/$id"); + if (preg_match('/.*\/(.+?\/.+?\.html)/', $href, $matches)) { + $href = "/".$matches[1]; + } + $r .= "<a href=\"$href\">".$items->item(0)->nodeValue."</a>\n"; + } + + $xpath = new DOMXPath($dom); + $results = $xpath->query("//div[@class='content-inner']"); + $a = array(); + if 
($results) { + for ($i = 0; $i < $results->length; $i++) { + $a[] = $results->item($i)->nodeValue; + } + } + + $r .= '<p>'; + foreach ($cl->BuildExcerpts($a, $index, $q, + array('limit' => 256)) as + $line) { + $r .= "$line"; + } + $r .= '</p>'; + return $r; +} + +function dosearch($q) { + global $index; + $ret = ""; + + $ret .= "<h2>'".$q."'</h2>"; + $cl = new SphinxClient(); + #$cl->SetServer ( $host, $port ); + $cl->SetArrayResult(false); + $cl->SetFieldWeights(array('title' => 5, + 'content' => 1)); + $cl->SetSortMode(SPH_SORT_RELEVANCE); + $result = $cl->Query($q, $index); +// print "<!-- \n"; +// print_r($result); +// print "-->\n"; + if ($result === false) { + $ret .= "<div class=\"err\">Query failed: " . $cl->GetLastError() . ".</div>\n"; + } else { + $ret .= "<div class=\"res\">\n"; + $ret .= "<ol class=\"res\">\n"; + foreach ($result['matches'] as $id => $val) { + $ret .= "<li>"; + $ret .= print_page_title($id, $cl, $q); + $ret .= "</li>"; + } + $ret .= "</ol>\n"; + $ret .= "</div>\n"; + } + return $ret; +} + + +$fp = fopen("$htmldir/search.html", "r"); +if (!$fp) die(); + +while (!feof($fp)) { + $line = preg_replace_callback('/(@@result@@|@@args@@)/', + function ($matches) { + if ($matches[1] == '@@result@@') { + if ($_REQUEST['q']) { + return dosearch($_REQUEST['q']); + } else { + return '<div class="err">'. + 'No query supplied'. + '</div>'; + } + } else if ($matches[1] == '@@args@@') { + if ($_REQUEST['q']) { + return "&q=".$_REQUEST['q']; + } + } + }, fgets($fp)); + print $line; +} +fclose($fp); + +?> diff --git a/sphinx/xmlindex b/sphinx/xmlindex new file mode 100755 index 0000000..f3fe76d --- /dev/null +++ b/sphinx/xmlindex @@ -0,0 +1,296 @@ +#! 
/usr/bin/perl +# Xmlpipe2 indexer for Ellinika +# Copyright (C) 2014 Sergey Poznyakoff +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +use v5.14; +use Sphinx::XML::Pipe2; +use File::Find; +use File::Basename; +use GDBM_File; +use HTML::TreeBuilder; +use HTML::TreeBuilder::XPath; +use Getopt::Long qw(:config gnu_getopt no_ignore_case); +use Pod::Usage; +use Pod::Man; +use Cwd; + +my $script; # This script name. +my $base_dir; # Base directory (roughly corresponds to DocumentRoot). +my $iddir; # ID directory contains symlinks to actual HTML pages. + # Each link is named by the page Sphinx ID. +my $idtabname; # Name of the GDBM file which maps the HTML file name + # to its index. +my @exclude; # Array of file names to be excluded from indexing. + +my %ignore; # @exclude is converted to this hash befor use. + +# Read and parse the configuration file. +sub read_config_file($) { + my $config_file = shift; +# print STDERR "reading $config_file\n"; + open(my $fd, "<", $config_file) or die("cannot open $config_file: $!"); + while (<$fd>) { + chomp; + s/^\s+//; + s/\s+$//; + s/\s+=\s+/=/; + s/#.*//; + next if ($_ eq ""); + unshift(@ARGV, "--$_"); + } + close($fd); +} + +# ############################################################################ +# Parse configuration and command line. Initialize global variables. 
+# ############################################################################ + +($script = $0) =~ s/.*\///; + +## Read configuration +my $config_file; +if (-e "./.xmlindex.conf") { + $config_file = "./.xmlindex.conf"; +} elsif (-e "$ENV{HOME}/.xmlindex.conf") { + $config_file = "$ENV{HOME}/.xmlindex.conf"; +} elsif ($ENV{'ELLINIKA_INDEX_CONFIG'}) { + $config_file = $ENV{'ELLINIKA_INDEX_CONFIG'}; +} elsif ($ENV{'XMLINDEX_CONF'}) { + $config_file = $ENV{'XMLINDEX_CONF'}; +} + +read_config_file($config_file) if defined($config_file); + +my $help; +my $man; +GetOptions("help" => \$man, + "h" => \$help, + "config|c=s" => sub { read_config_file($_[1]); }, + "base-directory=s" => \$base_dir, + "id-directory=s" => \$iddir, + "id-db=s" => \$idtabname, + "exclude=s" => \@exclude, + "clear-exclude" => sub { @exclude = (); }, + "base-url=s" => sub { }, # ignored + "index=s" => sub { }, # ignored + ) or exit(1); + +pod2usage(-message => "$script: index ellinika HTML pages", + -exitstatus => 0) if $help; +pod2usage(-exitstatus => 0, -verbose => 2) if $man; + +die "no directories to index" if ($#ARGV == -1); + +# Set up defaults. +$base_dir = getcwd unless defined($base_dir); +$iddir = "$base_dir/sphinx/var" unless defined($iddir); +$idtabname = "id.db" unless defined($idtabname); +$idtabname = "$iddir/$idtabname" unless ($idtabname =~ /^\//); + +%ignore = map { $_ => 1 } @exclude; + +# ############################################################################ +# Set up ID mapping. +# ############################################################################ + +my %idtab; +tie %idtab, 'GDBM_File', $idtabname, &GDBM_WRCREAT, 0640; + +$idtab{DEFAULT} = 1 unless defined($idtab{DEFAULT}); + +# ############################################################################ +# Scan directories, create and output resulting XML. 
+# ############################################################################ + +binmode STDIN, ":encoding(utf8)"; +binmode STDOUT, ":encoding(utf8)"; + +my $p = Sphinx::XML::Pipe2->new; + +$p->attr('size', 'int'); +$p->attr('type', 'str2ordinal'); +$p->field('content'); +$p->field('title'); + +find( sub { + my $file = $_; + my $size; + if (-f -r $file && ($size = -s $file) && + $size > 0 && + $file =~ /\.(html|txt|rtf)?$/i && + !defined($ignore{basename($File::Find::name)})) { + $idtab{$File::Find::name} = $idtab{DEFAULT}++ + unless (defined($idtab{$File::Find::name})); + my $id = $idtab{$File::Find::name}; + symlink ($File::Find::name, "$iddir/$id") unless (-l "$iddir/$id"); + + my $tree = HTML::TreeBuilder::XPath->new(); + open(my $fd, "<:encoding(utf8)", $File::Find::name) or + die "Can't open $File::Find::name: $!"; + $tree->parse_file($fd); + close($fd); + + foreach my $elt ($tree->findnodes('//div[@class="bottom-navbar"]')) { + $elt->delete; + } + + my $title = $tree->findvalue('//title'); + my $content = $tree->findvalue('//div[@class="content-inner"]'); + $tree->delete; + + $p->add($id, # document id + $size, # attributes in declaration order, i.e. 'size' + lc($1), # 'type' + $content, + $title); + } +}, map { $_ =~ /^\// ? $_ : "$base_dir/$_"; } @ARGV); + +print $p->xml; +__END__ +=head1 xmlindex + +xmlindex - index generated ellinika HTML pages + +=head1 SYNOPSIS + +xmlindex B<--base-directory=>I<DIR> B<--id-directory=>I<DIR> B<--id-db=>I<NAME> + B<--exclude=>I<FILE> B<DIR> [B<DIR>...] + +xmlindex B<--help>|B<-h> + +=head1 DESCRIPTION + +This utility is intended to create B<Sphinx> indexes for the B<Ellinika> +sites. It scans the supplied directories for the Ellinika HTML files and +produces on the standard output an XML document in B<xmlpipe2> format. 
+ +The sample usage in the B<sphinx.conf> file is: + + source ellinika_pl { + type = xmlpipe2; + xmlpipe_command = /srv/ellinika/sphinx/xmlindex \ + --config=/srv/ellinika/etc/index.conf pl + } + +=head1 OPTIONS + +=over 4 + +=item B<-c>, B<--config=>I<FILE> + +Read configuration from I<FILE>. See the section B<CONFIGURATION> for +a discussion of its format. + +=item B<--base-directory=>I<DIR> + +Sets the base directory. Any arguments that do not begin with a +slash are looked up in that directory. Unless the ID directory is set using +the B<--id-directory> option, it defaults to B<I<DIR>/sphinx/var>. + +Default is the current working directory. + +=item B<--id-directory=>I<DIR> + +Sets the ID directory. The contents of this directory are used to translate +document names to their IDs and vice-versa. The first part of the task is +done using the GDBM file B<id.db> located in this directory. The second +part (translating IDs to document names) is done by creating symbolic links +for each indexed document. Each such link is located in this directory and +is named by the document ID. + +Default is B<I<base-directory>/sphinx/var>. + +=item B<--id-db=>I<NAME> + +Defines the name of the ID database file. Unless I<NAME> begins with a slash, +the value of I<id-directory> is prepended to it. + +Default is B<id.db>. + +=item B<--exclude=>I<FILE> + +Do not index I<FILE>. This option can be used multiple times. + +=item B<--clear-exclude> + +Clears the exclusion list built so far. + +=item B<--base-url=>I<URL> + +Base URL of the site. This option is ignored by B<xmlindex>. It is used +by B<sphinx/index.php>. + +=item B<--index=>I<NAME> + +Pattern of the index file. This option is ignored by B<xmlindex>. It is used +by B<sphinx/index.php>. + +=back + +=head1 CONFIGURATION + +Along with the command line, program options can be retrieved from a +configuration file. 
It is looked up in one of the following locations: + +=over 4 + +=item B<a.> B<.xmlindex.conf> + +=item B<b.> B<~/.xmlindex.conf> + +=item B<c.> The file named by B<$ELLINIKA_INDEX_CONFIG> environment variable, if it is set. + +=item B<d.> The file named by B<$XMLINDEX_CONF> environment variable, if it is set. + +=back + +The first file found is read, then the command line is processed (it can +force another configuration file to be read, by using the B<--config> +command line option). Each subsequent source of options overrides the +previous one, excepting the B<exclude> option, which is cumulative. The +B<--clear-exclude> option can be used to clear up the previously built +exclusion list. + +The configuration file has the usual UNIX configuration format. Empty +lines and UNIX comments are ignored. Each non-empty line is either an +option name, or option assignment, i.e. B<opt>=B<val>, with any amount of +optional whitespace around the equals sign. Valid option names are +the same as long command line options, but without the leading B<-->. + +For example: + + # xmlindex settings: + base-directory = /srv/websites/ellinika-dev + exclude = NMZ.body.html + exclude = dict.html + exclude = htdig.html + exclude = map.html + exclude = nea.html + exclude = nomatch.html + exclude = conj.html + exclude = search.html + # Index.php settings. 
+ base-url = http://ellinika-dev.gnu.org.ua + index = ellinika_dev_${l}_idx1 + +=head1 AUTHOR + +Sergey Poznyakoff <gray@gnu.org> + +=cut + + diff --git a/xml/lingua.conf.in b/xml/lingua.conf.in index 66e62cb..d736bcb 100644 --- a/xml/lingua.conf.in +++ b/xml/lingua.conf.in @@ -13,6 +13,6 @@ <BASE HREF="=BASE_HREF=" /> - <SEARCH ARGS="(list (lingua:gettext "Search"))"> + <SEARCH ARGS="(list (lingua:gettext "Search") (lingua:attr "LINGUA" "LANG"))"> <![CDATA[ <form method="get" - action="http://www.google.com/custom" + action="/search" enctype="application/x-www-form-urlencoded"> @@ -20,11 +20,8 @@ <label for="searchinput"> - ~A <img src="=TARGET_DIR=/graphics/google.png" alt="[Google]" width="75" height="32" /> - <input id="searchinput" type="text" size="30" maxlength="255" name="q" value="" /> - </label> - <input type="hidden" name="cof" value="L:http://ellinika.gnu.org.ua/graphics/parthenon-gnu.png;LW:180;LH:140;T:black;ALC:#ff3300;LC:#000099;BGC:white;AH:left;VLC:#660066;GL:0;" /> - - <input type="hidden" id="searchEllinika" name="sitesearch" checked="checked" value="ellinika.gnu.org.ua" /> - <input type="hidden" name="domains" value="ellinika.gnu.org.ua" /> + ~A <input id="searchinput" type="text" size="30" maxlength="255" name="q" value="" /> + </label> + <input type="hidden" id="searchlang" name="l" value="~A" /> </div> - </form>]]> + </form> + ]]> </SEARCH> @@ -55,2 +52,7 @@ +(define-macro (install-srch) + `(letrec ((srch (lambda (. 
args) + (string-append "/search?l=" (lingua:LANG args))))) + (xmltrans:set-attr "LINGUA" "SRCH" srch))) + (xmltrans:end-tag diff --git a/xml/pl/ellinika.xml b/xml/pl/ellinika.xml index cc4a7f3..42dc5ce 100644 --- a/xml/pl/ellinika.xml +++ b/xml/pl/ellinika.xml @@ -41,2 +41,3 @@ Copyright <![CDATA[©]]> 2004-2011 Sergey Poznyakoff (install-conj) + (install-srch) #f @@ -118,2 +119,10 @@ Copyright <![CDATA[©]]> 2004-2011 Sergey Poznyakoff +<GROUP TITLE="search" HIDDEN="1"> +<PAGE PREFIX="search" HEADER="Wyniki szukania" REF="SRCH"> + +@@result@@ + +</PAGE> +</GROUP> + </LINGUA> diff --git a/xml/ru/ellinika.xml b/xml/ru/ellinika.xml index d9707df..2a0068d 100644 --- a/xml/ru/ellinika.xml +++ b/xml/ru/ellinika.xml @@ -40,2 +40,3 @@ Copyright <![CDATA[©]]> 2004-2011 Sergey Poznyakoff (install-nea) + (install-srch) #f @@ -117,2 +118,10 @@ Copyright <![CDATA[©]]> 2004-2011 Sergey Poznyakoff +<GROUP TITLE="search" HIDDEN="1"> +<PAGE PREFIX="search" HEADER="Результаты поиска" REF="SRCH"> + +@@result@@ + +</PAGE> +</GROUP> + </LINGUA> diff --git a/xml/uk/ellinika.xml b/xml/uk/ellinika.xml index 96a9cf2..61945d9 100644 --- a/xml/uk/ellinika.xml +++ b/xml/uk/ellinika.xml @@ -40,2 +40,3 @@ Copyright <![CDATA[©]]> 2004-2011 Sergey Poznyakoff (install-nea) + (install-srch) #f @@ -117,2 +118,10 @@ Copyright <![CDATA[©]]> 2004-2011 Sergey Poznyakoff +<GROUP TITLE="search" HIDDEN="1"> +<PAGE PREFIX="search" HEADER="Пошук" REF="SRCH"> + +@@result@@ + +</PAGE> +</GROUP> + </LINGUA> |