From b27976bda04ef7a2219c44dfc7150441b62b3a0a Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Thu, 7 Oct 2004 16:00:25 +0000 Subject: Dictionary parser git-svn-id: file:///home/puszcza/svnroot/ellinika/trunk@195 941c8c0f-9102-463b-b60b-cd22ce0e6858 --- scm/dictrans.scm | 654 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 654 insertions(+) create mode 100644 scm/dictrans.scm (limited to 'scm') diff --git a/scm/dictrans.scm b/scm/dictrans.scm new file mode 100644 index 0000000..5cfc50e --- /dev/null +++ b/scm/dictrans.scm @@ -0,0 +1,654 @@ +;;;; This file is part of Ellinika +;;;; Copyright (C) 2004 Sergey Poznyakoff +;;;; +;;;; Ellinika is free software; you can redistribute it and/or modify +;;;; it under the terms of the GNU General Public License as published by +;;;; the Free Software Foundation; either version 2 of the License, or +;;;; (at your option) any later version. +;;;; +;;;; Ellinika is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;;; GNU General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU General Public License +;;;; along with Ellinika; if not, write to the Free Software +;;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +;;;; + +;;;; Dictionary structure +;;;; Internal representation: +;;;; +;;;; * Each dictionary entry is represented as a vector: +;;;; +;;;; #(KEY FORMS XREF P-LIST) +;;;; +;;;; KEY is list of strings +;;;; +;;;; * FORMS is either #f or a string describing forms of the key if they +;;;; are formed in an irregular manner. +;;;; +;;;; * XREF is a list of cross-references +;;;; +;;;; * P-LIST is a list of P-ARTICLES. Each P-ARTICLE is a vector: +;;;; +;;;; #(POS ARTICLE AREF XREF TOPIC) +;;;; +;;;; Member Type Meaning +;;;; POS string part of speech +;;;; ARTICLE list(string) Dictionary article associated with this key/pos +;;;; AREF list(string) List of antonyms +;;;; XREF list(string) List of cross-references +;;;; TOPIC list(string) List of topics this item pertains to +;;;; +;;;; +;;;; External representation (XML): +;;;; +;;;; +;;;; string+ +;;;; [string] +;;;;

+;;;; string+ +;;;; string* +;;;; string* +;;;; * +;;;;

+ +;;;; string* +;;;;
+;;;; +;;;; If only one P entry is present, the following alternative forms +;;;; are understood: +;;;; +;;;; +;;;; string+ +;;;; [string] +;;;;

string

+;;;; string* +;;;; string* +;;;; string* +;;;; * +;;;;
+;;;; +;;;; or +;;;; +;;;; +;;;; string+ +;;;; [string] +;;;;

string

+;;;; string* +;;;; * +;;;;
+ +(set! %load-path (cons "/usr/local/share/guile-sql" %load-path)) +(use-modules (xmltools xmltrans) + (sql) + (ice-9 getopt-long)) + +(use-syntax (ice-9 syncase)) + +(define compile-only #f) +(define cleanup-option #f) +(define preserve-option #f) + +(define sql-iface "mysql") +(define sql-host "localhost") +(define sql-database "ellinika") +(define sql-port 3306) +(define sql-password #f) +(define sql-user #f) +(define verbose-option #f) +(define debug-level 0) +(define input-files '()) + +(define (debug level . rest) + (if (>= debug-level level) + (begin + (for-each + (lambda (x) + (display x)) + rest) + (newline)))) + + +;;;; XML definitions + +;;; Set the default handler +(define tag-list '()) + +(define (lingua:default-start tag attr) + (xmltrans:set-attr #f "__START__" 1) + #f) + +(xmltrans:set-default-start-handler lingua:default-start) + +(define (lingua:default-end tag attr text) + (if (xmltrans:attr attr "__START__") + (xmltrans:parse-error #f "Unhandled element " tag)) + (set! tag-list + (cons + (xmltrans:make-tag tag attr text) + tag-list)) + #f) + +(xmltrans:set-default-end-handler lingua:default-end) + +;;; Current node +(define current-node #f) + +(define-syntax node-get + (syntax-rules () + ((node-get #:key) + (vector-ref current-node 0)) + ((node-get #:forms) + (vector-ref current-node 1)) + ((node-get #:xref) + (vector-ref current-node 2)) + ((node-get #:p-list) + (vector-ref current-node 3)))) + +(define (mark-invalid) + (xmltrans:set-attr "NODE" "__INVALID__" 1)) + +(define-syntax node-set + (syntax-rules () + ((node-set #:key k) + (cond + ((node-get #:key) => + (lambda (klist) + (vector-set! current-node 0 (append klist (list k))))) + (else + (vector-set! current-node 0 (list k))))) + ((node-set #:forms f) + (begin + (cond + ((node-get #:forms) + (xmltrans:parse-error #f "Forms already set") + (mark-invalid))) + (vector-set! current-node 1 f))) + ((node-set #:xref x) + (cond + ((node-get #:xref) => + (lambda (xlist) + (vector-set! current-node 2 (append xlist (list x))))) + (else + (vector-set! current-node 2 (list x))))) + ((node-set! #:p-article p) + (cond + ((node-get #:p-list) => + (lambda (plist) + (vector-set! current-node 3 (append plist (list p))))) + (else + (vector-set! current-node 3 (list p))))) + ((node-set #:topic t) + (for-each + (lambda (p) + ;; FIXME: Use p-article-set + (vector-set! p 4 (append (vector-ref p 4) (list t)))) + (node-get #:p-list))))) + +(define p-article #f) + +(define-syntax p-article-get + (syntax-rules () + ((p-article-get #:pos) + (vector-ref p-article 0)) + ((p-article-get #:article) + (vector-ref p-article 1)) + ((p-article-get #:aref) + (vector-ref p-article 2)) + ((p-article-get #:xref) + (vector-ref p-article 3)) + ((p-article-get #:topic) + (vector-ref p-article 4)))) + +(define-syntax p-set + (syntax-rules () + ((p-set key n val) + (vector-set! p-article n + (cond + ((p-article-get key) => + (lambda (alst) + (append alst (list val)))) + (else + (list val))))))) + +(define-syntax p-article-set + (syntax-rules () + ((p-article-set #:pos val) + (vector-set! p-article 0 val)) + ((p-article-set #:article val) + (p-set #:article 1 val)) + ((p-article-set #:aref val) + (p-set #:aref 2 val)) + ((p-article-set #:xref val) + (p-set #:xref 3 val)) + ((p-article-set #:topic val) + (p-set #:topic 4 val)))) + + +;;; Node list +(define node-list '()) + +(define (push-node node) + (if (>= debug-level 100) + (begin + (write node) + (newline))) + (set! node-list (cons current-node node-list))) + +;;; Topic stack +(define topic-list '()) + +(define (push-topic id) + (debug 10 "PUSH " id) + (set! topic-list (cons id topic-list))) + +(define (pop-topic) + (debug 10 "POP " (car topic-list)) + (set! topic-list (cdr topic-list))) + +(define (append-topics) + (debug 10 "APPEND") + (for-each + (lambda (x) + (p-article-set #:topic x)) + topic-list)) + +;;; +(define (expect-context tag . rest) + (let ((parent (xmltrans:get-parent #t))) + (call-with-current-continuation + (lambda (return) + (for-each + (lambda (ctx-name) + (if (string=? ctx-name parent) + (return))) + rest) + (xmltrans:parse-error #f tag " used in wrong context"))))) + +;;; Hanlde main element +(xmltrans:start-tag + "DICT" + (tag attr) + #f) + +(xmltrans:end-tag + "DICT" + (tag attr text) + ;; FIXME + #f) + + +(define expected-k #f) + +(define (k-expect) + (set! expected-k #f)) + +;;; +(xmltrans:start-tag + "NODE" + (tag attr) + (set! current-node (vector #f #f '() '())) + (set! p-article #f) + (set! expected-k #t) + (xmltrans:expect-next-tag "K" k-expect) + #f) + +(xmltrans:end-tag + "NODE" + (tag attr text) + (cond + (p-article + (if (not (null? (node-get #:p-list))) + (xmltrans:parse-warning #f "Mixed definition style")) + (append-topics) + (node-set #:p-article p-article) + (set! p-article #f))) + + (cond + ((xmltrans:attr attr "__INVALID__")) ; Ignore + ((not (node-get #:key)) + (xmltrans:parse-error #f "K element is missing")) + ((or (null? (node-get #:p-list)) (null? (null? (node-get #:xref)))) +; (display current-node)(newline) +; (display p-article)(newline) + (xmltrans:parse-error #f "No articles and no cross references for the node")) + (else + (push-node current-node))) + #f) + +;;; Topic +(xmltrans:start-tag + "T" + (tag attr) + (if (not (xmltrans:attr attr "ID")) + (letrec ((loc (xmltrans:get-input-location)) + (proc (lambda () + (xmltrans:parse-error + loc + "T element does not have ID attribute.")))) + (xmltrans:expect-next-tag "ID" proc)) + (if (not (or (xmltrans:parent? "NODE") (xmltrans:parent? "P"))) + (push-topic (xmltrans:attr attr "ID")))) + #f) + +(xmltrans:end-tag + "T" + (tag attr text) + (cond + ((not (xmltrans:attr attr "ID"))) ;; Ignore. Warnings has already been issued + ((xmltrans:parent? "P") + (p-article-set #:topic (xmltrans:attr attr "ID"))) + ((xmltrans:parent? "NODE") + (node-set #:topic (xmltrans:attr attr "ID"))) + (else + (pop-topic))) + #f) + +(xmltrans:end-tag + "ID" + (tag attr text) + (cond + ((xmltrans:parent? "P")) ;; OK + ((not (xmltrans:parent? "T")) + (xmltrans:parse-error #f "ID used in wrong context")) + ((xmltrans:attr #t "ID") + (xmltrans:parse-error #f "T element already has ID attribute")) + (else + (xmltrans:set-attr #t "ID" text) + (if (not (xmltrans:parent? "NODE" "T")) + (push-topic text)))) + #f) + +;;; K - KEY +(xmltrans:end-tag + "K" + (tag attr text) + (expect-context tag "NODE") + (if (not expected-k) + (begin + (xmltrans:parse-error #f "K tag cannot be used here") + (mark-invalid))) + (xmltrans:expect-next-tag "K" k-expect) + (node-set #:key text) + #f) + +;;; F - FORMS +(xmltrans:end-tag + "F" + (tag attr text) + (expect-context tag "NODE") + (node-set #:forms text) + #f) + +;;; P - PartOfSpeach +(define p-location #f) + +(xmltrans:start-tag + "P" + (tag attr) + (set! p-location (xmltrans:get-input-location)) + (set! p-article (vector #f '() '() '() '())) + #f) + +(xmltrans:end-tag + "P" + (tag attr text) + (expect-context tag "NODE") + (cond + ((xmltrans:attr attr "ID") => + (lambda (id) + (p-article-set #:pos id) + (append-topics) + (node-set #:p-article p-article) + (set! p-article #f))) + ((not (null? (p-article-get #:article))) ;; FIXME: other elements too + (xmltrans:parse-error p-location + "P element is missing ID, but has nested elements")) + (else + (set! p-article (vector text '() '() '() '())))) + #f) + +;;; A - Antonym + +(xmltrans:end-tag + "A" + (tag attr text) + (expect-context tag "NODE" "P") + (p-article-set #:aref text) + #f) + +;;; X - Xref +(xmltrans:end-tag + "X" + (tag attr text) + (expect-context tag "NODE" "P") + (let ((ref (cond + ((string-null? text) + (call-with-current-continuation + (lambda (return) + ;; FIXME: Use node-get/p-article-get + (do ((node node-list (cdr node))) + ((null? node) #f) + (if (not (null? + (vector-ref + (car (vector-ref (car node) 3)) 1))) + (return (car (vector-ref (car node) 0)))))))) + (else + text)))) + + (cond + ((not ref) + (xmltrans:parse-error #f "Empty reference") + (mark-invalid)) + ((xmltrans:parent? "P") + (p-article-set #:xref ref)) + ((xmltrans:parent? "NODE") + (node-set #:xref ref)))) + #f) + +;;; M - MEANING +(xmltrans:end-tag + "M" + (tag attr text) + (expect-context tag "P" "NODE") + (p-article-set #:article text) + #f) + +;;; Formatting elements +(xmltrans:end-tag + "C" + (tag attr text) + (expect-context tag "M" "F") + (list + "" + text + "")) + +(xmltrans:end-tag + "E" + (tag attr text) + (expect-context tag "M" "F") + (list + "" + text + "")) + + + +;;;; Main +(define grammar + `((check (single-char #\c)) + (cleanup) + (database (single-char #\d) (value #t)) + (host (single-char #\h) (value #t)) + (port (single-char #\P) (value #t)) + (password (single-char #\p) (value #t)) + (user (single-char #\u) (value #t)) + (interface (value #t)) + (verbose (single-char #\v)) + (debug (value #t)) + (preserve (value #t)) + (help))) + +(define (usage) + (display "usage: dictrans OPTIONS FILES\n")) + +(define (cons? p) + (and (pair? p) (not (list? p)))) + +(for-each + (lambda (x) + (cond + ((cons? x) + (case (car x) + ((check) + (set! compile-only #t)) + ((cleanup) + (set! cleanup-option #t)) + ((database) + (set! sql-database (cdr x))) + ((host) + (set! sql-host (cdr x))) + ((port) + (set! sql-port (cdr x))) + ((password) + (set! sql-password (cdr x))) + ((user) + (set! sql-user (cdr x))) + ((interface) + (set! sql-iface (cdr x))) + ((verbose) + (set! verbose-option #t)) + ((preserve) + (set! preserve-option #t)) + ((debug) + (set! debug-level (string->number (cdr x)))) + ((help) + (usage) + (exit 0)))) + (else + (set! input-files (cdr x))))) + (getopt-long (command-line) grammar)) + + +(if (null? input-files) + (begin + (display "Input files not specified\n" (current-error-port)) + (exit 1))) + +(for-each + (lambda (x) + (if (not (xmltrans:parse-file x)) + (exit 1))) + input-files) + +(if compile-only + (exit 0)) + +(define (assert cond . rest) + (cond + (cond) + ((null? rest) + (throw 'misc-error 'assert + "Assertion failed" + (list) #f) + (exit 1)) + (else + (with-output-to-port + (current-error-port) + (lambda () + (for-each + (lambda (x) + (display x)) + rest) + (newline))) + (exit 1)))) + +(define dict-index 0) +(define article-index 0) + +(define (get-dict-index conn) + (let ((res (sql-query "SELECT ident FROM dict ORDER BY ident DESC LIMIT 1"))) + (assert (not (null? res))) + (set! dict-index (car res)))) + +(define (cleanup-db conn) + (sql-query "DELETE FROM links") + (sql-query "DELETE FROM articles") + (sql-query "DELETE FROM dict") + (sql-query "DELETE FROM topic") + (sql-query "DELETE FROM topic_tab") + (sql-query "DELETE FROM pending_links")) + +(define (pending-fixup) + (sql-query "INSERT IGNORE INTO links \ + SELECT p.type,p.originator,d.ident \ + FROM dict d, pending_links p \ + WHERE p.word = d.word AND p.type != 'CLOSED'") + (sql-query "INSERT IGNORE INTO links \ + SELECT p.type,d.ident,p.originator \ + FROM dict d, pending_links p \ + WHERE p.word = d.word AND p.type != 'CLOSED'") + (sql-query + (if (string=? sql-iface "mysql") + "UPDATE pending_links p, dict d SET p.type='CLOSED' WHERE p.word = d.word" + ;; Else assume SQL92 + "UPDATE pending_links SET type='CLOSED' \ + WHERE originator in (SELECT d.ident from dict d, pending_links p \ + WHERE p.word=d.word)")) + + (if (not preserve-pending-option) + sql_query("DELETE FROM pending_links WHERE type = 'CLOSED'")) + + (let ((count (query-number "SELECT count(*) FROM pending_links \ + WHERE type != 'CLOSED' \ + GROUP BY word"))) + (if (> count 0) + (display (string-append + (car res) + " unresolved references") + (current-error-port))))) + +(define (update-stat) + (let ((count (query-number("SELECT count(*) from dict")))) + (sql_query "DELETE from stat"); + (sql-query (string-append + "INSERT INTO stat (count,updated) VALUES(" + (number->string count) + ",now())")))) + + +(define (insert-node conn node) + (set! article-index 0) + (set! dict-index (1+ dict-index)) + + ;;; FIXME + ) + +(let ((conn (sql-connect sql-iface sql-host sql-port sql-database + sql-user sql-password))) + + (if (not conn) + (begin + (display "Cannot connect to the database\n" (current-error-port)) + (exit 1))) + + (if cleanup-option + (cleanup-db conn)) + + (get-dict-index conn) + + (for-each + (lambda (node) + (insert-node conn node)) + node-list) + + (pending-fixup) + (update-stat) + + (sql-connect-close conn)) + +(exit 0) + + \ No newline at end of file -- cgit v1.2.1