summaryrefslogtreecommitdiffabout
path: root/scm/dictrans.scm
authorSergey Poznyakoff <gray@gnu.org.ua>2004-10-07 16:00:25 (GMT)
committer Sergey Poznyakoff <gray@gnu.org.ua>2004-10-07 16:00:25 (GMT)
commitb27976bda04ef7a2219c44dfc7150441b62b3a0a (patch) (side-by-side diff)
treee003a51f0b829d1664358b6cc4dfac79ef67f7be /scm/dictrans.scm
parent21ee0a3c2fc48d51b9a3e29a61a661afdaee4fc9 (diff)
downloadellinika-b27976bda04ef7a2219c44dfc7150441b62b3a0a.tar.gz
ellinika-b27976bda04ef7a2219c44dfc7150441b62b3a0a.tar.bz2
Dictionary parser
git-svn-id: file:///home/puszcza/svnroot/ellinika/trunk@195 941c8c0f-9102-463b-b60b-cd22ce0e6858
Diffstat (limited to 'scm/dictrans.scm') (more/less context) (ignore whitespace changes)
-rw-r--r--scm/dictrans.scm654
1 files changed, 654 insertions, 0 deletions
diff --git a/scm/dictrans.scm b/scm/dictrans.scm
new file mode 100644
index 0000000..5cfc50e
--- a/dev/null
+++ b/scm/dictrans.scm
@@ -0,0 +1,654 @@
+;;;; This file is part of Ellinika
+;;;; Copyright (C) 2004 Sergey Poznyakoff
+;;;;
+;;;; Ellinika is free software; you can redistribute it and/or modify
+;;;; it under the terms of the GNU General Public License as published by
+;;;; the Free Software Foundation; either version 2 of the License, or
+;;;; (at your option) any later version.
+;;;;
+;;;; Ellinika is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;;; GNU General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU General Public License
+;;;; along with Ellinika; if not, write to the Free Software
+;;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+;;;;
+
+;;;; Dictionary structure
+;;;; Internal representation:
+;;;;
+;;;; * Each dictionary entry is represented as a vector:
+;;;;
+;;;; #(KEY FORMS XREF P-LIST)
+;;;;
+;;;; KEY is list of strings
+;;;;
+;;;; * FORMS is either #f or a string describing forms of the key if they
+;;;; are formed in an irregular manner.
+;;;;
+;;;; * XREF is a list of cross-references
+;;;;
+;;;; * P-LIST is a list of P-ARTICLES. Each P-ARTICLE is a vector:
+;;;;
+;;;; #(POS ARTICLE AREF XREF TOPIC)
+;;;;
+;;;; Member Type Meaning
+;;;; POS string part of speech
+;;;; ARTICLE list(string) Dictionary article associated with this key/pos
+;;;; AREF list(string) List of antonyms
+;;;; XREF list(string) List of cross-references
+;;;; TOPIC list(string) List of topics this item pertains to
+;;;;
+;;;;
+;;;; External representation (XML):
+;;;;
+;;;; <NODE>
+;;;; <K>string</K>+
+;;;; [<F>string</F>]
+;;;; <P ID="string">
+;;;; <M>string</M>+
+;;;; <A>string</A>*
+;;;; <X>string</X>*
+;;;; <T ID="string" />*
+;;;; </P>+
+;;;; <X>string</X>*
+;;;; </NODE>
+;;;;
+;;;; If only one P entry is present, the following alternative forms
+;;;; are understood:
+;;;;
+;;;; <NODE>
+;;;; <K>string</K>+
+;;;; [<F>string</F>]
+;;;; <P>string</P>
+;;;; <M>string</M>*
+;;;; <A>string</A>*
+;;;; <X>string</X>*
+;;;; <T ID="string" />*
+;;;; </NODE>
+;;;;
+;;;; or
+;;;;
+;;;; <NODE>
+;;;; <K>string</K>+
+;;;; [<F>string</F>]
+;;;; <P>string</P>
+;;;; <X>string</X>*
+;;;; <T ID="string" />*
+;;;; </NODE>
+
+(set! %load-path (cons "/usr/local/share/guile-sql" %load-path))
+(use-modules (xmltools xmltrans)
+ (sql)
+ (ice-9 getopt-long))
+
+(use-syntax (ice-9 syncase))
+
+(define compile-only #f)
+(define cleanup-option #f)
+(define preserve-option #f)
+
+(define sql-iface "mysql")
+(define sql-host "localhost")
+(define sql-database "ellinika")
+(define sql-port 3306)
+(define sql-password #f)
+(define sql-user #f)
+(define verbose-option #f)
+(define debug-level 0)
+(define input-files '())
+
+(define (debug level . rest)
+ (if (>= debug-level level)
+ (begin
+ (for-each
+ (lambda (x)
+ (display x))
+ rest)
+ (newline))))
+
+
+;;;; XML definitions
+
+;;; Set the default handler
+(define tag-list '())
+
+(define (lingua:default-start tag attr)
+ (xmltrans:set-attr #f "__START__" 1)
+ #f)
+
+(xmltrans:set-default-start-handler lingua:default-start)
+
+(define (lingua:default-end tag attr text)
+ (if (xmltrans:attr attr "__START__")
+ (xmltrans:parse-error #f "Unhandled element " tag))
+ (set! tag-list
+ (cons
+ (xmltrans:make-tag tag attr text)
+ tag-list))
+ #f)
+
+(xmltrans:set-default-end-handler lingua:default-end)
+
+;;; Current node
+(define current-node #f)
+
+(define-syntax node-get
+ (syntax-rules ()
+ ((node-get #:key)
+ (vector-ref current-node 0))
+ ((node-get #:forms)
+ (vector-ref current-node 1))
+ ((node-get #:xref)
+ (vector-ref current-node 2))
+ ((node-get #:p-list)
+ (vector-ref current-node 3))))
+
+(define (mark-invalid)
+ (xmltrans:set-attr "NODE" "__INVALID__" 1))
+
+(define-syntax node-set
+ (syntax-rules ()
+ ((node-set #:key k)
+ (cond
+ ((node-get #:key) =>
+ (lambda (klist)
+ (vector-set! current-node 0 (append klist (list k)))))
+ (else
+ (vector-set! current-node 0 (list k)))))
+ ((node-set #:forms f)
+ (begin
+ (cond
+ ((node-get #:forms)
+ (xmltrans:parse-error #f "Forms already set")
+ (mark-invalid)))
+ (vector-set! current-node 1 f)))
+ ((node-set #:xref x)
+ (cond
+ ((node-get #:xref) =>
+ (lambda (xlist)
+ (vector-set! current-node 2 (append xlist (list x)))))
+ (else
+ (vector-set! current-node 2 (list x)))))
+ ((node-set! #:p-article p)
+ (cond
+ ((node-get #:p-list) =>
+ (lambda (plist)
+ (vector-set! current-node 3 (append plist (list p)))))
+ (else
+ (vector-set! current-node 3 (list p)))))
+ ((node-set #:topic t)
+ (for-each
+ (lambda (p)
+ ;; FIXME: Use p-article-set
+ (vector-set! p 4 (append (vector-ref p 4) (list t))))
+ (node-get #:p-list)))))
+
+(define p-article #f)
+
+(define-syntax p-article-get
+ (syntax-rules ()
+ ((p-article-get #:pos)
+ (vector-ref p-article 0))
+ ((p-article-get #:article)
+ (vector-ref p-article 1))
+ ((p-article-get #:aref)
+ (vector-ref p-article 2))
+ ((p-article-get #:xref)
+ (vector-ref p-article 3))
+ ((p-article-get #:topic)
+ (vector-ref p-article 4))))
+
+(define-syntax p-set
+ (syntax-rules ()
+ ((p-set key n val)
+ (vector-set! p-article n
+ (cond
+ ((p-article-get key) =>
+ (lambda (alst)
+ (append alst (list val))))
+ (else
+ (list val)))))))
+
+(define-syntax p-article-set
+ (syntax-rules ()
+ ((p-article-set #:pos val)
+ (vector-set! p-article 0 val))
+ ((p-article-set #:article val)
+ (p-set #:article 1 val))
+ ((p-article-set #:aref val)
+ (p-set #:aref 2 val))
+ ((p-article-set #:xref val)
+ (p-set #:xref 3 val))
+ ((p-article-set #:topic val)
+ (p-set #:topic 4 val))))
+
+
+;;; Node list
+(define node-list '())
+
+(define (push-node node)
+ (if (>= debug-level 100)
+ (begin
+ (write node)
+ (newline)))
+ (set! node-list (cons current-node node-list)))
+
+;;; Topic stack
+(define topic-list '())
+
+(define (push-topic id)
+ (debug 10 "PUSH " id)
+ (set! topic-list (cons id topic-list)))
+
+(define (pop-topic)
+ (debug 10 "POP " (car topic-list))
+ (set! topic-list (cdr topic-list)))
+
+(define (append-topics)
+ (debug 10 "APPEND")
+ (for-each
+ (lambda (x)
+ (p-article-set #:topic x))
+ topic-list))
+
+;;;
+(define (expect-context tag . rest)
+ (let ((parent (xmltrans:get-parent #t)))
+ (call-with-current-continuation
+ (lambda (return)
+ (for-each
+ (lambda (ctx-name)
+ (if (string=? ctx-name parent)
+ (return)))
+ rest)
+ (xmltrans:parse-error #f tag " used in wrong context")))))
+
+;;; Hanlde main element
+(xmltrans:start-tag
+ "DICT"
+ (tag attr)
+ #f)
+
+(xmltrans:end-tag
+ "DICT"
+ (tag attr text)
+ ;; FIXME
+ #f)
+
+
+(define expected-k #f)
+
+(define (k-expect)
+ (set! expected-k #f))
+
+;;;
+(xmltrans:start-tag
+ "NODE"
+ (tag attr)
+ (set! current-node (vector #f #f '() '()))
+ (set! p-article #f)
+ (set! expected-k #t)
+ (xmltrans:expect-next-tag "K" k-expect)
+ #f)
+
+(xmltrans:end-tag
+ "NODE"
+ (tag attr text)
+ (cond
+ (p-article
+ (if (not (null? (node-get #:p-list)))
+ (xmltrans:parse-warning #f "Mixed definition style"))
+ (append-topics)
+ (node-set #:p-article p-article)
+ (set! p-article #f)))
+
+ (cond
+ ((xmltrans:attr attr "__INVALID__")) ; Ignore
+ ((not (node-get #:key))
+ (xmltrans:parse-error #f "K element is missing"))
+ ((or (null? (node-get #:p-list)) (null? (null? (node-get #:xref))))
+; (display current-node)(newline)
+; (display p-article)(newline)
+ (xmltrans:parse-error #f "No articles and no cross references for the node"))
+ (else
+ (push-node current-node)))
+ #f)
+
+;;; Topic
+(xmltrans:start-tag
+ "T"
+ (tag attr)
+ (if (not (xmltrans:attr attr "ID"))
+ (letrec ((loc (xmltrans:get-input-location))
+ (proc (lambda ()
+ (xmltrans:parse-error
+ loc
+ "T element does not have ID attribute."))))
+ (xmltrans:expect-next-tag "ID" proc))
+ (if (not (or (xmltrans:parent? "NODE") (xmltrans:parent? "P")))
+ (push-topic (xmltrans:attr attr "ID"))))
+ #f)
+
+(xmltrans:end-tag
+ "T"
+ (tag attr text)
+ (cond
+ ((not (xmltrans:attr attr "ID"))) ;; Ignore. Warnings has already been issued
+ ((xmltrans:parent? "P")
+ (p-article-set #:topic (xmltrans:attr attr "ID")))
+ ((xmltrans:parent? "NODE")
+ (node-set #:topic (xmltrans:attr attr "ID")))
+ (else
+ (pop-topic)))
+ #f)
+
+(xmltrans:end-tag
+ "ID"
+ (tag attr text)
+ (cond
+ ((xmltrans:parent? "P")) ;; OK
+ ((not (xmltrans:parent? "T"))
+ (xmltrans:parse-error #f "ID used in wrong context"))
+ ((xmltrans:attr #t "ID")
+ (xmltrans:parse-error #f "T element already has ID attribute"))
+ (else
+ (xmltrans:set-attr #t "ID" text)
+ (if (not (xmltrans:parent? "NODE" "T"))
+ (push-topic text))))
+ #f)
+
+;;; K - KEY
+(xmltrans:end-tag
+ "K"
+ (tag attr text)
+ (expect-context tag "NODE")
+ (if (not expected-k)
+ (begin
+ (xmltrans:parse-error #f "K tag cannot be used here")
+ (mark-invalid)))
+ (xmltrans:expect-next-tag "K" k-expect)
+ (node-set #:key text)
+ #f)
+
+;;; F - FORMS
+(xmltrans:end-tag
+ "F"
+ (tag attr text)
+ (expect-context tag "NODE")
+ (node-set #:forms text)
+ #f)
+
+;;; P - PartOfSpeach
+(define p-location #f)
+
+(xmltrans:start-tag
+ "P"
+ (tag attr)
+ (set! p-location (xmltrans:get-input-location))
+ (set! p-article (vector #f '() '() '() '()))
+ #f)
+
+(xmltrans:end-tag
+ "P"
+ (tag attr text)
+ (expect-context tag "NODE")
+ (cond
+ ((xmltrans:attr attr "ID") =>
+ (lambda (id)
+ (p-article-set #:pos id)
+ (append-topics)
+ (node-set #:p-article p-article)
+ (set! p-article #f)))
+ ((not (null? (p-article-get #:article))) ;; FIXME: other elements too
+ (xmltrans:parse-error p-location
+ "P element is missing ID, but has nested elements"))
+ (else
+ (set! p-article (vector text '() '() '() '()))))
+ #f)
+
+;;; A - Antonym
+
+(xmltrans:end-tag
+ "A"
+ (tag attr text)
+ (expect-context tag "NODE" "P")
+ (p-article-set #:aref text)
+ #f)
+
+;;; X - Xref
+(xmltrans:end-tag
+ "X"
+ (tag attr text)
+ (expect-context tag "NODE" "P")
+ (let ((ref (cond
+ ((string-null? text)
+ (call-with-current-continuation
+ (lambda (return)
+ ;; FIXME: Use node-get/p-article-get
+ (do ((node node-list (cdr node)))
+ ((null? node) #f)
+ (if (not (null?
+ (vector-ref
+ (car (vector-ref (car node) 3)) 1)))
+ (return (car (vector-ref (car node) 0))))))))
+ (else
+ text))))
+
+ (cond
+ ((not ref)
+ (xmltrans:parse-error #f "Empty reference")
+ (mark-invalid))
+ ((xmltrans:parent? "P")
+ (p-article-set #:xref ref))
+ ((xmltrans:parent? "NODE")
+ (node-set #:xref ref))))
+ #f)
+
+;;; M - MEANING
+(xmltrans:end-tag
+ "M"
+ (tag attr text)
+ (expect-context tag "P" "NODE")
+ (p-article-set #:article text)
+ #f)
+
+;;; Formatting elements
+(xmltrans:end-tag
+ "C"
+ (tag attr text)
+ (expect-context tag "M" "F")
+ (list
+ "<span class=\"comment\">"
+ text
+ "</span>"))
+
+(xmltrans:end-tag
+ "E"
+ (tag attr text)
+ (expect-context tag "M" "F")
+ (list
+ "<span class=\"expl\">"
+ text
+ "</span>"))
+
+
+
+;;;; Main
+(define grammar
+ `((check (single-char #\c))
+ (cleanup)
+ (database (single-char #\d) (value #t))
+ (host (single-char #\h) (value #t))
+ (port (single-char #\P) (value #t))
+ (password (single-char #\p) (value #t))
+ (user (single-char #\u) (value #t))
+ (interface (value #t))
+ (verbose (single-char #\v))
+ (debug (value #t))
+ (preserve (value #t))
+ (help)))
+
+(define (usage)
+ (display "usage: dictrans OPTIONS FILES\n"))
+
+(define (cons? p)
+ (and (pair? p) (not (list? p))))
+
+(for-each
+ (lambda (x)
+ (cond
+ ((cons? x)
+ (case (car x)
+ ((check)
+ (set! compile-only #t))
+ ((cleanup)
+ (set! cleanup-option #t))
+ ((database)
+ (set! sql-database (cdr x)))
+ ((host)
+ (set! sql-host (cdr x)))
+ ((port)
+ (set! sql-port (cdr x)))
+ ((password)
+ (set! sql-password (cdr x)))
+ ((user)
+ (set! sql-user (cdr x)))
+ ((interface)
+ (set! sql-iface (cdr x)))
+ ((verbose)
+ (set! verbose-option #t))
+ ((preserve)
+ (set! preserve-option #t))
+ ((debug)
+ (set! debug-level (string->number (cdr x))))
+ ((help)
+ (usage)
+ (exit 0))))
+ (else
+ (set! input-files (cdr x)))))
+ (getopt-long (command-line) grammar))
+
+
+(if (null? input-files)
+ (begin
+ (display "Input files not specified\n" (current-error-port))
+ (exit 1)))
+
+(for-each
+ (lambda (x)
+ (if (not (xmltrans:parse-file x))
+ (exit 1)))
+ input-files)
+
+(if compile-only
+ (exit 0))
+
+(define (assert cond . rest)
+ (cond
+ (cond)
+ ((null? rest)
+ (throw 'misc-error 'assert
+ "Assertion failed"
+ (list) #f)
+ (exit 1))
+ (else
+ (with-output-to-port
+ (current-error-port)
+ (lambda ()
+ (for-each
+ (lambda (x)
+ (display x))
+ rest)
+ (newline)))
+ (exit 1))))
+
+(define dict-index 0)
+(define article-index 0)
+
+(define (get-dict-index conn)
+ (let ((res (sql-query "SELECT ident FROM dict ORDER BY ident DESC LIMIT 1")))
+ (assert (not (null? res)))
+ (set! dict-index (car res))))
+
+(define (cleanup-db conn)
+ (sql-query "DELETE FROM links")
+ (sql-query "DELETE FROM articles")
+ (sql-query "DELETE FROM dict")
+ (sql-query "DELETE FROM topic")
+ (sql-query "DELETE FROM topic_tab")
+ (sql-query "DELETE FROM pending_links"))
+
+(define (pending-fixup)
+ (sql-query "INSERT IGNORE INTO links \
+ SELECT p.type,p.originator,d.ident \
+ FROM dict d, pending_links p \
+ WHERE p.word = d.word AND p.type != 'CLOSED'")
+ (sql-query "INSERT IGNORE INTO links \
+ SELECT p.type,d.ident,p.originator \
+ FROM dict d, pending_links p \
+ WHERE p.word = d.word AND p.type != 'CLOSED'")
+ (sql-query
+ (if (string=? sql-iface "mysql")
+ "UPDATE pending_links p, dict d SET p.type='CLOSED' WHERE p.word = d.word"
+ ;; Else assume SQL92
+ "UPDATE pending_links SET type='CLOSED' \
+ WHERE originator in (SELECT d.ident from dict d, pending_links p \
+ WHERE p.word=d.word)"))
+
+ (if (not preserve-pending-option)
+ sql_query("DELETE FROM pending_links WHERE type = 'CLOSED'"))
+
+ (let ((count (query-number "SELECT count(*) FROM pending_links \
+ WHERE type != 'CLOSED' \
+ GROUP BY word")))
+ (if (> count 0)
+ (display (string-append
+ (car res)
+ " unresolved references")
+ (current-error-port)))))
+
+(define (update-stat)
+ (let ((count (query-number("SELECT count(*) from dict"))))
+ (sql_query "DELETE from stat");
+ (sql-query (string-append
+ "INSERT INTO stat (count,updated) VALUES("
+ (number->string count)
+ ",now())"))))
+
+
+(define (insert-node conn node)
+ (set! article-index 0)
+ (set! dict-index (1+ dict-index))
+
+ ;;; FIXME
+ )
+
+(let ((conn (sql-connect sql-iface sql-host sql-port sql-database
+ sql-user sql-password)))
+
+ (if (not conn)
+ (begin
+ (display "Cannot connect to the database\n" (current-error-port))
+ (exit 1)))
+
+ (if cleanup-option
+ (cleanup-db conn))
+
+ (get-dict-index conn)
+
+ (for-each
+ (lambda (node)
+ (insert-node conn node))
+ node-list)
+
+ (pending-fixup)
+ (update-stat)
+
+ (sql-connect-close conn))
+
+(exit 0)
+
+ \ No newline at end of file

Return to:

Send suggestions and report system problems to the System administrator.