;;;; This file is part of Ellinika ;;;; Copyright (C) 2004 Sergey Poznyakoff ;;;; ;;;; Ellinika is free software; you can redistribute it and/or modify ;;;; it under the terms of the GNU General Public License as published by ;;;; the Free Software Foundation; either version 2 of the License, or ;;;; (at your option) any later version. ;;;; ;;;; Ellinika is distributed in the hope that it will be useful, ;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;;;; GNU General Public License for more details. ;;;; ;;;; You should have received a copy of the GNU General Public License ;;;; along with Ellinika; if not, write to the Free Software ;;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ;;;; ;;;; Dictionary structure ;;;; Internal representation: ;;;; ;;;; * Each dictionary entry is represented as a vector: ;;;; ;;;; #(KEY FORMS XREF P-LIST) ;;;; ;;;; KEY is list of strings ;;;; ;;;; * FORMS is either #f or a string describing forms of the key if they ;;;; are formed in an irregular manner. ;;;; ;;;; * XREF is a list of cross-references ;;;; ;;;; * P-LIST is a list of P-ARTICLES. Each P-ARTICLE is a vector: ;;;; ;;;; #(POS ARTICLE AREF XREF TOPIC) ;;;; ;;;; Member Type Meaning ;;;; POS string part of speech ;;;; ARTICLE list(string) Dictionary article associated with this key/pos ;;;; AREF list(string) List of antonyms ;;;; XREF list(string) List of cross-references ;;;; TOPIC list(string) List of topics this item pertains to ;;;; ;;;; ;;;; External representation (XML): ;;;; ;;;; ;;;; string+ ;;;; [string] ;;;;

;;;; string+ ;;;; string* ;;;; string* ;;;; * ;;;;

+ ;;;; string* ;;;;
;;;; ;;;; If only one P entry is present, the following alternative forms ;;;; are understood: ;;;; ;;;; ;;;; string+ ;;;; [string] ;;;;

string

;;;; string* ;;;; string* ;;;; string* ;;;; * ;;;;
;;;; ;;;; or ;;;; ;;;; ;;;; string+ ;;;; [string] ;;;;

string

;;;; string* ;;;; * ;;;;
(set! %load-path (cons "/usr/local/share/guile-sql" %load-path)) (set! %load-path (cons "/home/gray/linguae/ellinika" %load-path)) (use-modules (xmltools xmltrans) (sql) (ellinika xlat) (ice-9 getopt-long)) (use-syntax (ice-9 syncase)) (define compile-only #f) (define cleanup-option #f) (define preserve-option #f) (define sql-iface "mysql") (define sql-host "localhost") (define sql-database "ellinika") (define sql-port 3306) (define sql-password #f) (define sql-user #f) (define verbose-option #f) (define debug-level 0) (define input-files '()) (define (debug level . rest) (if (>= debug-level level) (begin (for-each (lambda (x) (display x)) rest) (newline)))) (define pos-xlat #f) (define (convert-pos pos) (cond (compile-only pos) ((assoc pos pos-xlat) => (lambda (x) (cdr x))) (else (xmltrans:parse-error #f "unknown or misspelled part of speech") (mark-invalid) pos))) (define (read-pos conn) (set! pos-xlat (map (lambda (p) (cons (car p) (string->number (cadr p)))) (append (run-query conn "SELECT abbr, id FROM pos") (run-query conn "SELECT abbr_lat, id FROM pos") (run-query conn "SELECT name, id FROM pos"))))) ;;;; XML definitions ;;; Set the default handler (define tag-list '()) (define (lingua:default-start tag attr) (xmltrans:set-attr #f "__START__" 1) #f) (xmltrans:set-default-start-handler lingua:default-start) (define (lingua:default-end tag attr text) (if (xmltrans:attr attr "__START__") (xmltrans:parse-error #f "Unhandled element " tag)) (set! tag-list (cons (xmltrans:make-tag tag attr text) tag-list)) #f) (xmltrans:set-default-end-handler lingua:default-end) ;;; Current node (define current-node #f) (define-syntax node-get (syntax-rules () ((node-get #:key node) (vector-ref node 0)) ((node-get #:forms node) (vector-ref node 1)) ((node-get #:xref node) (vector-ref node 2)) ((node-get #:p-list node) (vector-ref node 3)))) (define-macro (current-node-get key) `(node-get ,key current-node)) (define (mark-invalid) (xmltrans:set-attr "NODE" "__INVALID__" 1)) (define-syntax node-set (syntax-rules () ((node-set #:key node k) (cond ((node-get #:key node) => (lambda (klist) (vector-set! node 0 (append klist (list k))))) (else (vector-set! node 0 (list k))))) ((node-set #:forms node f) (begin (cond ((node-get #:forms node) (xmltrans:parse-error #f "Forms already set") (mark-invalid))) (vector-set! node 1 f))) ((node-set #:xref node x) (cond ((node-get #:xref node) => (lambda (xlist) (vector-set! node 2 (append xlist (list x))))) (else (vector-set! node 2 (list x))))) ((node-set #:p-article node p) (cond ((node-get #:p-list node) => (lambda (plist) (vector-set! node 3 (append plist (list p))))) (else (vector-set! node 3 (list p))))) ((node-set #:topic node t) (for-each (lambda (p) (p-article-set #:topic p t)) (node-get #:p-list node))))) (define-macro (current-node-set key val) `(node-set ,key current-node ,val)) (define p-article #f) (define-syntax p-article-get (syntax-rules () ((p-article-get #:pos article) (vector-ref article 0)) ((p-article-get #:article article) (vector-ref article 1)) ((p-article-get #:aref article) (vector-ref article 2)) ((p-article-get #:xref article) (vector-ref article 3)) ((p-article-get #:topic article) (vector-ref article 4)))) (define-syntax p-set (syntax-rules () ((p-set key article n val) (vector-set! article n (cond ((p-article-get key article) => (lambda (alst) (append alst (list val)))) (else (list val))))))) (define-macro (current-article-get key) `(p-article-get ,key p-article)) (define-syntax p-article-set (syntax-rules () ((p-article-set #:pos article val) (vector-set! article 0 val)) ((p-article-set #:article article val) (p-set #:article article 1 val)) ((p-article-set #:aref article val) (p-set #:aref article 2 val)) ((p-article-set #:xref article val) (p-set #:xref article 3 val)) ((p-article-set #:topic article val) (p-set #:topic article 4 val)))) (define-macro (current-article-set key val) `(p-article-set ,key p-article ,val)) ;;; Node list (define node-list '()) (define (push-node node) (if (>= debug-level 100) (begin (write node) (newline))) (set! node-list (cons current-node node-list))) ;;; Topic stack (define topic-list '()) (define (push-topic id) (debug 10 "PUSH " id) (set! topic-list (cons id topic-list))) (define (pop-topic) (debug 10 "POP " (car topic-list)) (set! topic-list (cdr topic-list))) (define (append-topics) (debug 10 "APPEND") (for-each (lambda (x) (current-article-set #:topic x)) topic-list)) ;;; (define (expect-context tag . rest) (let ((parent (xmltrans:get-parent #t))) (call-with-current-continuation (lambda (return) (for-each (lambda (ctx-name) (if (string=? ctx-name parent) (return))) rest) (xmltrans:parse-error #f tag " used in wrong context"))))) ;;; Hanlde main element (xmltrans:start-tag "DICT" (tag attr) #f) (xmltrans:end-tag "DICT" (tag attr text) ;; FIXME #f) (define expected-k #f) (define (k-expect) (set! expected-k #f)) ;;; (xmltrans:start-tag "NODE" (tag attr) (set! current-node (vector #f #f '() '())) (set! p-article #f) (set! expected-k #t) (xmltrans:expect-next-tag "K" k-expect) #f) (xmltrans:end-tag "NODE" (tag attr text) (cond (p-article (if (not (null? (current-node-get #:p-list))) (xmltrans:parse-warning #f "Mixed definition style")) (append-topics) (current-node-set #:p-article p-article) (set! p-article #f))) (cond ((xmltrans:attr attr "__INVALID__")) ; Ignore ((not (current-node-get #:key)) (xmltrans:parse-error #f "K element is missing")) ((or (null? (current-node-get #:p-list)) (null? (null? (current-node-get #:xref)))) ; (display current-node)(newline) ; (display p-article)(newline) (xmltrans:parse-error #f "No articles and no cross references for the node")) (else (push-node current-node))) #f) ;;; Topic (xmltrans:start-tag "T" (tag attr) (if (not (xmltrans:attr attr "ID")) (letrec ((loc (xmltrans:get-input-location)) (proc (lambda () (xmltrans:parse-error loc "T element does not have ID attribute.")))) (xmltrans:expect-next-tag "ID" proc)) (if (not (or (xmltrans:parent? "NODE") (xmltrans:parent? "P"))) (push-topic (xmltrans:attr attr "ID")))) #f) (xmltrans:end-tag "T" (tag attr text) (cond ((not (xmltrans:attr attr "ID"))) ;; Ignore. Warnings has already been issued ((xmltrans:parent? "P") (current-article-set #:topic (xmltrans:attr attr "ID"))) ((xmltrans:parent? "NODE") (current-node-set #:topic (xmltrans:attr attr "ID"))) (else (pop-topic))) #f) (xmltrans:end-tag "ID" (tag attr text) (cond ((xmltrans:parent? "P")) ;; OK ((not (xmltrans:parent? "T")) (xmltrans:parse-error #f "ID used in wrong context")) ((xmltrans:attr #t "ID") (xmltrans:parse-error #f "T element already has ID attribute")) (else (xmltrans:set-attr #t "ID" text) (if (not (xmltrans:parent? "NODE" "T")) (push-topic text)))) #f) ;;; K - KEY (xmltrans:end-tag "K" (tag attr text) (expect-context tag "NODE") (if (not expected-k) (begin (xmltrans:parse-error #f "K tag cannot be used here") (mark-invalid))) (xmltrans:expect-next-tag "K" k-expect) (current-node-set #:key text) #f) ;;; F - FORMS (xmltrans:end-tag "F" (tag attr text) (expect-context tag "NODE") (current-node-set #:forms text) #f) ;;; P - PartOfSpeach (define p-location #f) (xmltrans:start-tag "P" (tag attr) (set! p-location (xmltrans:get-input-location)) (set! p-article (vector #f '() '() '() '())) #f) (xmltrans:end-tag "P" (tag attr text) (expect-context tag "NODE") (cond ((xmltrans:attr attr "ID") => (lambda (id) (current-article-set #:pos (convert-pos id)) (append-topics) (current-node-set #:p-article p-article) (set! p-article #f))) ((not (null? (current-article-get #:article))) ;; FIXME: other elements too (xmltrans:parse-error p-location "P element is missing ID, but has nested elements")) (else (set! p-article (vector (convert-pos text) '() '() '() '())))) #f) ;;; A - Antonym (xmltrans:end-tag "A" (tag attr text) (expect-context tag "NODE" "P") (current-article-set #:aref text) #f) ;;; X - Xref (xmltrans:end-tag "X" (tag attr text) (expect-context tag "NODE" "P") (let ((ref (cond ((string-null? text) (call-with-current-continuation (lambda (return) (do ((node node-list (cdr node))) ((null? node) #f) (if (not (null? (p-article-get #:article (car (node-get #:p-list (car node)))))) (return (car (node-get #:key (car node))))))))) (else text)))) (cond ((not ref) (xmltrans:parse-error #f "Empty reference") (mark-invalid)) ((xmltrans:parent? "P") (current-article-set #:xref ref)) ((xmltrans:parent? "NODE") (current-node-set #:xref ref)))) #f) ;;; M - MEANING (xmltrans:end-tag "M" (tag attr text) (expect-context tag "P" "NODE") (current-article-set #:article text) #f) ;;; Formatting elements (xmltrans:end-tag "C" (tag attr text) (expect-context tag "M" "F") (list "" text "")) (xmltrans:end-tag "E" (tag attr text) (expect-context tag "M" "F") (list "" text "")) (define (assert condition . rest) (cond (condition) ((null? rest) (throw 'misc-error 'assert "Assertion failed" (list) #f) (exit 1)) (else (with-output-to-port (current-error-port) (lambda () (for-each (lambda (x) (display x)) rest) (newline))) (exit 1)))) (define (escape-string str) (let loop ((lst '()) (str str)) (cond ((string-index str #\") => (lambda (pos) (loop (append lst (list (substring str 0 pos) "\\\"")) (substring str (1+ pos))))) (else (apply string-append (append lst (list str))))))) (define (run-query conn q) (if verbose-option (format #t "QUERY: ~S\n" q)) (let ((res (sql-query conn q))) (if verbose-option (format #t "RESULT: ~S\n" res)) res)) (define (query-number conn q) (let ((res (run-query conn q))) (if (null? res) #f (string->number (caar res))))) (define (get-dict-index conn) (set! dict-index (or (query-number conn "SELECT ident FROM dict ORDER BY ident DESC LIMIT 1") 0))) (define (cleanup-db conn) (run-query conn "DELETE FROM links") (run-query conn "DELETE FROM articles") (run-query conn "DELETE FROM dict") (run-query conn "DELETE FROM topic") (run-query conn "DELETE FROM topic_tab") (run-query conn "DELETE FROM pending_links")) (define (pending-fixup conn) (run-query conn "INSERT IGNORE INTO links \ SELECT p.type,p.originator,d.ident \ FROM dict d, pending_links p \ WHERE p.word = d.word AND p.type != 'CLOSED'") (run-query conn "INSERT IGNORE INTO links \ SELECT p.type,d.ident,p.originator \ FROM dict d, pending_links p \ WHERE p.word = d.word AND p.type != 'CLOSED'") (run-query conn (if (string=? sql-iface "mysql") "UPDATE pending_links p, dict d SET p.type='CLOSED' WHERE p.word = d.word" ;; Else assume SQL92 "UPDATE pending_links SET type='CLOSED' \ WHERE originator in (SELECT d.ident from dict d, pending_links p \ WHERE p.word=d.word)")) (if (not preserve-option) run-query("DELETE FROM pending_links WHERE type = 'CLOSED'")) (let ((count (query-number conn "SELECT count(*) FROM pending_links \ WHERE type != 'CLOSED'"))) ;;; GROUP BY word"))) (if (> count 0) (display (string-append (number->string count) " unresolved references\n") (current-error-port))))) (define (update-stat conn) (let ((count (query-number conn "SELECT count(*) from dict"))) (run-query conn "DELETE from stat"); (run-query conn (string-append "INSERT INTO stat (count,updated) VALUES(" (number->string count) ",now())")))) (define dict-index 0) (define (insert-node conn node) (letrec ((insert-link (lambda (type value) (run-query conn (format #f "INSERT INTO pending_links (type,originator,word) VALUES('~A',~A,'~A')" type dict-index value))))) (for-each (lambda (p-article) (for-each (lambda (key) (let ((sound (ellinika:sounds-like key))) (set! dict-index (1+ dict-index)) ;;; Insert 'heading' info (run-query conn (cond ((node-get #:forms node) => (lambda (f) (format #f "INSERT INTO dict (ident,word,sound,pos,forms) VALUES(~A,\"~A\",\"~A\",~A,\"~A\")" dict-index key sound (p-article-get #:pos p-article) f))) (else (format #f "INSERT INTO dict (ident,word,sound,pos) VALUES(~A,\"~A\",\"~A\",~A)" dict-index key sound (p-article-get #:pos p-article))))) ;;; Insert cross-references (for-each (lambda (x) (insert-link "XREF" x)) (append (node-get #:xref node) (p-article-get #:xref p-article))) ;;; Insert antonyms (for-each (lambda (x) (insert-link "ANT" x)) (p-article-get #:aref p-article)) ;;; Insert topics (for-each (lambda (t) (let ((index (or (query-number conn (format #f "SELECT ident FROM topic WHERE title=\"~A\"" (escape-string t))) (begin (run-query conn (format #f "INSERT INTO topic (title) VALUES (\"~A\")" (escape-string t))) (query-number conn "SELECT LAST_INSERT_ID()"))))) (run-query conn (format #f "INSERT INTO topic_tab VALUES(~A,~A)" index dict-index)))) (p-article-get #:topic p-article)) ;;; Insert articles (let ((article-index 0)) (for-each (lambda (text) (run-query conn (format #f "INSERT INTO articles VALUES (~A, ~A, \"~A\")" dict-index article-index text)) (set! article-index (1+ article-index))) (p-article-get #:article p-article))))) (node-get #:key node))) (node-get #:p-list node)))) ;;;; Main (define grammar `((check (single-char #\c)) (cleanup) (database (single-char #\d) (value #t)) (host (single-char #\h) (value #t)) (port (single-char #\P) (value #t)) (password (single-char #\p) (value #t)) (user (single-char #\u) (value #t)) (interface (value #t)) (verbose (single-char #\v)) (debug (value #t)) (preserve (value #t)) (help))) (define (usage) (display "usage: dictrans OPTIONS FILES\n")) (define (cons? p) (and (pair? p) (not (list? p)))) (for-each (lambda (x) (cond ((cons? x) (case (car x) ((check) (set! compile-only #t)) ((cleanup) (set! cleanup-option #t)) ((database) (set! sql-database (cdr x))) ((host) (set! sql-host (cdr x))) ((port) (set! sql-port (string->number (cdr x)))) ((password) (set! sql-password (cdr x))) ((user) (set! sql-user (cdr x))) ((interface) (set! sql-iface (cdr x))) ((verbose) (set! verbose-option #t)) ((preserve) (set! preserve-option #t)) ((debug) (set! debug-level (string->number (cdr x)))) ((help) (usage) (exit 0)))) (else (set! input-files (cdr x))))) (getopt-long (command-line) grammar)) (if (null? input-files) (begin (display "Input files not specified\n" (current-error-port)) (exit 1))) (cond (compile-only (for-each (lambda (x) (if (not (xmltrans:parse-file x)) (exit 1))) input-files))) (let ((conn (sql-connect sql-iface sql-host sql-port sql-database sql-user sql-password))) (if (not conn) (begin (display "Cannot connect to the database\n" (current-error-port)) (exit 1))) (read-pos conn) (for-each (lambda (x) (if (not (xmltrans:parse-file x)) (exit 1))) input-files) (if cleanup-option (cleanup-db conn)) (get-dict-index conn) (for-each (lambda (node) (insert-node conn node)) (reverse node-list)) (pending-fixup conn) (update-stat conn) (sql-connect-close conn)) (exit 0)