From a1a5b7ddd6c3c0532c37551b24fd573a554ac181 Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Fri, 10 Jun 2011 23:04:53 +0300 Subject: Fix syllabification. * configure.ac: Add AC_PROG_YACC * src/ellinika/phoneme.y: New file. * src/ellinika/yyrename: New file. * src/ellinika/syllabificator.c: New file. * src/ellinika/.gitignore: Update. * src/ellinika/elchr.c (char_info_st): Move to header. (el_basic_ctype): (elchr_info): Remove static qualifier. Return a pointer to const. (elchr_letter,elchr_phoneme): New functions. (elchr_diphthong): Remove. * src/ellinika/elmorph.c (elstr): New members. (_elstr_syllabize): Rewrite. (invalidate_maps): New static function. (_elstr_alloc): Initialize new fields, take function name as argument, for diagnostic purposes. (_elstr_print): Rewrite. (deftab): Update. (elstr-syllable-prop,elstr-syllable) (_elstr_set_accent,_elstr_set_accent_on_char): Rewrite. (elstr-char-phoneme,elstr->phonetic-map): New functions. * src/ellinika/elmorph.h (CHF_DIPH1,CHF_DIPH2): Remove. (CHF_DIPHTHONG): New flag. (PHON_.*): New constants. (phoneme,syllable): New structures. (char_info_st): New members. (elchr_info,elchr_letter) (elchr_phoneme,phoneme_map) (syllable_map): New protos. (elchr_diphthong): Remove protos. * src/ellinika/elmorph.scm4: Move public definitions to elmorph-public.scm; include it here. * src/ellinika/xlat.scm (ellinika:sounds-like): Rewrite as a wrapper over elstr->soundslike. Describe Milesian numbers. * style.css (img.ellinika-img): New class. * xml/lingua.conf.in (IMAGE): New tag. * xml/pl/alfabhta.xml: Describe Milesian numbers. Various fixes. * data/dbverb.struct: fix a typo in flection. Use 'sub' theme for pas/sub/aor. * data/irregular-verbs.xml: Add more verbs. * scm/conjugator.scm: Various fixes. * scm/verbop.scm: Accept empty mood and voice declarations. 
--- src/ellinika/.gitignore | 2 + src/ellinika/Makefile.am | 23 ++- src/ellinika/elchr.c | 273 ++++++++++++++++++------------- src/ellinika/elmorph-public.scm | 106 ++++++++++++ src/ellinika/elmorph.c | 308 ++++++++++++++++++++++------------- src/ellinika/elmorph.h | 82 +++++++++- src/ellinika/elmorph.scm4 | 25 +-- src/ellinika/phoneme.y | 353 ++++++++++++++++++++++++++++++++++++++++ src/ellinika/syllabificator.c | 152 +++++++++++++++++ src/ellinika/tenses.scm | 38 +++++ src/ellinika/xlat.scm | 113 +------------ src/ellinika/yyrename | 97 +++++++++++ 12 files changed, 1206 insertions(+), 366 deletions(-) create mode 100644 src/ellinika/elmorph-public.scm create mode 100644 src/ellinika/phoneme.y create mode 100644 src/ellinika/syllabificator.c create mode 100644 src/ellinika/tenses.scm create mode 100755 src/ellinika/yyrename (limited to 'src/ellinika') diff --git a/src/ellinika/.gitignore b/src/ellinika/.gitignore index 9422f9a..11bf478 100644 --- a/src/ellinika/.gitignore +++ b/src/ellinika/.gitignore @@ -3,3 +3,5 @@ cgi.scm config.scm elmorph.scm elmorph.x +phoneme.c +phoneme.h diff --git a/src/ellinika/Makefile.am b/src/ellinika/Makefile.am index 274eea8..b8988d4 100644 --- a/src/ellinika/Makefile.am +++ b/src/ellinika/Makefile.am @@ -1,5 +1,5 @@ # This file is part of Ellinika project. -# Copyright (C) 2004,2006,2007,2008 Sergey Poznyakoff +# Copyright (C) 2004,2006,2007,2008,2011 Sergey Poznyakoff # # Ellinika is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -15,7 +15,14 @@ # along with this program. If not, see . 
guiledir=$(GUILE_SITE)/$(PACKAGE) -guile_DATA=xlat.scm cgi.scm i18n.scm config.scm dico.scm elmorph.scm +guile_DATA=\ + xlat.scm\ + cgi.scm\ + i18n.scm\ + config.scm\ + dico.scm\ + elmorph.scm\ + tenses.scm cgi.m4: Makefile echo 'divert(-1)' > $@ @@ -39,11 +46,11 @@ cgi.m4: Makefile SUFFIXES = .scm4 .scm .x .scm4.scm: - m4 cgi.m4 $< > $@ + m4 -I$(srcdir) cgi.m4 $< > $@ cgi.scm: cgi.scm4 cgi.m4 config.scm: config.scm4 cgi.m4 -elmorph.scm: elmorph.scm4 cgi.m4 +elmorph.scm: elmorph.scm4 elmorph-public.scm cgi.m4 pkglib_LTLIBRARIES=libelmorph.la @@ -52,7 +59,9 @@ libelmorph_la_SOURCES = \ utf8.c\ elchr.c\ elmorph.c\ - elmorph.h + elmorph.h\ + phoneme.y\ + syllabificator.c DOT_X_FILES = elmorph.x @@ -80,4 +89,6 @@ install-data-hook: done; \ cd $$here - +AM_YFLAGS = -d +YACCCOMPILE = $(srcdir)/yyrename '$(YACC) $(YFLAGS) $(AM_YFLAGS)' +EXTRA_DIST = yyrename elmorph-public.scm \ No newline at end of file diff --git a/src/ellinika/elchr.c b/src/ellinika/elchr.c index 3142b6f..621ac03 100644 --- a/src/ellinika/elchr.c +++ b/src/ellinika/elchr.c @@ -23,20 +23,7 @@ #include #include "utf8.h" #include "elmorph.h" - -struct char_info_st { - unsigned ch; /* Characters */ - int flags; /* Flags (see above) */ - unsigned base; /* for vowels - a corresponding vowel with all diacritics - removed */ - unsigned trans; /* a counter-case equivalent, i.e. 
a corresponding uppercase - letter if flags & CHF_LOWER and a corresponding lowerrcase - letter if flags & CHF_UPPER */ - unsigned numval; /* Numeric value */ - unsigned accented[3]; /* For vowels - corresponding accented variant */ - unsigned deaccent; /* For accented vowels with diaeresis - corresponding - non-accented character */ -}; +#include "phoneme.h" /* See http://www.unicode.org/charts/PDF/Unicode-5.1/U51-0370.pdf */ struct char_info_st el_basic_ctype[] = { @@ -174,80 +161,149 @@ struct char_info_st el_basic_ctype[] = { { 0x0383, }, { 0x0384, CHF_MODIFIER }, /* Oxeia */ { 0x0385, CHF_MODIFIER }, /* dialytika */ - { 0x0386, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0391, 0x03AC }, /* Ά */ + { 0x0386, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0391, 0x03AC, + 0, { 0, 0, 0}, 0, LETTER_A_ACC, PHON_A }, /* Ά */ { 0x0387, CHF_PUNCT }, /* ano teleia */ - { 0x0388, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0395, 0x03AD }, /* Έ */ - { 0x0389, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x0397, 0x03AE }, /* Ή */ - { 0x038A, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x0399, 0x03AF }, /* Ί */ + { 0x0388, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0395, 0x03AD, + 0, { 0, 0, 0}, 0, LETTER_E_ACC, PHON_E }, /* Έ */ + { 0x0389, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0397, 0x03AE, + 0, { 0, 0, 0}, 0, LETTER_H_ACC, PHON_I }, /* Ή */ + { 0x038A, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0399, 0x03AF, + 0, { 0, 0, 0}, 0, LETTER_I_ACC, PHON_I }, /* Ί */ { 0x038B, }, - { 0x038C, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x039F, 0x03CC }, /* Ό */ + { 0x038C, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x039F, 0x03CC, + 0, { 0, 0, 0}, 0, LETTER_OMICRON_ACC, PHON_O }, /* Ό */ { 0x038D, }, - { 0x038E, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x03A5, 0x03CD }, /* Ύ */ - { 0x038F, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x03A9, 0x03CE }, /* Ώ */ - { 0x0390, CHF_VOWEL|CHF_LOWER|CHF_TREMA|CHF_OXEIA, 0x03B9, 0, 0, 0, 0, 0x03CA }, /* ΐ */ - { 0x0391, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1, 0, 0x03B1, 1, 0x0386 }, /* Α */ - { 0x0392, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 
0x03B2, 2 }, /* Β */ - { 0x0393, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B3, 3 }, /* Γ */ - { 0x0394, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B4, 4 }, /* Δ */ - { 0x0395, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1, 0, 0x03B5, 5, 0x0388 }, /* Ε */ - { 0x0396, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B6, 7 }, /* Ζ */ - { 0x0397, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B7, 8, 0x0389 }, /* Η */ - { 0x0398, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B8, 9 }, /* Θ */ - { 0x0399, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH2, 0, 0x03B9, 10, 0x038A }, /* Ι */ - { 0x039A, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BA, 20 }, /* Κ */ - { 0x039B, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BB, 30 }, /* Λ */ - { 0x039C, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BC, 40 }, /* Μ */ - { 0x039D, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BD, 50 }, /* Ν */ - { 0x039E, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BE, 60 }, /* Ξ */ - { 0x039F, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03BF, 70, 0x038C }, /* Ο */ - { 0x03A0, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C0, 80 }, /* Π */ - { 0x03A1, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C1, 100 }, /* Ρ */ + { 0x038E, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x03A5, 0x03CD, + 0, { 0, 0, 0}, 0, LETTER_Y_ACC, PHON_I }, /* Ύ */ + { 0x038F, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x03A9, 0x03CE, + 0, { 0, 0, 0}, 0, LETTER_OMEGA_ACC, PHON_O }, /* Ώ */ + { 0x0390, CHF_VOWEL|CHF_LOWER|CHF_TREMA|CHF_OXEIA, 0x03B9, 0, + 0, { 0, 0, 0}, 0x03CA, LETTER_I_TREMA_ACC, PHON_I }, /* ΐ */ + { 0x0391, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B1, + 1, { 0x0386, 0, 0}, 0, LETTER_A, PHON_A }, /* Α */ + { 0x0392, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B2, + 2, {0, 0, 0}, 0, LETTER_B, PHON_BH },/* Β */ + { 0x0393, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B3, + 3, {0, 0, 0}, 0, LETTER_G, PHON_GH }, /* Γ */ + { 0x0394, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B4, + 4, {0, 0, 0}, 0, LETTER_D, PHON_DH }, /* Δ */ + { 0x0395, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 
0x03B5, + 5, { 0x0388, 0, 0}, 0, LETTER_E, PHON_E }, /* Ε */ + { 0x0396, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B6, + 7, {0, 0, 0}, 0, LETTER_Z, PHON_Z }, /* Ζ */ + { 0x0397, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B7, + 8, {0x0389, 0, 0}, 0, LETTER_H, PHON_I }, /* Η */ + { 0x0398, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B8, + 9, {0, 0, 0}, 0, LETTER_TH, PHON_TH }, /* Θ */ + { 0x0399, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B9, + 10, { 0x038A, 0, 0}, 0, LETTER_I, PHON_I }, /* Ι */ + { 0x039A, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BA, + 20, {0, 0, 0}, 0, LETTER_K, PHON_K }, /* Κ */ + { 0x039B, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BB, + 30, {0, 0, 0}, 0, LETTER_L, PHON_L }, /* Λ */ + { 0x039C, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BC, + 40, {0, 0, 0}, 0, LETTER_M, PHON_M }, /* Μ */ + { 0x039D, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BD, + 50, {0, 0, 0}, 0, LETTER_N, PHON_N }, /* Ν */ + { 0x039E, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BE, + 60, {0, 0, 0}, 0, LETTER_KS, PHON_X }, /* Ξ */ + { 0x039F, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03BF, + 70, { 0x038C, 0, 0}, 0, LETTER_OMICRON, PHON_O }, /* Ο */ + { 0x03A0, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C0, + 80, {0, 0, 0}, 0, LETTER_P, PHON_P }, /* Π */ + { 0x03A1, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C1, + 100, {0, 0, 0}, 0, LETTER_R, PHON_R }, /* Ρ */ { 0x03A2, }, - { 0x03A3, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C3, 200 }, /* Σ */ - { 0x03A4, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C4, 300 }, /* Τ */ - { 0x03A5, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1|CHF_DIPH2, 0, 0x03C5, 400, 0x038E }, /* Υ */ - { 0x03A6, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C6, 500 }, /* Φ */ - { 0x03A7, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C7, 600 }, /* Χ */ - { 0x03A8, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C8, 700 }, /* Ψ */ - { 0x03A9, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03C9, 800, 0x038F }, /* Ω */ - { 0x03AA, CHF_VOWEL|CHF_UPPER|CHF_TREMA|CHF_DIPH2, 0x0399, 
0x03CA }, /* Ϊ */ - { 0x03AB, CHF_VOWEL|CHF_UPPER|CHF_TREMA, 0x03A5, 0x03CB }, /* Ϋ */ - { 0x03AC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B1, 0x0386 }, /* ά */ - { 0x03AD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B5, 0x0388 }, /* έ */ - { 0x03AE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03B7, 0x0389 }, /* ή */ - { 0x03AF, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03B9, 0x038A }, /* ί */ - { 0x03B0, CHF_VOWEL|CHF_OXEIA|CHF_TREMA, 0x03C5, 0, 0, 0, 0, 0x03CB }, /* ΰ */ - { 0x03B1, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1, 0, 0x0391, 1, 0x03AC }, /* α */ - { 0x03B2, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0392, 2 }, /* β */ - { 0x03B3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0393, 3 }, /* γ */ - { 0x03B4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0394, 4 }, /* δ */ - { 0x03B5, CHF_CONSONANT|CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1, 0, 0x0395, 5, 0x03AD }, /* ε */ - { 0x03B6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0396, 7 }, /* ζ */ - { 0x03B7, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1|CHF_DIPH2, 0, 0x0397, 8, 0x03AE }, /* η */ - { 0x03B8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0398, 9 }, /* θ */ - { 0x03B9, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0399, 10, 0x03AF }, /* ι */ - { 0x03BA, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039A, 20 }, /* κ */ - { 0x03BB, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039B, 30 }, /* λ */ - { 0x03BC, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039C, 40 }, /* μ */ - { 0x03BD, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039D, 50 }, /* ν */ - { 0x03BE, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039E, 60 }, /* ξ */ + { 0x03A3, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C3, + 200, {0, 0, 0}, 0, LETTER_S, PHON_S }, /* Σ */ + { 0x03A4, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C4, + 300, {0, 0, 0}, 0, LETTER_T, PHON_T }, /* Τ */ + { 0x03A5, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03C5, + 400, { 0x038E, 0, 0}, 0, LETTER_Y, PHON_I }, /* Υ */ + { 0x03A6, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C6, + 500, {0, 0, 0}, 0, LETTER_F, PHON_F }, /* Φ */ 
+ { 0x03A7, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C7, + 600, {0, 0, 0}, 0, LETTER_X, PHON_H }, /* Χ */ + { 0x03A8, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C8, + 700, {0, 0, 0}, 0, LETTER_PS, PHON_PS }, /* Ψ */ + { 0x03A9, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03C9, + 800, { 0x038F, 0, 0}, 0, LETTER_OMEGA, PHON_O }, /* Ω */ + { 0x03AA, CHF_VOWEL|CHF_UPPER|CHF_TREMA, 0x0399, 0x03CA, + 0, {0, 0, 0}, 0, LETTER_I_TREMA, PHON_I }, /* Ϊ */ + { 0x03AB, CHF_VOWEL|CHF_UPPER|CHF_TREMA, 0x03A5, 0x03CB, + 0, {0, 0, 0}, 0, LETTER_Y_TREMA, PHON_I }, /* Ϋ */ + { 0x03AC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B1, 0x0386, + 0, {0, 0, 0}, 0, LETTER_A_ACC, PHON_A }, /* ά */ + { 0x03AD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B5, 0x0388, + 0, {0, 0, 0}, 0, LETTER_E_ACC, PHON_E }, /* έ */ + { 0x03AE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B7, 0x0389, + 0, {0, 0, 0}, 0, LETTER_H_ACC, PHON_I }, /* ή */ + { 0x03AF, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B9, 0x038A, + 0, {0, 0, 0}, 0, LETTER_I_ACC, PHON_I }, /* ί */ + { 0x03B0, CHF_VOWEL|CHF_OXEIA|CHF_TREMA, 0x03C5, 0, + 0, { 0, 0, 0 }, 0x03CB, LETTER_Y_TREMA_ACC, PHON_I }, /* ΰ */ + { 0x03B1, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0391, + 1, {0x03AC, 0, 0}, 0, LETTER_A, PHON_A }, /* α */ + { 0x03B2, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0392, + 2, {0, 0, 0}, 0, LETTER_B, PHON_BH }, /* β */ + { 0x03B3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0393, + 3, {0, 0, 0}, 0, LETTER_G, PHON_GH }, /* γ */ + { 0x03B4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0394, + 4, {0, 0, 0}, 0, LETTER_D, PHON_DH }, /* δ */ + { 0x03B5, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0395, + 5, { 0x03AD, 0, 0}, 0, LETTER_E, PHON_E }, /* ε */ + { 0x03B6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0396, + 7, {0, 0, 0}, 0, LETTER_Z, PHON_Z }, /* ζ */ + { 0x03B7, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0397, + 8, { 0x03AE, 0, 0}, 0, LETTER_H, PHON_I }, /* η */ + { 0x03B8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0398, + 9, {0, 0, 0}, 0, LETTER_TH, PHON_TH }, /* θ */ + { 0x03B9, 
CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0399, + 10, {0x03AF, 0, 0}, 0, LETTER_I, PHON_I }, /* ι */ + { 0x03BA, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039A, + 20, {0, 0, 0}, 0, LETTER_K, PHON_K }, /* κ */ + { 0x03BB, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039B, + 30, {0, 0, 0}, 0, LETTER_L, PHON_L }, /* λ */ + { 0x03BC, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039C, + 40, {0, 0, 0}, 0, LETTER_M, PHON_M }, /* μ */ + { 0x03BD, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039D, + 50, {0, 0, 0}, 0, LETTER_N, PHON_N }, /* ν */ + { 0x03BE, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039E, + 60, {0, 0, 0}, 0, LETTER_KS, PHON_X }, /* ξ */ - { 0x03BF, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1, 0, 0x039F, 70, 0x03CC }, /* ο */ - { 0x03C0, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A0, 80 }, /* π */ - { 0x03C1, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A1, 100 }, /* ρ */ - { 0x03C2, CHF_CONSONANT|CHF_LOWER, 0, 0x03A3 }, /* ς */ - { 0x03C3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A3, 200 }, /* σ */ - { 0x03C4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A4, 300 }, /* τ */ - { 0x03C5, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH2, 0, 0x03A5, 400, 0x03CD }, /* υ */ - { 0x03C6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A6, 500 }, /* φ */ - { 0x03C7, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A7, 600 }, /* χ */ - { 0x03C8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A8, 700 }, /* ψ */ - { 0x03C9, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x03A9, 800, 0x03CE }, /* ω */ - { 0x03CA, CHF_VOWEL|CHF_LOWER|CHF_TREMA|CHF_DIPH2, 0x03B9, 0x03AA, 0, 0x0390 }, /* ϊ */ - { 0x03CB, CHF_VOWEL|CHF_LOWER|CHF_TREMA, 0x03C5, 0x03AB, 0, 0x03B0 }, /* ϋ */ - { 0x03CC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03BF, 0x038C }, /* ό */ - { 0x03CD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03C5, 0x038E }, /* ύ */ - { 0x03CE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03CE, 0x038F }, /* ώ */ + { 0x03BF, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x039F, + 70, {0x03CC, 0, 0}, 0, LETTER_OMICRON, PHON_O }, /* ο */ + { 0x03C0, 
CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A0, + 80, {0, 0, 0}, 0, LETTER_P, PHON_P }, /* π */ + { 0x03C1, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A1, + 100, {0, 0, 0}, 0, LETTER_R, PHON_R }, /* ρ */ + { 0x03C2, CHF_CONSONANT|CHF_LOWER, 0, 0x03A3, + 0, {0, 0, 0}, 0, LETTER_S, PHON_S }, /* ς */ + { 0x03C3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A3, + 200, {0, 0, 0}, 0, LETTER_S, PHON_S }, /* σ */ + { 0x03C4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A4, + 300, {0, 0, 0}, 0, LETTER_T, PHON_T }, /* τ */ + { 0x03C5, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x03A5, + 400, { 0x03CD, 0, 0}, 0, LETTER_Y, PHON_I }, /* υ */ + { 0x03C6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A6, + 500, {0, 0, 0}, 0, LETTER_F, PHON_F }, /* φ */ + { 0x03C7, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A7, + 600, {0, 0, 0}, 0, LETTER_X, PHON_H }, /* χ */ + { 0x03C8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A8, + 700, {0, 0, 0}, 0, LETTER_PS, PHON_PS }, /* ψ */ + { 0x03C9, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x03A9, + 800, {0x03CE, 0, 0}, 0, LETTER_OMEGA, PHON_O }, /* ω */ + { 0x03CA, CHF_VOWEL|CHF_LOWER|CHF_TREMA, 0x03B9, 0x03AA, + 0, {0x0390, 0, 0}, 0, LETTER_I_TREMA, PHON_I }, /* ϊ */ + { 0x03CB, CHF_VOWEL|CHF_LOWER|CHF_TREMA, 0x03C5, 0x03AB, + 0, {0x03B0, 0, 0}, 0, LETTER_Y_TREMA, PHON_I }, /* ϋ */ + { 0x03CC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03BF, 0x038C, + 0, {0, 0, 0}, 0, LETTER_OMICRON_ACC, PHON_O }, /* ό */ + { 0x03CD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03C5, 0x038E, + 0, {0, 0, 0}, 0x03C5, LETTER_Y_ACC, PHON_I }, /* ύ */ + { 0x03CE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03C9, 0x038F, + 0, {0, 0, 0}, 0x03C9, LETTER_OMEGA_ACC, PHON_O }, /* ώ */ { 0x03CF, CHF_SYMBOL|CHF_UPPER, 0x03D7 }, /* KAI */ { 0x03D0, CHF_CONSONANT|CHF_LOWER, 0, 0x0392 }, /* curled beta */ { 0x03D1, CHF_CONSONANT|CHF_LOWER, 0, 0x0398 }, /* script theta */ @@ -561,7 +617,7 @@ struct char_info_st el_extended_ctype[] = { { 0x1FFF, } }; -static struct char_info_st * +struct char_info_st const * elchr_info(unsigned 
ch) { if (ch >= 0x0300 && ch <= 0x03FF) @@ -574,10 +630,24 @@ elchr_info(unsigned ch) int elchr_flags(unsigned ch) { - struct char_info_st *ci = elchr_info(ch); + struct char_info_st const *ci = elchr_info(ch); return ci ? ci->flags : 0; } +int +elchr_letter(unsigned ch) +{ + struct char_info_st const *ci = elchr_info(ch); + return ci ? ci->letter : 0; +} + +int +elchr_phoneme(unsigned ch) +{ + struct char_info_st const *ci = elchr_info(ch); + return ci ? ci->phoneme : 0; +} + int elchr_isupper(unsigned ch) { @@ -654,35 +724,35 @@ elchr_isnumeric(unsigned ch) unsigned elchr_numeric_value(unsigned ch) { - struct char_info_st *ci = elchr_info(ch); + struct char_info_st const *ci = elchr_info(ch); return (ci && (ci->flags & CHF_NUMERIC)) ? ci->numval: 0; } unsigned elchr_toupper(unsigned ch) { - struct char_info_st *ci = elchr_info(ch); + struct char_info_st const *ci = elchr_info(ch); return (ci && (ci->flags & CHF_LOWER)) ? ci->trans: ch; } unsigned elchr_tolower(unsigned ch) { - struct char_info_st *ci = elchr_info(ch); + struct char_info_st const *ci = elchr_info(ch); return (ci && (ci->flags & CHF_UPPER)) ? ci->trans : ch; } unsigned elchr_base(unsigned ch) { - struct char_info_st *ci = elchr_info(ch); + struct char_info_st const *ci = elchr_info(ch); return (ci && (ci->flags & CHF_ACCENT_MASK) && ci->base) ? ci->base : ch; } unsigned elchr_deaccent(unsigned ch) { - struct char_info_st *ci = elchr_info(ch); + struct char_info_st const *ci = elchr_info(ch); if (ci && (ci->flags & CHF_ACCENT_MASK)) return ci->deaccent ? ci->deaccent : ci->base ? ci->base : ch; return ch; @@ -691,28 +761,7 @@ elchr_deaccent(unsigned ch) unsigned elchr_accent(unsigned ch, int acc) { - struct char_info_st *ci = elchr_info(ch); + struct char_info_st const *ci = elchr_info(ch); return (ci && ci->accented[acc-1]) ? 
ci->accented[acc-1] : ch; } -int -elchr_diphthong(unsigned ch, int state) -{ - struct char_info_st *ci = elchr_info(ch); - - if (!ci || !(ci->flags & CHF_VOWEL)) - return 0; - switch (state) { - case 0: - if (ci->flags & CHF_DIPH1) - state = 1; - break; - case 1: - if (ci->flags & CHF_DIPH2) - state = 2; - break; - default: - state = 0; - } - return state; -} diff --git a/src/ellinika/elmorph-public.scm b/src/ellinika/elmorph-public.scm new file mode 100644 index 0000000..329fe4a --- /dev/null +++ b/src/ellinika/elmorph-public.scm @@ -0,0 +1,106 @@ +;;;; This file is part of Ellinika project. +;;;; Copyright (C) 2011 Sergey Poznyakoff +;;;; +;;;; Ellinika is free software; you can redistribute it and/or modify +;;;; it under the terms of the GNU General Public License as published by +;;;; the Free Software Foundation; either version 3 of the License, or +;;;; (at your option) any later version. +;;;; +;;;; Ellinika is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;;; GNU General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU General Public License +;;;; along with this program. If not, see . + +(use-modules ((srfi srfi-1))) + +(define-public (elstr-trim word n) + (let ((word (if (string? word) + (string->elstr word) + word))) + (cond + ((> n 0) + (elstr-slice word n (- (elstr-length word) n))) + ((< n 0) + (elstr-slice word 0 (+ (elstr-length word) n))) + (else + word)))) + +(define-public (elstr-trim! word n) + (let ((word (if (string? word) + (string->elstr word) + word))) + (cond + ((> n 0) + (elstr-slice! word n (- (elstr-length word) n))) + ((< n 0) + (elstr-slice! 
word 0 (+ (elstr-length word) n)))))) + +(define-public (phoneme:code ph) + (list-ref ph 0)) + +(define-public (phoneme:start ph) + (list-ref ph 1)) + +(define-public (phoneme:count ph) + (list-ref ph 2)) + +(define-public (phoneme:flags ph) + (list-ref ph 3)) + +(define-public (phoneme:accented? ph) + (logand (phoneme:flags ph) elmorph:accent-mask)) + +(define-public (phoneme:vowel? ph) + (= (logand (phoneme:flags ph) elmorph:vowel))) + +(define-public (phoneme:consonant? ph) + (= (logand (phoneme:flags ph) elmorph:consonant))) + +(define-public (phoneme:diphthong? ph) + (= (logand (phoneme:flags ph) elmorph:diphthong))) + + +(define soundslike-transcription-list + '((1 . "a") + (2 . "e") + (3 . "i") + (4 . "o") + (5 . "u") + (6 . "b") + (7 . "g") + (8 . "d") + (9 . "z") + (10 . "t") + (11 . "k") + (12 . "l") + (13 . "m") + (14 . "n") + (15 . "x") + (16 . "p") + (17 . "r") + (18 . "s") + (19 . "t") + (20 . "f") + (21 . "h") + (22 . "P") + (23 . "b") + (24 . "d") + (25 . "g") + (26 . "sm") + (27 . "ts") + (28 . "tz") + (29 . "ngz") + (30 . "au") + (31 . "eu"))) + +(define-public (elstr->soundslike word) + (let ((phon-map (elstr->phonetic-map word))) + (apply string-append + (filter-map + (lambda (elt) + (assoc-ref soundslike-transcription-list (phoneme:code elt))) + phon-map)))) + diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c index 1831610..5a8acdf 100644 --- a/src/ellinika/elmorph.c +++ b/src/ellinika/elmorph.c @@ -26,55 +26,63 @@ struct elstr { unsigned *str; /* UTF-8 string */ size_t len; /* Its length */ + + struct phoneme *phoneme; /* Phonetical map*/ + unsigned phoneme_count; /* Number of phonemes */ + + struct syllable *sylmap; /* Syllable map (nsyl elements) */ unsigned nsyl; /* Number of syllables. 
*/ - unsigned *sylmap; /* Syllable map (nsyl elements) */ unsigned acc_syl; /* Number of the accented syllable (1-based, from the last syllable) */ unsigned acc_pos; /* Number of the accented character (0-based, from str[0]) */ + }; scm_t_bits _elstr_tag; static void -_elstr_syllabize(struct elstr *elstr) -{ - unsigned *sylmap; - unsigned i, nsyl = 0, accchr = 0; - int accsyl = -1; - int dstate = 0; - int acc = 0; - - if (!elstr->sylmap) { - elstr->sylmap = calloc(elstr->len, sizeof(sylmap[0])); - if (!elstr->sylmap) - scm_memory_error("_elstr_syllabize"); - } - sylmap = elstr->sylmap; +_elstr_syllabize(struct elstr *elstr, const char *func_name) +{ + unsigned i; + + free(elstr->phoneme); + free(elstr->sylmap); - for (i = 0; i < elstr->len; i++) { - int nstate; - - if (elchr_getaccent(elstr->str[i])) { - accsyl = nsyl; - accchr = i; - } - nstate = elchr_diphthong(elstr->str[i], dstate); - if (nstate) - /* skip */; - else if (dstate) - sylmap[nsyl++] = i - 1; - else if (elchr_isvowel(elstr->str[i])) - sylmap[nsyl++] = i; - dstate = nstate; + if (phoneme_map(&elstr->phoneme, &elstr->phoneme_count, + elstr->str, elstr->len)) + scm_misc_error(func_name, + "cannot create phonetic map: ~S", + scm_from_int(errno)); + + if (syllable_map(&elstr->sylmap, &elstr->nsyl, + elstr->phoneme, elstr->phoneme_count)) + scm_misc_error(func_name, + "cannot create syllable map: ~S", + scm_from_int(errno)); + + for (i = elstr->nsyl; i > 0; i--) { + if (elstr->sylmap[elstr->nsyl - i].flags & CHF_ACCENT_MASK) + break; } - if (dstate) - sylmap[nsyl++] = i - 1; - else if (nsyl) - sylmap[nsyl-1] = i - 1; - elstr->nsyl = nsyl; - elstr->acc_pos = accchr; - elstr->acc_syl = (accsyl >= 0) ? 
nsyl - accsyl : 0; + elstr->acc_syl = i; + for (i = 0; i < elstr->len; i++) + if (elchr_getaccent(elstr->str[i])) + break; + elstr->acc_pos = i; +} + +static void +invalidate_maps(struct elstr *elstr) +{ + free(elstr->sylmap); + elstr->sylmap = NULL; + elstr->nsyl = 0; + free(elstr->phoneme); + elstr->phoneme = NULL; + elstr->phoneme_count = 0; + elstr->acc_pos = 0; + elstr->acc_syl = 0; } static SCM @@ -89,7 +97,7 @@ _elstr_alloc_empty(struct elstr **pelstr) } static SCM -_elstr_alloc(const char *instr, int syl) +_elstr_alloc(const char *instr, int syl, const char *func_name) { struct elstr *elstr; unsigned *wptr; @@ -105,8 +113,10 @@ _elstr_alloc(const char *instr, int syl) elstr->nsyl = 0; elstr->acc_syl = 0; elstr->acc_pos = 0; + elstr->phoneme = 0; + elstr->phoneme_count = 0; if (syl) - _elstr_syllabize(elstr); + _elstr_syllabize(elstr, func_name); SCM_RETURN_NEWSMOB(_elstr_tag, elstr); } @@ -120,19 +130,34 @@ _elstr_dup(struct elstr *elstr) elnew->str = calloc(elstr->len, sizeof(elnew->str[0])); if (!elnew->str) scm_memory_error("_elstr_dup"); + memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len); + elnew->len = elstr->len; + + if (elstr->phoneme) { + elnew->phoneme = calloc(elstr->phoneme_count, + sizeof(elnew->phoneme[0])); + if (!elnew->phoneme) { + free(elnew->str); + scm_memory_error("_elstr_dup"); + } + memcpy(elnew->phoneme, elstr->phoneme, + sizeof(elstr->phoneme[0]) * elstr->phoneme_count); + } else + elnew->phoneme = NULL; + elnew->phoneme_count = elstr->phoneme_count; + if (elstr->sylmap) { elnew->sylmap = calloc(elstr->nsyl, sizeof(elnew->sylmap[0])); if (!elnew->sylmap) { free(elnew->str); scm_memory_error("_elstr_dup"); } + memcpy(elnew->sylmap, elstr->sylmap, + sizeof(elstr->sylmap[0]) * elstr->nsyl); } else elnew->sylmap = NULL; - memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len); - elnew->len = elstr->len; elnew->nsyl = elstr->nsyl; - memcpy(elnew->sylmap, elstr->sylmap, - sizeof(elstr->sylmap[0]) * 
elstr->nsyl); + elnew->acc_syl = elstr->acc_syl; elnew->acc_pos = elstr->acc_pos; SCM_RETURN_NEWSMOB(_elstr_tag, elnew); @@ -162,6 +187,7 @@ _elstr_free(SCM smob) struct elstr *elstr = (struct elstr *) SCM_CDR(smob); free(elstr->str); free(elstr->sylmap); + free(elstr->phoneme); scm_gc_free(elstr, sizeof(struct elstr), "elstr"); return 0; } @@ -170,34 +196,31 @@ static int _elstr_print(SCM smob, SCM port, scm_print_state *pstate) { struct elstr *elstr = (struct elstr *) SCM_CDR(smob); - int i, j, an; + int i, j; char *s; scm_puts("#sylmap) { scm_puts("``", port); - an = elstr->nsyl - elstr->acc_syl; - if (an == 0) - scm_puts("[", port); - for (i = j = 0; i < elstr->len; i++) { - char r[6]; - int n; - - if (i == elstr->sylmap[j] + 1) { - if (j == an) - scm_puts("]", port); + for (i = 0; i < elstr->nsyl; i++) { + size_t start = elstr->sylmap[i].char_start; + if (i) scm_puts("-", port); - if (++j == an) - scm_puts("[", port); + if (elstr->sylmap[i].flags & CHF_ACCENT_MASK) + scm_puts("[", port); + for (j = 0; j < elstr->sylmap[i].char_count; j++) { + char r[6]; + int n; + + n = utf8_wctomb(r, elstr->str[start+j]); + if (n == -1) + continue; + r[n] = 0; + scm_puts(r, port); } - n = utf8_wctomb(r, elstr->str[i]); - if (n == -1) - continue; - r[n] = 0; - scm_puts(r, port); + if (elstr->sylmap[i].flags & CHF_ACCENT_MASK) + scm_puts("]", port); } - if (j == an) - scm_puts("]", port); } else { scm_puts("[NS] ``", port); for (i = j = 0; i < elstr->len; i++) { @@ -238,7 +261,7 @@ force_elstr(struct elstr **ep, SCM scm, int sylopt, SCM_ASSERT(scm_is_string(scm), scm, arg, func_name); str = scm_to_locale_string(scm); - newscm = _elstr_alloc(str, sylopt); + newscm = _elstr_alloc(str, sylopt, func_name); free(str); if (newscm == SCM_EOL) scm_misc_error(func_name, @@ -336,13 +359,10 @@ SCM_DEFINE_PUBLIC(scm_elstr_syllable_prop, "elstr-syllable-prop", "cannot get syllable #~S: not enough syllables: ~S", scm_list_2(el, n)); num = elstr->nsyl - num; - if (num == 0) - start = 0; - 
else - start = elstr->sylmap[num - 1] + 1; - return scm_cons(scm_from_uint(start), - scm_from_uint(elstr->sylmap[num])); + return scm_list_3(scm_from_uint(elstr->sylmap[num].char_start), + scm_from_uint(elstr->sylmap[num].char_count), + scm_from_int(elstr->sylmap[num].flags)); } #undef FUNC_NAME @@ -388,12 +408,8 @@ SCM_DEFINE_PUBLIC(scm_elstr_syllable, "elstr-syllable", "cannot get syllable #~S: not enough syllables: ~S", scm_list_2(el, n)); num = elstr->nsyl - num; - if (num == 0) - start = 0; - else - start = elstr->sylmap[num - 1] + 1; - if (utf8_wc_to_mbstr(elstr->str + start, - elstr->sylmap[num] - start + 1, + if (utf8_wc_to_mbstr(elstr->str + elstr->sylmap[num].char_start, + elstr->sylmap[num].char_count, &s)) scm_misc_error(FUNC_NAME, "cannot convert elstr to Scheme", @@ -514,8 +530,7 @@ _elstr_deaccent(SCM el, int destructive, const char *func_name) } for (i = 0; i < elstr->len; i++) elstr->str[i] = elchr_deaccent(elstr->str[i]); - elstr->acc_pos = 0; - elstr->acc_syl = 0; + invalidate_maps(elstr); return scm; } @@ -544,9 +559,10 @@ _elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name) { struct elstr *elstr; unsigned i; - unsigned acc_num, num, len, start; + unsigned acc_num, num, start; SCM scm; - int dstate; + unsigned pos; + struct phoneme *phoneme = NULL; if (destructive) { SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); @@ -556,15 +572,11 @@ _elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name) SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name); num = scm_to_uint(n); - if (num > elstr->nsyl) + if (num == 0 | num > elstr->nsyl) scm_misc_error(func_name, "cannot set accent on syllable #~S: not enough syllables: ~S", scm_list_2(n, el)); acc_num = elstr->nsyl - num; - if (acc_num == 0) - start = 0; - else - start = elstr->sylmap[acc_num - 1] + 1; if (destructive) scm = SCM_UNSPECIFIED; @@ -576,25 +588,38 @@ _elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name) /* Clear all accents */ for 
(i = 0; i < elstr->len; i++) elstr->str[i] = elchr_deaccent(elstr->str[i]); - len = elstr->sylmap[acc_num] - start + 1; - dstate = 0; - for (i = start; i <= start + len; i++) { - int nstate; - - if (!elchr_isvowel(elstr->str[i])) { - if (dstate) { - --i; - break; - } - continue; - } - nstate = elchr_diphthong(elstr->str[i], dstate); - if (!nstate) + for (i = 0; i < elstr->nsyl; i++) + elstr->sylmap[i].flags &= ~CHF_ACCENT_MASK; + for (i = 0; i < elstr->phoneme_count; i++) + elstr->phoneme[i].flags &= ~CHF_ACCENT_MASK; + + start = elstr->sylmap[acc_num].phoneme_start; + pos = 0; + for (i = 0; i < elstr->sylmap[acc_num].phoneme_count; i++) { + struct phoneme *ph = elstr->phoneme + start + i; + if (ph->flags & CHF_CONSONANT) + /* skip */ ; + else if (ph->flags & CHF_DIPHTHONG) { + phoneme = ph; + pos = ph->start + 1; + break; + } else if (ph->flags & CHF_VOWEL) { + phoneme = ph; + pos = ph->start; break; - dstate = nstate; + } } - elstr->str[i] = elchr_accent(elstr->str[i], CHF_OXEIA); + if (!phoneme) + scm_misc_error(func_name, + "cannot set accent on syllable #~S of ~S: " + "INTERNAL ERROR", + scm_list_2(n, el)); + phoneme->flags |= CHF_OXEIA; + elstr->sylmap[acc_num].flags |= CHF_OXEIA; + elstr->str[pos] = elchr_accent(elstr->str[pos], CHF_OXEIA); + elstr->acc_syl = num; + elstr->acc_pos = pos; return scm; } @@ -652,7 +677,8 @@ _elstr_set_accent_on_char(SCM el, SCM n, int destructive, const char *func_name) elstr->str[i] = elchr_deaccent(elstr->str[i]); elstr->str[num] = elchr_accent(elstr->str[num], CHF_OXEIA); - _elstr_syllabize(elstr); + invalidate_maps(elstr); + _elstr_syllabize(elstr, func_name); return scm; } @@ -716,11 +742,31 @@ static struct deftab { { CHF_LOWER, "elmorph:lower" }, { CHF_UPPER, "elmorph:upper" }, { CHF_NUMERIC, "elmorph:numeric" }, - - { CHF_DIPH1, "elmorph:diph1" }, - { CHF_DIPH2, "elmorph:diph2" } + { CHF_DIPHTHONG, "elmorph:diphthong" }, }; - + +SCM_DEFINE_PUBLIC(scm_elstr_char_phoneme, "elstr-char-phoneme", + 2, 0, 0, + (SCM el, SCM 
n),
+"Returns a phoneme code of the Nth char in EL\n")
+#define FUNC_NAME s_scm_elstr_char_phoneme
+{
+	struct elstr *elstr;
+	int num;
+
+	force_elstr(&elstr, el, 0, SCM_ARG1, FUNC_NAME);
+	SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
+	num = scm_to_int(n);
+	if (num < 0) /* a negative N is offset by the length of EL */
+		num += elstr->len;
+	if (num < 0 || num >= elstr->len) /* still out of range either way */
+		scm_misc_error(FUNC_NAME,
+			       "cannot get character #~S: not enough characters: ~S",
+			       scm_list_2(n, el));
+	return scm_from_uint(elchr_phoneme(elstr->str[num]));
+}
+#undef FUNC_NAME
+
 SCM_DEFINE_PUBLIC(scm_utf8_toupper, "utf8-toupper", 1, 0, 0,
 		  (SCM string),
 "Convert STRING to uppercase\n")
@@ -818,7 +864,8 @@ _elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name)
 		SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
 		elstr = (struct elstr*) SCM_CDR(el);
 	} else
-		scm = force_elstr(&elstr, el, 1, SCM_ARG1, func_name);
+		scm = force_elstr(&elstr, el, 0, SCM_ARG1, func_name);
+	invalidate_maps(elstr);
 	SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name);
 	SCM_ASSERT(scm_is_integer(l), l, SCM_ARG3, func_name);
 	num = scm_to_int(n);
@@ -842,7 +889,7 @@ _elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name)
 	memmove(elstr->str, elstr->str + num,
 		sizeof(elstr->str[0]) * len);
 	elstr->len = len;
-	_elstr_syllabize(elstr);
+	_elstr_syllabize(elstr, func_name);
 	return scm;
 }
 
@@ -869,7 +916,7 @@ SCM_DEFINE_PUBLIC(scm_elstr_slice_x, "elstr-slice!",
 SCM_DEFINE_PUBLIC(scm_elstr_index, "elstr-index",
 		  2, 0, 0,
 		  (SCM word, SCM needle),
-"")
+"Returns position of NEEDLE in the WORD")
 #define FUNC_NAME s_scm_elstr_index
 {
 	struct elstr *elstr, *ep;
@@ -920,10 +967,10 @@ SCM_DEFINE_PUBLIC(scm_elstr_suffix_p, "elstr-suffix?",
 SCM_DEFINE_PUBLIC(scm_elstr_append, "elstr-append",
 		  0, 0, 1,
 		  (SCM rest),
-"")
+"Concatenates arguments.\n")
 #define FUNC_NAME s_scm_elstr_append
 {
-	SCM ret = _elstr_alloc("", 0);
+	SCM ret = _elstr_alloc("", 0, FUNC_NAME);
 	struct elstr *elstr = (struct elstr*) SCM_CDR(ret);
 
 	for (; !scm_is_null(rest);
rest = SCM_CDR(rest)) {
@@ -933,11 +980,48 @@ SCM_DEFINE_PUBLIC(scm_elstr_append, "elstr-append",
 		force_elstr(&elt, val, 0, SCM_ARGn, FUNC_NAME);
 		_elstr_concat(elstr, elt, FUNC_NAME);
 	}
-	_elstr_syllabize(elstr);
+	_elstr_syllabize(elstr, FUNC_NAME);
 	return ret;
 }
 #undef FUNC_NAME
+
+static SCM
+elmorph_scm_from_phoneme(struct phoneme *phoneme)
+{
+	return scm_list_4(scm_from_int(phoneme->code),
+			  scm_from_uint(phoneme->start),
+			  scm_from_uint(phoneme->count),
+			  scm_from_int(phoneme->flags)); /* bitmask, not a boolean */
+}
 
+SCM_DEFINE_PUBLIC(scm_elstr__phonetic_map, "elstr->phonetic-map",
+		  1, 0, 0,
+		  (SCM word),
+"Converts WORD to a phonetic map.\n")
+#define FUNC_NAME s_scm_elstr__phonetic_map
+{
+	struct elstr *elstr;
+	struct phoneme *phmap;
+	size_t phlen, i;
+	SCM head = SCM_EOL, tail = SCM_EOL;
+
+	force_elstr(&elstr, word, 1, SCM_ARG1, FUNC_NAME);
+	phmap = elstr->phoneme;
+	phlen = elstr->phoneme_count;
+	for (i = 0; i < phlen; i++) {
+		SCM elt = scm_cons(elmorph_scm_from_phoneme(phmap + i),
+				   SCM_EOL);
+		if (scm_is_null(head))
+			head = tail = elt;
+		else {
+			SCM_SETCDR(tail, elt);
+			tail = elt;
+		}
+	}
+	/* phmap aliases elstr->phoneme, which elstr keeps using: do not free it */
+	return head;
+}
+#undef FUNC_NAME
 
 void
 scm_init_ellinika_elmorph_module()
diff --git a/src/ellinika/elmorph.h b/src/ellinika/elmorph.h
index eacbde5..2399b8a 100644
--- a/src/ellinika/elmorph.h
+++ b/src/ellinika/elmorph.h
@@ -33,10 +33,82 @@
 #define CHF_UPPER 0x02000
 #define CHF_NUMERIC 0x04000
 
-#define CHF_DIPH1 0x10000
-#define CHF_DIPH2 0x20000
+#define CHF_DIPHTHONG 0x08000
 
+/* Phonemes */
+#define PHON_A 1 /* α */
+#define PHON_E 2 /* ε αι */
+#define PHON_I 3 /* ι η υ ει οι υι */
+#define PHON_O 4 /* ο ω */
+#define PHON_U 5 /* ου */
+
+#define PHON_BH 6 /* β */
+#define PHON_GH 7 /* γ */
+#define PHON_DH 8 /* δ */
+#define PHON_Z 9 /* ζ */
+#define PHON_TH 10 /* θ */
+#define PHON_K 11 /* κ */
+#define PHON_L 12 /* λ */
+#define PHON_M 13 /* μ */
+#define PHON_N 14 /* ν */
+#define PHON_X 15 /* ξ */
+#define PHON_P 16 /* π */
+#define PHON_R 17 /* ρ */
+#define PHON_S 18 /* σ */ +#define PHON_T 19 /* τ */ +#define PHON_F 20 /* φ */ +#define PHON_H 21 /* χ */ +#define PHON_PS 22 /* ψ */ + +#define PHON_B 23 /* μπ */ +#define PHON_D 24 /* ντ */ +#define PHON_G 25 /* γγ γκ γχ */ +#define PHON_ZM 26 /* σμ */ +#define PHON_TS 27 /* τσ */ +#define PHON_DZ 28 /* τζ */ +#define PHON_NGZ 29 /* νγζ */ + +#define PHON_AV 30 /* αυ */ +#define PHON_EV 31 /* ευ */ + +#define _PHON_MAX 32 + +struct phoneme { + int code; /* Phoneme code */ + unsigned start; /* Start of phoneme */ + unsigned count; /* Number of characters in it */ + int flags; +}; + +struct syllable { + unsigned char_start; /* Start of syllable */ + unsigned char_count; /* Number of characters in it */ + unsigned phoneme_start; + unsigned phoneme_count; + int flags; +}; + +struct char_info_st { + unsigned ch; /* Characters */ + int flags; /* Flags (see above) */ + unsigned base; /* for vowels - a corresponding vowel with + all diacritics removed */ + unsigned trans; /* a counter-case equivalent, i.e. 
a + corresponding uppercase letter if + flags & CHF_LOWER and a corresponding + lowercase letter if flags & CHF_UPPER */ + unsigned numval; /* Numeric value */ + unsigned accented[3]; /* For vowels - corresponding accented variant */ + unsigned deaccent; /* For accented vowels with diaeresis - + corresponding non-accented character */ + int letter; /* Letter code */ + int phoneme; /* Phoneme code */ +}; + +struct char_info_st const *elchr_info(unsigned ch); int elchr_flags(unsigned ch); +int elchr_letter(unsigned ch); +int elchr_phoneme(unsigned ch); int elchr_isupper(unsigned ch); int elchr_islower(unsigned ch); int elchr_getaccent(unsigned ch); @@ -55,8 +127,12 @@ unsigned elchr_tolower(unsigned ch); unsigned elchr_base(unsigned ch); unsigned elchr_deaccent(unsigned ch); unsigned elchr_accent(unsigned ch, int acc); -int elchr_diphthong(unsigned ch, int state); int elmorph_thema_aoristoy(unsigned *word, size_t len, unsigned **thema, size_t *tlen); + +int phoneme_map(struct phoneme **pph, size_t *plen, + unsigned *word, size_t len); +int syllable_map(struct syllable **psyl, size_t *plen, + struct phoneme *phon, size_t nphon); diff --git a/src/ellinika/elmorph.scm4 b/src/ellinika/elmorph.scm4 index f916d1c..ede4d50 100644 --- a/src/ellinika/elmorph.scm4 +++ b/src/ellinika/elmorph.scm4 @@ -20,27 +20,4 @@ "LIBDIR/libguile-elmorph-v-VERSION" "scm_init_ellinika_elmorph_module") -(define-public (elstr-trim word n) - (let ((word (if (string? word) - (string->elstr word) - word))) - (cond - ((> n 0) - (elstr-slice word n (- (elstr-length word) n))) - ((< n 0) - (elstr-slice word 0 (+ (elstr-length word) n))) - (else - word)))) - -(define-public (elstr-trim! word n) - (let ((word (if (string? word) - (string->elstr word) - word))) - (cond - ((> n 0) - (elstr-slice! word n (- (elstr-length word) n))) - ((< n 0) - (elstr-slice! 
word 0 (+ (elstr-length word) n))))))
-
-
-
+include([elmorph-public.scm])
diff --git a/src/ellinika/phoneme.y b/src/ellinika/phoneme.y
new file mode 100644
index 0000000..353d175
--- /dev/null
+++ b/src/ellinika/phoneme.y
@@ -0,0 +1,353 @@
+/* This file is part of Ellinika project.
+   Copyright (C) 2011 Sergey Poznyakoff
+
+   Ellinika is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   Ellinika is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+%{
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include "utf8.h"
+#include "elmorph.h"
+
+static struct phoneme *phoneme_base; /* growing array of parsed phonemes */
+static size_t phoneme_max;           /* allocated capacity of phoneme_base */
+static size_t phoneme_count;         /* number of entries in use */
+static int error_state;              /* sticky ENOMEM set by phoneme_append */
+
+#define PHONEME_MAP_INITIAL_ALLOC 16
+
+static void
+phoneme_append(struct phoneme *phoneme)
+{
+	if (error_state)
+		return;
+
+	if (phoneme_max == phoneme_count) {
+		struct phoneme *np;
+		size_t nsize;
+
+		if (!phoneme_max)
+			nsize = PHONEME_MAP_INITIAL_ALLOC;
+		else {
+			nsize = 2 * phoneme_max;
+			if (nsize < phoneme_max) {
+				error_state = ENOMEM;
+				return;
+			}
+		}
+		np = realloc(phoneme_base, nsize * sizeof(phoneme_base[0]));
+		if (!np) {
+			error_state = ENOMEM;
+			return;
+		}
+		phoneme_max = nsize;
+		phoneme_base = np;
+	}
+	phoneme_base[phoneme_count++] = *phoneme;
+}
+
+#define DIPHTHONG(a,b,pc,fl) do {					\
+		(a).count = 2;						\
+		(a).code = (pc);					\
+		(a).flags = (fl) | CHF_DIPHTHONG |			\
+			(((a).flags | (b).flags) & CHF_ACCENT_MASK);	\
+	} while (0)
+
+%}
+%union {
+	struct phoneme phoneme;
+};
+
+%token <phoneme> LETTER_A 1
+%token <phoneme>
LETTER_A_ACC 2
+%token <phoneme> LETTER_B 3
+%token <phoneme> LETTER_G 4
+%token <phoneme> LETTER_D 5
+%token <phoneme> LETTER_E 6
+%token <phoneme> LETTER_E_ACC 7
+%token <phoneme> LETTER_Z 8
+%token <phoneme> LETTER_H 9
+%token <phoneme> LETTER_H_ACC 10
+%token <phoneme> LETTER_TH 11
+%token <phoneme> LETTER_I 12
+%token <phoneme> LETTER_I_ACC 13
+%token <phoneme> LETTER_I_TREMA 14
+%token <phoneme> LETTER_I_TREMA_ACC 15
+%token <phoneme> LETTER_K 16
+%token <phoneme> LETTER_L 17
+%token <phoneme> LETTER_M 18
+%token <phoneme> LETTER_N 19
+%token <phoneme> LETTER_KS 20
+%token <phoneme> LETTER_OMICRON 21
+%token <phoneme> LETTER_OMICRON_ACC 22
+%token <phoneme> LETTER_P 23
+%token <phoneme> LETTER_R 24
+%token <phoneme> LETTER_S 25
+%token <phoneme> LETTER_T 26
+%token <phoneme> LETTER_Y 27
+%token <phoneme> LETTER_Y_ACC 28
+%token <phoneme> LETTER_Y_TREMA 29
+%token <phoneme> LETTER_Y_TREMA_ACC 30
+%token <phoneme> LETTER_F 31
+%token <phoneme> LETTER_X 32
+%token <phoneme> LETTER_PS 33
+%token <phoneme> LETTER_OMEGA 34
+%token <phoneme> LETTER_OMEGA_ACC 35
+
+%type <phoneme> monophthong diphthong phoneme
+
+%%
+input   : phoneme
+          {
+		  phoneme_append(&$1);
+	  }
+        | input phoneme
+          {
+		  phoneme_append(&$2);
+	  }
+        ;
+
+phoneme : monophthong
+        | diphthong
+        ;
+
+monophthong:
+          LETTER_A
+        | LETTER_A_ACC
+        | LETTER_B
+        | LETTER_G
+        | LETTER_D
+        | LETTER_E
+        | LETTER_E_ACC
+        | LETTER_Z
+        | LETTER_H
+        | LETTER_H_ACC
+        | LETTER_TH
+        | LETTER_I
+        | LETTER_I_ACC
+        | LETTER_I_TREMA
+        | LETTER_I_TREMA_ACC
+        | LETTER_K
+        | LETTER_L
+        | LETTER_M
+        | LETTER_N
+        | LETTER_KS
+        | LETTER_OMICRON
+        | LETTER_OMICRON_ACC
+        | LETTER_P
+        | LETTER_R
+        | LETTER_S
+        | LETTER_T
+        | LETTER_Y
+        | LETTER_Y_ACC
+        | LETTER_Y_TREMA
+        | LETTER_Y_TREMA_ACC
+        | LETTER_F
+        | LETTER_X
+        | LETTER_PS
+        | LETTER_OMEGA
+        | LETTER_OMEGA_ACC
+        ;
+
+diphthong:
+          LETTER_A LETTER_I
+          {
+		  DIPHTHONG($1, $2, PHON_E, CHF_VOWEL);
+		  $$ = $1;
+	  }
+        | LETTER_A LETTER_I_ACC
+          {
+		  DIPHTHONG($1, $2, PHON_E, CHF_VOWEL);
+		  $$ = $1;
+	  }
+        | LETTER_E LETTER_I
+          {
+		  DIPHTHONG($1, $2, PHON_I, CHF_VOWEL);
+		  $$ = $1;
+	  }
+        | LETTER_E LETTER_I_ACC
+          {
+		  DIPHTHONG($1, $2, PHON_I, CHF_VOWEL);
+		  $$ = $1;
+	  }
+        | LETTER_OMICRON LETTER_I
+          {
+		  DIPHTHONG($1, $2, PHON_I, CHF_VOWEL);
+		  $$ = $1;
+	  }
+        | LETTER_OMICRON LETTER_I_ACC
+          {
+		  DIPHTHONG($1, $2, PHON_I, CHF_VOWEL);
+ $$ = $1; + } + | LETTER_Y LETTER_I + { + DIPHTHONG($1, $2, PHON_I, CHF_VOWEL); + $$ = $1; + } + | LETTER_Y LETTER_I_ACC + { + DIPHTHONG($1, $2, PHON_I, CHF_VOWEL); + $$ = $1; + } + | LETTER_OMICRON LETTER_Y + { + DIPHTHONG($1, $2, PHON_U, CHF_VOWEL); + $$ = $1; + } + | LETTER_OMICRON LETTER_Y_ACC + { + DIPHTHONG($1, $2, PHON_U, CHF_VOWEL); + $$ = $1; + } + | LETTER_M LETTER_P + { + DIPHTHONG($1, $2, PHON_B, CHF_CONSONANT); + $$ = $1; + } + | LETTER_N LETTER_T + { + DIPHTHONG($1, $2, PHON_D, CHF_CONSONANT); + $$ = $1; + } + | LETTER_G LETTER_G + { + DIPHTHONG($1, $2, PHON_G, CHF_CONSONANT); + $$ = $1; + } + | LETTER_G LETTER_K + { + DIPHTHONG($1, $2, PHON_G, CHF_CONSONANT); + $$ = $1; + } + | LETTER_G LETTER_X + { + DIPHTHONG($1, $2, PHON_G, CHF_CONSONANT); + $$ = $1; + } + | LETTER_S LETTER_M + { + DIPHTHONG($1, $2, PHON_ZM, CHF_CONSONANT); + $$ = $1; + } + | LETTER_T LETTER_S + { + DIPHTHONG($1, $2, PHON_TS, CHF_CONSONANT); + $$ = $1; + } + | LETTER_T LETTER_Z + { + DIPHTHONG($1, $2, PHON_DZ, CHF_CONSONANT); + $$ = $1; + } + | LETTER_G LETTER_KS + { + DIPHTHONG($1, $2, PHON_NGZ, CHF_CONSONANT); + $$ = $1; + } + | LETTER_A LETTER_Y + { + DIPHTHONG($1, $2, PHON_AV, 0); + $$ = $1; + } + | LETTER_A LETTER_Y_ACC + { + DIPHTHONG($1, $2, PHON_AV, 0); + $$ = $1; + } + | LETTER_E LETTER_Y + { + DIPHTHONG($1, $2, PHON_EV, 0); + $$ = $1; + } + | LETTER_E LETTER_Y_ACC + { + DIPHTHONG($1, $2, PHON_EV, 0); + $$ = $1; + } + ; + +%% + +static unsigned *input_base; +static size_t input_len; +static size_t input_pos; + +#define ISALPHA(ci) ((ci) && ci->letter) + +#define PHONEME_FLAG_MASK \ + (CHF_ACCENT_MASK|CHF_VOWEL|CHF_CONSONANT) + +int +yylex() +{ + unsigned c; + struct char_info_st const *ci; + + do { + if (input_pos == input_len) + return 0; + c = input_base[input_pos++]; + ci = elchr_info(c); + } while (!ISALPHA(ci)); + + yylval.phoneme.code = ci->phoneme; + yylval.phoneme.start = input_pos - 1; + yylval.phoneme.count = 1; + yylval.phoneme.flags = ci->flags & 
PHONEME_FLAG_MASK;
+	return ci->letter;
+}
+
+int
+yyerror(const char *s)
+{
+	fprintf(stderr, "\n%s:%d: INTERNAL ERROR: %s\n", __FILE__, __LINE__, s);
+	abort();
+}
+
+int
+phoneme_map(struct phoneme **pph, size_t *plen, unsigned *word, size_t len)
+{
+	int rc;
+
+	input_base = word;
+	input_len = len;
+	input_pos = 0;
+	phoneme_base = NULL;
+	phoneme_max = 0;
+	phoneme_count = 0;
+	error_state = 0;
+	rc = yyparse();
+	if (rc) {
+		free(phoneme_base);
+		errno = EINVAL;
+		return errno;
+	}
+	if (error_state) {
+		free(phoneme_base);
+		errno = error_state;
+		return errno;
+	}
+	if (phoneme_count < phoneme_max) { /* trim; a failed shrink keeps the old block */
+		void *np = realloc(phoneme_base, phoneme_count * sizeof(phoneme_base[0]));
+		if (np) phoneme_base = np;
+	}
+	*pph = phoneme_base;
+	*plen = phoneme_count;
+	return 0;
+}
diff --git a/src/ellinika/syllabificator.c b/src/ellinika/syllabificator.c
new file mode 100644
index 0000000..c4105ec
--- /dev/null
+++ b/src/ellinika/syllabificator.c
@@ -0,0 +1,152 @@
+/* This file is part of Ellinika project.
+   Copyright (C) 2011 Sergey Poznyakoff
+
+   Ellinika is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   Ellinika is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#include <stdlib.h>
+#include <errno.h>
+#include "utf8.h"
+#include "elmorph.h"
+
+struct syllabificator {
+	struct syllable *syl;   /* growing array of syllables found so far */
+	size_t syl_count;       /* entries of syl in use */
+	size_t syl_max;         /* allocated capacity of syl */
+	struct phoneme *phon;   /* input phoneme array */
+	size_t phon_cur;        /* index of the next phoneme to consume */
+	size_t phon_max;        /* number of phonemes in phon */
+	int err;                /* non-zero error code once allocation fails */
+};
+
+#define SYL_FLAG_MASK (CHF_ACCENT_MASK)
+
+#define ISIOTA(ph)						\
+	((ph).code == PHON_I && (ph).count == 1 &&		\
+	 !((ph).flags & (CHF_ACCENT_MASK|CHF_TREMA)))
+
+int
+next_syllable(struct syllabificator *sp)
+{
+	struct syllable *syl;
+
+	if (sp->phon_cur == sp->phon_max)
+		return 1;
+
+	if (sp->syl_count == sp->syl_max) {
+		struct syllable *newsyl;
+		size_t newmax = sp->syl_max + 16;
+
+		newsyl = realloc(sp->syl, sizeof(newsyl[0]) * newmax);
+		if (!newsyl) {
+			sp->err = ENOMEM; /* realloc need not set errno */
+			return 1;
+		}
+
+		sp->syl = newsyl;
+		sp->syl_max = newmax;
+	}
+	syl = sp->syl + sp->syl_count++;
+	syl->char_start = sp->phon[sp->phon_cur].start;
+	syl->char_count = sp->phon[sp->phon_cur].count;
+	syl->phoneme_start = sp->phon_cur;
+	syl->phoneme_count = 1;
+	syl->flags = sp->phon[sp->phon_cur].flags;
+
+	sp->phon_cur++;
+
+	/* A diphthong forms a single syllable. */
+	if ((syl->flags & CHF_DIPHTHONG) && !(syl->flags & CHF_CONSONANT))
+		return 0;
+
+	/* If the syllable begins with a consonant, it includes all
*/ + if (syl->flags & CHF_CONSONANT) { + for (; sp->phon_cur < sp->phon_max && + (sp->phon[sp->phon_cur].flags & CHF_CONSONANT); + sp->phon_cur++) { + syl->char_count += sp->phon[sp->phon_cur].count; + syl->phoneme_count++; + } + } else if ((sp->phon[sp->phon_cur].flags & CHF_VOWEL) && + !ISIOTA(sp->phon[sp->phon_cur-1])) + /* V-V boundary */ + return 0; + + if (sp->phon_cur == sp->phon_max) + return 0; + + if (ISIOTA(sp->phon[sp->phon_cur])) { + /* incorporate iota */; + syl->char_count += sp->phon[sp->phon_cur].count; + syl->phoneme_count++; + sp->phon_cur++; + } + + if (sp->phon[sp->phon_cur].flags & CHF_VOWEL) + syl->flags |= sp->phon[sp->phon_cur].flags & CHF_ACCENT_MASK; + + syl->char_count += sp->phon[sp->phon_cur].count; + syl->phoneme_count++; + sp->phon_cur++; + + if (sp->phon_cur == sp->phon_max) + return 0; + + if (sp->phon[sp->phon_cur - 1].flags & CHF_VOWEL) { + /* If next phoneme is a consonant, incorporate it into the + current syllable */ + if ((sp->phon[sp->phon_cur].flags & CHF_CONSONANT) && + (sp->phon_cur + 1 == sp->phon_max || + (sp->phon[sp->phon_cur + 1].flags & CHF_CONSONANT))) { + syl->char_count += sp->phon[sp->phon_cur].count; + syl->phoneme_count++; + sp->phon_cur++; + } + } + + return 0; +} + + +int +syllable_map(struct syllable **psyl, size_t *plen, + struct phoneme *phon, size_t nphon) +{ + struct syllabificator sd; + + + sd.syl = NULL; + sd.syl_count = 0; + sd.syl_max = 0; + sd.phon = phon; + sd.phon_cur = 0; + sd.phon_max = nphon; + sd.err = 0; + + while (next_syllable(&sd) == 0) + sd.syl[sd.syl_count-1].flags &= SYL_FLAG_MASK; + + if (sd.err) { + free(sd.syl); + return sd.err; + } + + *psyl = sd.syl; + *plen = sd.syl_count; + + return 0; +} diff --git a/src/ellinika/tenses.scm b/src/ellinika/tenses.scm new file mode 100644 index 0000000..f830870 --- /dev/null +++ b/src/ellinika/tenses.scm @@ -0,0 +1,38 @@ +;;;; This file is part of Ellinika +;;;; Copyright (C) 2011 Sergey Poznyakoff +;;;; +;;;; Ellinika is free software; you can 
redistribute it and/or modify +;;;; it under the terms of the GNU General Public License as published by +;;;; the Free Software Foundation; either version 3 of the License, or +;;;; (at your option) any later version. +;;;; +;;;; Ellinika is distributed in the hope that it will be useful, +;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;;;; GNU General Public License for more details. +;;;; +;;;; You should have received a copy of the GNU General Public License +;;;; along with this program. If not, see . +;;;; +(define-module (ellinika tenses)) + +(define-public ellinika-tense-list + (list + (cons "ind" + (list "Ενεστώτας" + "Παρατατικός" + "Μέλλοντας διαρκείας" + "Αόριστος" + "Παρακείμενος" + "Υπερσυντέλικος" + "Συντελεσμένος μέλλοντας" + "Μέλλοντας στιγμιαίος")) + (cons "sub" + (list "Ενεστώτας" + "Αόριστος" + "Παρακείμενος")) + (cons "imp" + (list "Ενεστώτας" + "Αόριστος" + "Παρακείμενος")))) + \ No newline at end of file diff --git a/src/ellinika/xlat.scm b/src/ellinika/xlat.scm index c51edaa..63af468 100644 --- a/src/ellinika/xlat.scm +++ b/src/ellinika/xlat.scm @@ -16,6 +16,8 @@ ;;;; (define-module (ellinika xlat)) +(use-modules (ellinika elmorph)) + (define greek-postfix-map (list (cons #\: (list (cons "ι" "ϊ") (cons "υ" "ϋ") @@ -107,7 +109,7 @@ ;;; +-----------------------------+ ;;; ;;; -;;; The followin escape sequences are recognized: +;;; The following escape sequences are recognized: ;;; ;;; '\ks' -> 'ξ' ;;; '\ps' -> 'ψ' @@ -195,115 +197,8 @@ -(define transcription-list - (list - (cons "μπ" "b" ) - (cons "γγ" "g" ) - (cons "γκ" "g" ) - (cons "γχ" "g" ) - (cons "ντ" "d" ) - (cons "αι" "e" ) - (cons "αί" "e" ) - (cons "αυ" "au") - (cons "αύ" "au") - (cons "ου" "ou") - (cons "ού" "ou") - (cons "ευ" "eu") - (cons "εύ" "eu") - (cons "οι" "i" ) - (cons "ει" "i" ) - (cons "εί" "i" ) - (cons "υι" "i" ) - - (cons "α" "a" ) - (cons "Α" "a" ) - (cons "Ά" "a" ) - (cons "ά" 
"a" ) - (cons "β" "b" ) - (cons "Β" "b" ) - (cons "γ" "g" ) - (cons "Γ" "g" ) - (cons "δ" "d" ) - (cons "Δ" "d" ) - (cons "ε" "e" ) - (cons "Ε" "e" ) - (cons "Έ" "e" ) - (cons "έ" "e" ) - (cons "ζ" "z" ) - (cons "Ζ" "z" ) - (cons "η" "i" ) - (cons "Η" "i" ) - (cons "Ή" "i" ) - (cons "ή" "i" ) - (cons "θ" "t" ) - (cons "Θ" "t" ) - (cons "ι" "i" ) - (cons "Ι" "i" ) - (cons "Ί" "i" ) - (cons "ί" "i" ) - (cons "κ" "k" ) - (cons "Κ" "k" ) - (cons "λ" "l" ) - (cons "Λ" "l" ) - (cons "μ" "m" ) - (cons "Μ" "m" ) - (cons "ν" "n" ) - (cons "Ν" "n" ) - (cons "ξ" "x" ) - (cons "Ξ" "x" ) - (cons "ο" "o" ) - (cons "Ο" "o" ) - (cons "Ό" "o" ) - (cons "ό" "o" ) - (cons "π" "p" ) - (cons "Π" "p" ) - (cons "ρ" "r" ) - (cons "Ρ" "r" ) - (cons "σ" "s" ) - (cons "Σ" "s" ) - (cons "ς" "s" ) - (cons "τ" "t" ) - (cons "Τ" "t" ) - (cons "υ" "i" ) - (cons "Υ" "i" ) - (cons "Ύ" "i" ) - (cons "ύ" "i" ) - (cons "φ" "f" ) - (cons "Φ" "f" ) - (cons "χ" "h" ) - (cons "Χ" "h" ) - (cons "ψ" "P" ) - (cons "Ψ" "P" ) - (cons "ω" "o" ) - (cons "Ω" "o" ) - (cons "Ώ" "o" ) - (cons "ώ" "o" ) - (cons "Ϊ" "i" ) - (cons "ΐ" "i" ) - (cons "Ϋ" "i" ) - (cons "ΰ" "i" ))) - (define-public (ellinika:sounds-like str) - (let ((len (string-length str))) - (do ((i 0) - (sl '())) - ((= i len) (apply string-append (reverse sl))) - (set! sl (cons - (cond - ((and (<= (+ i 4) len) - (assoc (substring str i (+ i 4)) transcription-list)) => - (lambda (x) - (set! i (+ i 4)) - (cdr x))) - ((and (<= (+ i 2) len) - (assoc (substring str i (+ i 2)) transcription-list)) => - (lambda (x) - (set! i (+ i 2)) - (cdr x))) - (else - (set! i (1+ i)) - (substring str (- i 1) i))) - sl))))) + (elstr->soundslike str)) ;;;; End of ellinika.scm diff --git a/src/ellinika/yyrename b/src/ellinika/yyrename new file mode 100755 index 0000000..996abf2 --- /dev/null +++ b/src/ellinika/yyrename @@ -0,0 +1,97 @@ +#! /bin/sh +# Rename yy.* identifiers to avoid name clashes. This file is part of Grecs. 
+# Copyright (C) 2011 Sergey Poznyakoff +# +# Grecs is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 3, or (at your option) +# any later version. +# +# Grecs is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Grecs. If not, see . + +# Usage: yyrename [-f "OUTFILE [OUTFILE...]"] COMMAND INFILE +# Makefile.am: +# LEXCOMPILE = yyrename -f $(LEX_OUTPUT_ROOT).c \ +# '$(LEX) $(LFLAGS) $(AM_LFLAGS)' +# YACCCOMPILE = yyrename '$(YACC) $(YFLAGS) $(AM_YFLAGS)' +# +# This script runs COMMAND with INFILE as its argument and scans +# OUTFILEs for identifiers starting with 'yy'. It then renames these +# identifiers by replacing 'yy' with the selected prefix. +# +# The prefix is looked up in the file yytrans, located in the INFILE's +# directory. If this file does not exist, the prefix is constructed +# by concatenating the string 'grecs_' and the ``root name''. The root +# name is built by removing '-lex.l' or '-gram.y' from the base name. +# If the latter does not end in any of these, the root name +# is constructed by removing the suffix from the base name. +# +# The yytrans file is a line-oriented lookup table. Empty lines are +# ignored, usual UNIX comment lines are honored. The remaining lines +# shall consist of two words separated by any amount of whitespace. +# The first word is a look-up key, the second one provides a translation +# (yy replacement) for that key. +# +# Two look-ups are tried: first the base name and then the root name. +# If both result in a non-empty replacement, the former is preferred +# over the latter. 
+# +# The -f option supplies a list of output file names generated by COMMAND. +# If not supplied, the following defaults are used: y.tab.c and y.tab.h, if +# INFILE ends in '.y', and yy.lex.c, if it ends in '.l'. If INFILE does not +# end in any of these suffixes, error is reported. +# +# BUGS: Any occurrence of 'yy' is replaced, not only 'yy' prefixes. +# +case $1 in +-f) files=$2 + shift + shift +esac + +if test $# -ne 2; then + echo >&2 "usage: yyrename [-f "OUTFILE [OUTFILE...]"] COMMAND INFILE" + exit 1 +fi + +base=`expr "$2" : '.*/\(.*\)\.[ly]'` +dir=`dirname "$2"` +case $2 in +*.y) test -z "$files" && files="y.tab.c y.tab.h" + root=`expr "$2" : '.*/\(.*\)-gram\.y'`;; +*.l) test -z "$files" && files=lex.yy.c + root=`expr "$2" : '.*/\(.*\)-lex\.l'`;; +*) if test -z "$files"; then + echo >&2 "$0: suffix unknown, files must be given (use -f)" + exit 1 + fi + root=$base +esac + +if test -f $dir/yytrans; then + pfx=`awk ' +{ sub(/#.*$/,"") } +NF == 2 && $1=="'$base'" { exact=$2 } +NF == 2 && $1=="'$root'" { root=$2 } +{ next } +END { print exact ? exact : root ? root : "" }' $dir/yytrans` +else + pfx= +fi +if test -z "$pfx"; then + pfx=ellinika_`echo $root | tr .- __` +fi + +eval $* || exit $? + +for file in $files +do + mv $file ${file}.tmp + sed "/^#line/b;/^# *[0-9]/b;s/yy/$pfx/g;s/YY/$pfx/g" ${file}.tmp > $file +done -- cgit v1.2.1