aboutsummaryrefslogtreecommitdiff
path: root/src/ellinika
diff options
context:
space:
mode:
author Sergey Poznyakoff <gray@gnu.org.ua> 2011-06-04 10:27:59 +0000
committer Sergey Poznyakoff <gray@gnu.org.ua> 2011-06-04 10:27:59 +0000
commit ceb837f01112d2cfde96ba9e6ddc9c9ccbd0d0a4 (patch)
tree 316933c27051392c5cd48b873ae0697cd389d52a /src/ellinika
parent 99076de629a6f5f2b654118cde3612f9ba05edf0 (diff)
download ellinika-ceb837f01112d2cfde96ba9e6ddc9c9ccbd0d0a4.tar.gz
ellinika-ceb837f01112d2cfde96ba9e6ddc9c9ccbd0d0a4.tar.bz2
Implement new morphological functions. Move elmorph to scm/ellinika
git-svn-id: file:///home/puszcza/svnroot/ellinika/trunk@554 941c8c0f-9102-463b-b60b-cd22ce0e6858
Diffstat (limited to 'src/ellinika')
-rw-r--r-- src/ellinika/Makefile.am 44
-rw-r--r-- src/ellinika/aorist.c 73
-rw-r--r-- src/ellinika/elchr.c 701
-rw-r--r-- src/ellinika/elmorph.c 655
-rw-r--r-- src/ellinika/elmorph.h 46
-rw-r--r-- src/ellinika/elmorph.scm 45
-rw-r--r-- src/ellinika/utf8.c 2149
-rw-r--r-- src/ellinika/utf8.h 71
8 files changed, 3742 insertions, 2 deletions
diff --git a/src/ellinika/Makefile.am b/src/ellinika/Makefile.am
index 136b44f..274eea8 100644
--- a/src/ellinika/Makefile.am
+++ b/src/ellinika/Makefile.am
@@ -15,7 +15,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
guiledir=$(GUILE_SITE)/$(PACKAGE)
-guile_DATA=xlat.scm cgi.scm i18n.scm config.scm dico.scm
+guile_DATA=xlat.scm cgi.scm i18n.scm config.scm dico.scm elmorph.scm
cgi.m4: Makefile
echo 'divert(-1)' > $@
@@ -31,13 +31,53 @@ cgi.m4: Makefile
echo 'define([SYSCONFDIR],$(sysconfdir))' >> $@
echo 'define([LOCALEDIR],$(datadir)/locale)' >> $@
echo 'define([HTMLDIR],$(HTMLDIR))' >> $@
+ echo 'define([VERSION],$(VERSION))' >> $@
+ echo 'define([LIBDIR],$(pkglibdir))' >> $@
echo 'divert(0)dnl' >> $@
echo '@AUTOGENERATED@' >> $@
-SUFFIXES = .scm4 .scm
+SUFFIXES = .scm4 .scm .x
.scm4.scm:
m4 cgi.m4 $< > $@
cgi.scm: cgi.scm4 cgi.m4
config.scm: config.scm4 cgi.m4
+elmorph.scm: elmorph.scm4 cgi.m4
+
+# Libtool library built from the C sources listed below.
+pkglib_LTLIBRARIES=libelmorph.la
+
+libelmorph_la_SOURCES = \
+ aorist.c\
+ utf8.c\
+ elchr.c\
+ elmorph.c\
+ elmorph.h
+
+# Files generated by guile-snarf (see the .c.x rule below).
+DOT_X_FILES = elmorph.x
+
+# The .x files are listed as built sources and removed by distclean.
+BUILT_SOURCES = $(DOT_X_FILES)
+
+DISTCLEANFILES = $(DOT_X_FILES)
+
+# Preprocessor/compiler options handed to guile-snarf.
+snarfcppopts = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+
+# Produce FILE.x from FILE.c by running guile-snarf on it.
+.c.x:
+ AWK=$(AWK) \
+ guile-snarf -o $@ $< $(snarfcppopts)
+
+# Library base names processed by install-data-hook.
+pkglibnames=elmorph
+
+# For every library named in $(pkglibnames) that has an installed libNAME.la
+# in $(DESTDIR)$(pkglibdir), create the symlink
+# libguile-NAME-v-$(VERSION).so pointing at the shared object named by the
+# "dlname" entry of the .la file.  If the .la file has no dlname, fall back
+# to libNAME.so; the fallback is double-quoted so that the shell expands
+# $name (single quotes would yield the literal text "lib$name.so").
+# The hook aborts if the installation directory cannot be entered, so links
+# are never created in the build directory by mistake.
+install-data-hook:
+ here=`pwd`; \
+ cd $(DESTDIR)$(pkglibdir) || exit 1;\
+ for name in $(pkglibnames); do \
+ if test -f lib$$name.la; then \
+ dlname=`sed -n 's/dlname='\''\(.*\)'\''/\1/p' lib$$name.la`; \
+ test -z "$$dlname" && dlname="lib$$name.so"; \
+ $(LN_S) -f "$$dlname" libguile-$$name-v-$(VERSION).so; \
+ fi; \
+ done; \
+ cd $$here
+
+
diff --git a/src/ellinika/aorist.c b/src/ellinika/aorist.c
new file mode 100644
index 0000000..995fce8
--- /dev/null
+++ b/src/ellinika/aorist.c
@@ -0,0 +1,73 @@
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libguile.h>
+#include "utf8.h"
+#include "elmorph.h"
+
+/*
+ * Build a new stem from WORD (LEN code points) by rewriting its final
+ * consonant (or consonant cluster) according to the table below; judging
+ * by the function name this is the aorist stem -- TODO confirm.
+ *
+ *   ζ, θ          -> σ
+ *   γ, χ, κ       -> ξ   (σκ -> ξ, χν -> ξ: the cluster collapses)
+ *   ν             -> σ
+ *   β, π, φ       -> ψ
+ *   αυ/εύ + υ     -> the υ becomes ψ
+ *   anything else -> σ is appended
+ *
+ * On success stores a newly allocated array (allocated with calloc; the
+ * caller owns it) in *THEMA, its length in *TLEN, and returns 0.
+ * Returns -1 if WORD is NULL or empty (errno is set to EINVAL) or if
+ * memory cannot be allocated.  WORD itself is never modified.
+ */
+int
+elmorph_thema_aoristoy(unsigned *word, size_t len,
+                       unsigned **thema, size_t *tlen)
+{
+    unsigned ch, *pw;
+
+    /* word[len-1] below would read before the array for an empty word */
+    if (!word || len == 0) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    switch (word[len-1]) {
+    case 0x03B6: /* ζ */
+        /* FIXME: This can produce ξ as well: αλλάζω => άλλαξα */
+    case 0x03B8: /* θ */
+        ch = 0x03C3; /* σ */
+        break;
+
+    case 0x03B3: /* γ */
+    case 0x03C7: /* χ */
+        ch = 0x03BE; /* ξ */
+        break;
+
+    case 0x03BA: /* κ */
+        if (len > 1 && word[len-2] == 0x03C3 /* σκ */)
+            len--;
+        ch = 0x03BE; /* ξ */
+        break;
+
+    case 0x03BD: /* ν */
+        if (len > 1 && word[len-2] == 0x03C7 /* χν */) {
+            len--;
+            ch = 0x03BE; /* ξ */
+        } else
+            ch = 0x03C3; /* σ */
+        break;
+
+    case 0x03B2: /* β */
+    case 0x03C0: /* π */
+    case 0x03C6: /* φ */
+        ch = 0x03C8; /* ψ */
+        break;
+
+    case 0x03CD: /* ύ */
+    case 0x03C5: /* υ FIXME: This assumes the word has been deaccentized */
+        if (len > 1 && (word[len-2] == 0x03B1 /* αύ */ ||
+                        word[len-2] == 0x03B5 /* εύ */)) {
+            ch = 0x03C8; /* ψ */
+            break;
+        }
+        /* fall through: a υ not preceded by α/ε is handled like any
+           other final character */
+
+    default:
+        len++;
+        ch = 0x03C3; /* σ */
+    }
+
+    /* Here len >= 1: it is the length of the result, whose last element
+       is CH and whose first len-1 elements are copied from WORD. */
+    pw = calloc(len, sizeof(pw[0]));
+    if (!pw)
+        return -1;
+    memcpy(pw, word, sizeof(word[0]) * (len - 1));
+    pw[len-1] = ch;
+
+    *thema = pw;
+    *tlen = len;
+    return 0;
+}
+
+
+
diff --git a/src/ellinika/elchr.c b/src/ellinika/elchr.c
new file mode 100644
index 0000000..9b4e7ad
--- /dev/null
+++ b/src/ellinika/elchr.c
@@ -0,0 +1,701 @@
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#include <errno.h>
+#include <stdlib.h>
+#include <libguile.h>
+#include "utf8.h"
+#include "elmorph.h"
+
+/* Per-code-point classification record; the el_basic_ctype and
+   el_extended_ctype tables are arrays of these, filled in with positional
+   initializers, so the field order must not change. */
+struct char_info_st {
+ unsigned ch; /* Code point described by this entry */
+ int flags; /* Bitwise OR of CHF_* flags (not defined in this file;
+ presumably they come from elmorph.h) */
+ unsigned base; /* for vowels - a corresponding vowel with all diacritics
+ removed */
+ unsigned trans; /* a counter-case equivalent, i.e. a corresponding uppercase
+ letter if flags & CHF_LOWER and a corresponding lowercase
+ letter if flags & CHF_UPPER; 0 if there is none */
+ unsigned numval; /* Numeric value (used when flags & CHF_NUMERIC) */
+ unsigned accented[3]; /* For vowels - corresponding accented variants;
+ elchr_accent(ch, acc) reads accented[acc-1] */
+ unsigned deaccent; /* For accented vowels with diaeresis - corresponding
+ non-accented character */
+};
+
+/* Classification table for U+0300..U+03FF (combining diacritical marks
+   followed by the Greek and Coptic block), indexed by (code point - 0x0300).
+   Initializers are positional: ch, flags, base, trans, numval,
+   accented[0..2], deaccent; omitted trailing fields are zero.
+   See http://www.unicode.org/charts/PDF/Unicode-5.1/U51-0370.pdf
+   NOTE(review): the CHF_DIPH1/CHF_DIPH2 flags are not assigned the same way
+   to corresponding upper- and lowercase vowels (compare Η/η, Ι/ι, Υ/υ) and
+   ο/Ο carry neither; left untouched here -- confirm the intended rules. */
+struct char_info_st el_basic_ctype[] = {
+ { 0x0300, },
+ { 0x0301, },
+ { 0x0302, },
+ { 0x0303, },
+ { 0x0304, },
+ { 0x0305, },
+ { 0x0306, },
+ { 0x0307, },
+ { 0x0308, },
+ { 0x0309, },
+ { 0x030A, },
+ { 0x030B, },
+ { 0x030C, },
+ { 0x030D, },
+ { 0x030E, },
+ { 0x030F, },
+ { 0x0310, },
+ { 0x0311, },
+ { 0x0312, },
+ { 0x0313, },
+ { 0x0314, },
+ { 0x0315, },
+ { 0x0316, },
+ { 0x0317, },
+ { 0x0318, },
+ { 0x0319, },
+ { 0x031A, },
+ { 0x031B, },
+ { 0x031C, },
+ { 0x031D, },
+ { 0x031E, },
+ { 0x031F, },
+ { 0x0320, },
+ { 0x0321, },
+ { 0x0322, },
+ { 0x0323, },
+ { 0x0324, },
+ { 0x0325, },
+ { 0x0326, },
+ { 0x0327, },
+ { 0x0328, },
+ { 0x0329, },
+ { 0x032A, },
+ { 0x032B, },
+ { 0x032C, },
+ { 0x032D, },
+ { 0x032E, },
+ { 0x032F, },
+ { 0x0330, },
+ { 0x0331, },
+ { 0x0332, },
+ { 0x0333, },
+ { 0x0334, },
+ { 0x0335, },
+ { 0x0336, },
+ { 0x0337, },
+ { 0x0338, },
+ { 0x0339, },
+ { 0x033A, },
+ { 0x033B, },
+ { 0x033C, },
+ { 0x033D, },
+ { 0x033E, },
+ { 0x033F, },
+ { 0x0340, },
+ { 0x0341, },
+ { 0x0342, },
+ { 0x0343, },
+ { 0x0344, },
+ { 0x0345, },
+ { 0x0346, },
+ { 0x0347, },
+ { 0x0348, },
+ { 0x0349, },
+ { 0x034A, },
+ { 0x034B, },
+ { 0x034C, },
+ { 0x034D, },
+ { 0x034E, },
+ { 0x034F, },
+ { 0x0350, },
+ { 0x0351, },
+ { 0x0352, },
+ { 0x0353, },
+ { 0x0354, },
+ { 0x0355, },
+ { 0x0356, },
+ { 0x0357, },
+ { 0x0358, },
+ { 0x0359, },
+ { 0x035A, },
+ { 0x035B, },
+ { 0x035C, },
+ { 0x035D, },
+ { 0x035E, },
+ { 0x035F, },
+ { 0x0360, },
+ { 0x0361, },
+ { 0x0362, },
+ { 0x0363, },
+ { 0x0364, },
+ { 0x0365, },
+ { 0x0366, },
+ { 0x0367, },
+ { 0x0368, },
+ { 0x0369, },
+ { 0x036A, },
+ { 0x036B, },
+ { 0x036C, },
+ { 0x036D, },
+ { 0x036E, },
+ { 0x036F, },
+ { 0x0370, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER, 0, 0x0371 }, /* CAPITAL HETA */
+ { 0x0371, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER, 0, 0x0370 }, /* SMALL HETA */
+ { 0x0372, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER, 0, 0x0373 }, /* CAPITAL SAMPI */
+ { 0x0373, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER, 0, 0x0372 }, /* SMALL SAMPI */
+ { 0x0374, CHF_MODIFIER|CHF_UPPER, 0, 0x0375 }, /* NUMERAL SIGN = dexia keraia */
+ { 0x0375, CHF_MODIFIER|CHF_LOWER, 0, 0x0374 }, /* aristeri keraia */
+ { 0x0376, CHF_ARCHAIC|CHF_SEMIVOWEL|CHF_UPPER, 0, 0x0377}, /* CAPITAL PAMPHYLIAN DIGAMMA */
+ { 0x0377, CHF_ARCHAIC|CHF_SEMIVOWEL|CHF_LOWER, 0, 0x0376}, /* SMALL PAMPHYLIAN DIGAMMA */
+ { 0x0378, },
+ { 0x0379, },
+ { 0x037A, CHF_ARCHAIC|CHF_MODIFIER }, /* YPOGEGRAMMENI */
+ { 0x037B, CHF_SYMBOL, 0, 0x03FD }, /* SMALL REVERSED LUNATE SIGMA */
+ { 0x037C, CHF_SYMBOL, 0, 0x03FE }, /* SMALL DOTTED LUNATE SIGMA */
+ { 0x037D, CHF_SYMBOL, 0, 0x03FF }, /* SMALL REVERSED DOTTED LUNATE SIGMA */
+ { 0x037E, CHF_PUNCT }, /* erotimatiko */
+ { 0x037F, },
+ { 0x0380, },
+ { 0x0381, },
+ { 0x0382, },
+ { 0x0383, },
+ { 0x0384, CHF_MODIFIER }, /* Oxeia */
+ { 0x0385, CHF_MODIFIER }, /* dialytika */
+ { 0x0386, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0391, 0x03AC }, /* Ά */
+ { 0x0387, CHF_PUNCT }, /* ano teleia */
+ { 0x0388, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0395, 0x03AD }, /* Έ */
+ { 0x0389, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x0397, 0x03AE }, /* Ή */
+ { 0x038A, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x0399, 0x03AF }, /* Ί */
+ { 0x038B, },
+ { 0x038C, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x039F, 0x03CC }, /* Ό */
+ { 0x038D, },
+ { 0x038E, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x03A5, 0x03CD }, /* Ύ */
+ { 0x038F, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x03A9, 0x03CE }, /* Ώ */
+ /* accented[] is braced explicitly so that 0x03CA lands in deaccent and
+ not, through brace elision, in accented[2] */
+ { 0x0390, CHF_VOWEL|CHF_LOWER|CHF_TREMA|CHF_OXEIA, 0x03B9, 0, 0, { 0, 0, 0 }, 0x03CA }, /* ΐ */
+ { 0x0391, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1, 0, 0x03B1, 1, 0x0386 }, /* Α */
+ { 0x0392, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B2, 2 }, /* Β */
+ { 0x0393, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B3, 3 }, /* Γ */
+ { 0x0394, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B4, 4 }, /* Δ */
+ { 0x0395, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1, 0, 0x03B5, 5, 0x0388 }, /* Ε */
+ { 0x0396, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B6, 7 }, /* Ζ */
+ { 0x0397, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B7, 8, 0x0389 }, /* Η */
+ { 0x0398, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B8, 9 }, /* Θ */
+ { 0x0399, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH2, 0, 0x03B9, 10, 0x038A }, /* Ι */
+ { 0x039A, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BA, 20 }, /* Κ */
+ { 0x039B, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BB, 30 }, /* Λ */
+ { 0x039C, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BC, 40 }, /* Μ */
+ { 0x039D, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BD, 50 }, /* Ν */
+ { 0x039E, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BE, 60 }, /* Ξ */
+ { 0x039F, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03BF, 70, 0x038C }, /* Ο */
+ { 0x03A0, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C0, 80 }, /* Π */
+ { 0x03A1, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C1, 100 }, /* Ρ */
+ { 0x03A2, },
+ { 0x03A3, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C3, 200 }, /* Σ */
+ { 0x03A4, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C4, 300 }, /* Τ */
+ { 0x03A5, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1|CHF_DIPH2, 0, 0x03C5, 400, 0x038E }, /* Υ */
+ { 0x03A6, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C6, 500 }, /* Φ */
+ { 0x03A7, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C7, 600 }, /* Χ */
+ { 0x03A8, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C8, 700 }, /* Ψ */
+ { 0x03A9, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03C9, 800, 0x038F }, /* Ω */
+ { 0x03AA, CHF_VOWEL|CHF_UPPER|CHF_TREMA|CHF_DIPH2, 0x0399, 0x03CA }, /* Ϊ */
+ { 0x03AB, CHF_VOWEL|CHF_UPPER|CHF_TREMA, 0x03A5, 0x03CB }, /* Ϋ */
+ { 0x03AC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B1, 0x0386 }, /* ά */
+ { 0x03AD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B5, 0x0388 }, /* έ (base is ε, not δ) */
+ { 0x03AE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03B7, 0x0389 }, /* ή */
+ { 0x03AF, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03B9, 0x038A }, /* ί */
+ { 0x03B0, CHF_VOWEL|CHF_OXEIA|CHF_TREMA, 0x03C5, 0, 0, { 0, 0, 0 }, 0x03CB }, /* ΰ */
+ { 0x03B1, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1, 0, 0x0391, 1, 0x03AC }, /* α */
+ { 0x03B2, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0392, 2 }, /* β */
+ { 0x03B3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0393, 3 }, /* γ */
+ { 0x03B4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0394, 4 }, /* δ */
+ { 0x03B5, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1, 0, 0x0395, 5, 0x03AD }, /* ε */
+ { 0x03B6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0396, 7 }, /* ζ */
+ { 0x03B7, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1|CHF_DIPH2, 0, 0x0397, 8, 0x03AE }, /* η */
+ { 0x03B8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0398, 9 }, /* θ */
+ { 0x03B9, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0399, 10, 0x03AF }, /* ι */
+ { 0x03BA, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039A, 20 }, /* κ */
+ { 0x03BB, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039B, 30 }, /* λ */
+ { 0x03BC, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039C, 40 }, /* μ */
+ { 0x03BD, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039D, 50 }, /* ν */
+ { 0x03BE, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039E, 60 }, /* ξ */
+
+ { 0x03BF, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x039F, 70, 0x03CC }, /* ο */
+ { 0x03C0, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A0, 80 }, /* π */
+ { 0x03C1, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A1, 100 }, /* ρ */
+ { 0x03C2, CHF_CONSONANT|CHF_LOWER, 0, 0x03A3 }, /* ς */
+ { 0x03C3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A3, 200 }, /* σ */
+ { 0x03C4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A4, 300 }, /* τ */
+ { 0x03C5, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH2, 0, 0x03A5, 400, 0x03CD }, /* υ */
+ { 0x03C6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A6, 500 }, /* φ */
+ { 0x03C7, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A7, 600 }, /* χ */
+ { 0x03C8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A8, 700 }, /* ψ */
+ { 0x03C9, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x03A9, 800, 0x03CE }, /* ω */
+ { 0x03CA, CHF_VOWEL|CHF_LOWER|CHF_TREMA|CHF_DIPH2, 0x03B9, 0x03AA, 0, 0x0390 }, /* ϊ */
+ { 0x03CB, CHF_VOWEL|CHF_LOWER|CHF_TREMA, 0x03C5, 0x03AB, 0, 0x03B0 }, /* ϋ */
+ { 0x03CC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03BF, 0x038C }, /* ό */
+ { 0x03CD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03C5, 0x038E }, /* ύ */
+ { 0x03CE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03C9, 0x038F }, /* ώ (base is ω) */
+ { 0x03CF, CHF_SYMBOL|CHF_UPPER, 0, 0x03D7 }, /* KAI (lowercase goes in trans, not base) */
+ { 0x03D0, CHF_CONSONANT|CHF_LOWER, 0, 0x0392 }, /* curled beta */
+ { 0x03D1, CHF_CONSONANT|CHF_LOWER, 0, 0x0398 }, /* script theta */
+ { 0x03D2, CHF_VOWEL|CHF_UPPER, }, /* capital ypsilon with hook */
+ { 0x03D3, CHF_VOWEL|CHF_OXEIA, 0x03D2 }, /* capital ypsilon with acute & hook */
+ { 0x03D4, CHF_VOWEL|CHF_TREMA, 0x03D2 }, /* capital ypsilon with diaeresis & hook */
+ { 0x03D5, CHF_CONSONANT|CHF_LOWER, 0, 0x03A6 }, /* phi */
+ { 0x03D6, CHF_CONSONANT|CHF_LOWER, 0, 0x03A0 }, /* pi */
+ { 0x03D7, CHF_SYMBOL|CHF_LOWER, 0, 0x03CF }, /* kai */
+ { 0x03D8, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER, 0, 0x03D9 }, /* QOPPA */
+ { 0x03D9, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER, 0, 0x03D8 }, /* qoppa */
+ { 0x03DA, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03DB, 6 }, /* STIGMA */
+ { 0x03DB, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03DA, 6 }, /* stigma */
+ { 0x03DC, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03DD, 6 }, /* DIGAMMA */
+ { 0x03DD, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03DC, 6 }, /* digamma */
+ { 0x03DE, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03DF, 90 }, /* KOPPA (numeral 90) */
+ { 0x03DF, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03DE, 90 }, /* koppa (numeral 90) */
+ { 0x03E0, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03E1, 900 }, /* SAMPI */
+ { 0x03E1, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03E0, 900 }, /* sampi */
+ { 0x03E2, },
+ { 0x03E3, },
+ { 0x03E4, },
+ { 0x03E5, },
+ { 0x03E6, },
+ { 0x03E7, },
+ { 0x03E8, },
+ { 0x03E9, },
+ { 0x03EA, },
+ { 0x03EB, },
+ { 0x03EC, },
+ { 0x03ED, },
+ { 0x03EE, },
+ { 0x03EF, },
+ { 0x03F0, CHF_CONSONANT|CHF_LOWER, 0, 0x039A }, /* kappa */
+ { 0x03F1, CHF_CONSONANT|CHF_LOWER, 0, 0x03A1 }, /* tailed rho */
+ { 0x03F2, CHF_CONSONANT, 0, 0x03F9 }, /* lunate sigma */
+ { 0x03F3, CHF_SEMIVOWEL|CHF_LOWER, }, /* yot */
+ { 0x03F4, CHF_CONSONANT|CHF_UPPER, 0, 0x03B8 }, /* THETA */
+ { 0x03F5, CHF_SYMBOL|CHF_LOWER, 0, 0x0395 }, /* lunate epsilon */
+ { 0x03F6, CHF_SYMBOL|CHF_LOWER, }, /* reversed lunate epsilon */
+ { 0x03F7, },
+ { 0x03F8, },
+ { 0x03F9, CHF_CONSONANT|CHF_UPPER, 0, 0x03F2 }, /* LUNATE SIGMA */
+ { 0x03FA, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER, 0, 0x03FB }, /* SAN */
+ { 0x03FB, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER, 0, 0x03FA }, /* san */
+ { 0x03FC, CHF_SYMBOL|CHF_CONSONANT|CHF_LOWER, }, /* rho with stroke */
+ { 0x03FD, CHF_SYMBOL|CHF_CONSONANT|CHF_UPPER, 0, 0x037B}, /* CAPITAL REV. LUNATE SIGMA
+ antisigma */
+ { 0x03FE, CHF_SYMBOL|CHF_CONSONANT|CHF_UPPER, 0, 0x037C }, /* CAPITAL DOTTED LUNATE SIGMA
+ sigma periestigmenon */
+ { 0x03FF, CHF_SYMBOL|CHF_CONSONANT|CHF_UPPER, 0, 0x037D }, /* antisigma periestigmenon */
+};
+
+/* Table for the range U+1F00..U+1FFF, indexed by (code point - 0x1F00);
+   see elchr_info().  Every entry is a placeholder: only the code point is
+   set and all other fields (including flags) are zero.
+   FIXME: Implement http://www.unicode.org/charts/PDF/U1F00.pdf */
+struct char_info_st el_extended_ctype[] = {
+ { 0x1F00, },
+ { 0x1F01, },
+ { 0x1F02, },
+ { 0x1F03, },
+ { 0x1F04, },
+ { 0x1F05, },
+ { 0x1F06, },
+ { 0x1F07, },
+ { 0x1F08, },
+ { 0x1F09, },
+ { 0x1F0A, },
+ { 0x1F0B, },
+ { 0x1F0C, },
+ { 0x1F0D, },
+ { 0x1F0E, },
+ { 0x1F0F, },
+ { 0x1F10, },
+ { 0x1F11, },
+ { 0x1F12, },
+ { 0x1F13, },
+ { 0x1F14, },
+ { 0x1F15, },
+ { 0x1F16, },
+ { 0x1F17, },
+ { 0x1F18, },
+ { 0x1F19, },
+ { 0x1F1A, },
+ { 0x1F1B, },
+ { 0x1F1C, },
+ { 0x1F1D, },
+ { 0x1F1E, },
+ { 0x1F1F, },
+ { 0x1F20, },
+ { 0x1F21, },
+ { 0x1F22, },
+ { 0x1F23, },
+ { 0x1F24, },
+ { 0x1F25, },
+ { 0x1F26, },
+ { 0x1F27, },
+ { 0x1F28, },
+ { 0x1F29, },
+ { 0x1F2A, },
+ { 0x1F2B, },
+ { 0x1F2C, },
+ { 0x1F2D, },
+ { 0x1F2E, },
+ { 0x1F2F, },
+ { 0x1F30, },
+ { 0x1F31, },
+ { 0x1F32, },
+ { 0x1F33, },
+ { 0x1F34, },
+ { 0x1F35, },
+ { 0x1F36, },
+ { 0x1F37, },
+ { 0x1F38, },
+ { 0x1F39, },
+ { 0x1F3A, },
+ { 0x1F3B, },
+ { 0x1F3C, },
+ { 0x1F3D, },
+ { 0x1F3E, },
+ { 0x1F3F, },
+ { 0x1F40, },
+ { 0x1F41, },
+ { 0x1F42, },
+ { 0x1F43, },
+ { 0x1F44, },
+ { 0x1F45, },
+ { 0x1F46, },
+ { 0x1F47, },
+ { 0x1F48, },
+ { 0x1F49, },
+ { 0x1F4A, },
+ { 0x1F4B, },
+ { 0x1F4C, },
+ { 0x1F4D, },
+ { 0x1F4E, },
+ { 0x1F4F, },
+ { 0x1F50, },
+ { 0x1F51, },
+ { 0x1F52, },
+ { 0x1F53, },
+ { 0x1F54, },
+ { 0x1F55, },
+ { 0x1F56, },
+ { 0x1F57, },
+ { 0x1F58, },
+ { 0x1F59, },
+ { 0x1F5A, },
+ { 0x1F5B, },
+ { 0x1F5C, },
+ { 0x1F5D, },
+ { 0x1F5E, },
+ { 0x1F5F, },
+ { 0x1F60, },
+ { 0x1F61, },
+ { 0x1F62, },
+ { 0x1F63, },
+ { 0x1F64, },
+ { 0x1F65, },
+ { 0x1F66, },
+ { 0x1F67, },
+ { 0x1F68, },
+ { 0x1F69, },
+ { 0x1F6A, },
+ { 0x1F6B, },
+ { 0x1F6C, },
+ { 0x1F6D, },
+ { 0x1F6E, },
+ { 0x1F6F, },
+ { 0x1F70, },
+ { 0x1F71, },
+ { 0x1F72, },
+ { 0x1F73, },
+ { 0x1F74, },
+ { 0x1F75, },
+ { 0x1F76, },
+ { 0x1F77, },
+ { 0x1F78, },
+ { 0x1F79, },
+ { 0x1F7A, },
+ { 0x1F7B, },
+ { 0x1F7C, },
+ { 0x1F7D, },
+ { 0x1F7E, },
+ { 0x1F7F, },
+ { 0x1F80, },
+ { 0x1F81, },
+ { 0x1F82, },
+ { 0x1F83, },
+ { 0x1F84, },
+ { 0x1F85, },
+ { 0x1F86, },
+ { 0x1F87, },
+ { 0x1F88, },
+ { 0x1F89, },
+ { 0x1F8A, },
+ { 0x1F8B, },
+ { 0x1F8C, },
+ { 0x1F8D, },
+ { 0x1F8E, },
+ { 0x1F8F, },
+ { 0x1F90, },
+ { 0x1F91, },
+ { 0x1F92, },
+ { 0x1F93, },
+ { 0x1F94, },
+ { 0x1F95, },
+ { 0x1F96, },
+ { 0x1F97, },
+ { 0x1F98, },
+ { 0x1F99, },
+ { 0x1F9A, },
+ { 0x1F9B, },
+ { 0x1F9C, },
+ { 0x1F9D, },
+ { 0x1F9E, },
+ { 0x1F9F, },
+ { 0x1FA0, },
+ { 0x1FA1, },
+ { 0x1FA2, },
+ { 0x1FA3, },
+ { 0x1FA4, },
+ { 0x1FA5, },
+ { 0x1FA6, },
+ { 0x1FA7, },
+ { 0x1FA8, },
+ { 0x1FA9, },
+ { 0x1FAA, },
+ { 0x1FAB, },
+ { 0x1FAC, },
+ { 0x1FAD, },
+ { 0x1FAE, },
+ { 0x1FAF, },
+ { 0x1FB0, },
+ { 0x1FB1, },
+ { 0x1FB2, },
+ { 0x1FB3, },
+ { 0x1FB4, },
+ { 0x1FB5, },
+ { 0x1FB6, },
+ { 0x1FB7, },
+ { 0x1FB8, },
+ { 0x1FB9, },
+ { 0x1FBA, },
+ { 0x1FBB, },
+ { 0x1FBC, },
+ { 0x1FBD, },
+ { 0x1FBE, },
+ { 0x1FBF, },
+ { 0x1FC0, },
+ { 0x1FC1, },
+ { 0x1FC2, },
+ { 0x1FC3, },
+ { 0x1FC4, },
+ { 0x1FC5, },
+ { 0x1FC6, },
+ { 0x1FC7, },
+ { 0x1FC8, },
+ { 0x1FC9, },
+ { 0x1FCA, },
+ { 0x1FCB, },
+ { 0x1FCC, },
+ { 0x1FCD, },
+ { 0x1FCE, },
+ { 0x1FCF, },
+ { 0x1FD0, },
+ { 0x1FD1, },
+ { 0x1FD2, },
+ { 0x1FD3, },
+ { 0x1FD4, },
+ { 0x1FD5, },
+ { 0x1FD6, },
+ { 0x1FD7, },
+ { 0x1FD8, },
+ { 0x1FD9, },
+ { 0x1FDA, },
+ { 0x1FDB, },
+ { 0x1FDC, },
+ { 0x1FDD, },
+ { 0x1FDE, },
+ { 0x1FDF, },
+ { 0x1FE0, },
+ { 0x1FE1, },
+ { 0x1FE2, },
+ { 0x1FE3, },
+ { 0x1FE4, },
+ { 0x1FE5, },
+ { 0x1FE6, },
+ { 0x1FE7, },
+ { 0x1FE8, },
+ { 0x1FE9, },
+ { 0x1FEA, },
+ { 0x1FEB, },
+ { 0x1FEC, },
+ { 0x1FED, },
+ { 0x1FEE, },
+ { 0x1FEF, },
+ { 0x1FF0, },
+ { 0x1FF1, },
+ { 0x1FF2, },
+ { 0x1FF3, },
+ { 0x1FF4, },
+ { 0x1FF5, },
+ { 0x1FF6, },
+ { 0x1FF7, },
+ { 0x1FF8, },
+ { 0x1FF9, },
+ { 0x1FFA, },
+ { 0x1FFB, },
+ { 0x1FFC, },
+ { 0x1FFD, },
+ { 0x1FFE, },
+ { 0x1FFF, }
+};
+
+/* Return the table entry describing code point CH, or NULL when CH lies
+   outside both covered ranges (U+0300..U+03FF and U+1F00..U+1FFF). */
+static struct char_info_st *
+elchr_info(unsigned ch)
+{
+    struct char_info_st *entry = NULL;
+
+    if (0x0300 <= ch && ch <= 0x03FF)
+        entry = &el_basic_ctype[ch - 0x0300];
+    else if (0x1F00 <= ch && ch <= 0x1FFF)
+        entry = &el_extended_ctype[ch - 0x1F00];
+    return entry;
+}
+
+/* Return the flag word recorded for CH, or 0 if CH has no table entry. */
+int
+elchr_flags(unsigned ch)
+{
+    struct char_info_st *info = elchr_info(ch);
+
+    if (info == NULL)
+        return 0;
+    return info->flags;
+}
+
+/* Nonzero (the CHF_UPPER bit) if CH is flagged as uppercase, else 0. */
+int
+elchr_isupper(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_UPPER;
+}
+
+/* Nonzero (the CHF_LOWER bit) if CH is flagged as lowercase, else 0. */
+int
+elchr_islower(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_LOWER;
+}
+
+/* Return the bits of CH's flags selected by CHF_ACCENT_MASK; 0 means that
+   none of those bits is set. */
+int
+elchr_getaccent(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_ACCENT_MASK;
+}
+
+/* Nonzero (the CHF_TREMA bit) if CH is flagged as carrying a diaeresis. */
+int
+elchr_istrema(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_TREMA;
+}
+
+
+/* Nonzero (the CHF_VOWEL bit) if CH is flagged as a vowel, else 0. */
+int
+elchr_isvowel(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_VOWEL;
+}
+
+/* Nonzero (the CHF_CONSONANT bit) if CH is flagged as a consonant, else 0. */
+int
+elchr_isconsonant(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_CONSONANT;
+}
+
+/* Nonzero (the CHF_SEMIVOWEL bit) if CH is flagged as a semivowel, else 0. */
+int
+elchr_issemivowel(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_SEMIVOWEL;
+}
+
+/* Nonzero (the CHF_PUNCT bit) if CH is flagged as punctuation, else 0. */
+int
+elchr_ispunct(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_PUNCT;
+}
+
+/* Nonzero (the CHF_SYMBOL bit) if CH is flagged as a symbol, else 0. */
+int
+elchr_issymbol(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_SYMBOL;
+}
+
+/* Nonzero (the CHF_MODIFIER bit) if CH is flagged as a modifier, else 0. */
+int
+elchr_ismodifier(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_MODIFIER;
+}
+
+/* Nonzero (the CHF_ARCHAIC bit) if CH is flagged as archaic, else 0. */
+int
+elchr_isarchaic(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_ARCHAIC;
+}
+
+/* Nonzero (the CHF_NUMERIC bit) if CH is flagged as having a numeric
+   value, else 0. */
+int
+elchr_isnumeric(unsigned ch)
+{
+    int flags = elchr_flags(ch);
+
+    return flags & CHF_NUMERIC;
+}
+
+/* Return the numeric value recorded for CH.  The result is 0 when CH has
+   no table entry or its entry is not flagged CHF_NUMERIC. */
+unsigned
+elchr_numeric_value(unsigned ch)
+{
+    struct char_info_st *info = elchr_info(ch);
+
+    if (info == NULL || !(info->flags & CHF_NUMERIC))
+        return 0;
+    return info->numval;
+}
+
+/* Return the uppercase counterpart of CH.
+   CH is returned unchanged when it has no table entry, is not flagged
+   CHF_LOWER, or is lowercase but has no counterpart recorded (trans == 0);
+   without the last check such characters would be mapped to code point 0. */
+unsigned
+elchr_toupper(unsigned ch)
+{
+    struct char_info_st *ci = elchr_info(ch);
+
+    if (ci && (ci->flags & CHF_LOWER) && ci->trans)
+        return ci->trans;
+    return ch;
+}
+
+/* Return the lowercase counterpart of CH.
+   CH is returned unchanged when it has no table entry, is not flagged
+   CHF_UPPER, or is uppercase but has no counterpart recorded (trans == 0);
+   without the last check such characters would be mapped to code point 0. */
+unsigned
+elchr_tolower(unsigned ch)
+{
+    struct char_info_st *ci = elchr_info(ch);
+
+    if (ci && (ci->flags & CHF_UPPER) && ci->trans)
+        return ci->trans;
+    return ch;
+}
+
+/* Return the base character recorded for CH.  CH itself is returned when
+   it has no table entry, none of its CHF_ACCENT_MASK bits is set, or no
+   base is recorded. */
+unsigned
+elchr_base(unsigned ch)
+{
+    struct char_info_st *info = elchr_info(ch);
+
+    if (info == NULL)
+        return ch;
+    if (!(info->flags & CHF_ACCENT_MASK))
+        return ch;
+    if (info->base == 0)
+        return ch;
+    return info->base;
+}
+
+/* Return CH with its accent removed.  For an entry that has any
+   CHF_ACCENT_MASK bit set the result is, in order of preference, the
+   recorded deaccent character, the recorded base character, or CH itself.
+   Any other CH is returned unchanged. */
+unsigned
+elchr_deaccent(unsigned ch)
+{
+    struct char_info_st *info = elchr_info(ch);
+
+    if (info == NULL || !(info->flags & CHF_ACCENT_MASK))
+        return ch;
+    if (info->deaccent)
+        return info->deaccent;
+    if (info->base)
+        return info->base;
+    return ch;
+}
+
+/* Return the accented variant number ACC of CH.
+   ACC is a 1-based index into the entry's accented[] array (3 slots).
+   CH is returned unchanged when it has no table entry, when ACC is outside
+   1..3 (previously this indexed outside the array), or when no variant is
+   recorded in the selected slot. */
+unsigned
+elchr_accent(unsigned ch, int acc)
+{
+    struct char_info_st *ci = elchr_info(ch);
+    int nslots;
+
+    if (!ci)
+        return ch;
+    nslots = (int) (sizeof(ci->accented) / sizeof(ci->accented[0]));
+    if (acc < 1 || acc > nslots)
+        return ch;
+    return ci->accented[acc-1] ? ci->accented[acc-1] : ch;
+}
+
+/* Advance a small diphthong-recognition state machine by character CH.
+   STATE is the value returned for the previous character (0 initially).
+   Returns the new state:
+     - 0 if CH has no table entry or is not flagged CHF_VOWEL;
+     - from state 0: 1 if CH has CHF_DIPH1, otherwise 0;
+     - from state 1: 2 if CH has CHF_DIPH2, otherwise still 1;
+     - from any other state: 0. */
+int
+elchr_diphthong(unsigned ch, int state)
+{
+    struct char_info_st *info = elchr_info(ch);
+
+    if (info == NULL || (info->flags & CHF_VOWEL) == 0)
+        return 0;
+
+    if (state == 0)
+        return (info->flags & CHF_DIPH1) ? 1 : 0;
+    if (state == 1)
+        return (info->flags & CHF_DIPH2) ? 2 : 1;
+    return 0;
+}
diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c
new file mode 100644
index 0000000..5234eda
--- /dev/null
+++ b/src/ellinika/elmorph.c
@@ -0,0 +1,655 @@
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#include <errno.h>
+#include <stdlib.h>
+#include <libguile.h>
+#include "utf8.h"
+#include "elmorph.h"
+
+/* Payload of the "Elstr" smob: a decoded word plus its syllable map. */
+struct elstr {
+ unsigned *str; /* Decoded string: one element per character, as
+ produced by utf8_mbstr_to_wc (not UTF-8 bytes) */
+ size_t len; /* Its length (number of elements in str) */
+ unsigned nsyl; /* Number of syllables. */
+ unsigned *sylmap; /* Syllable map (nsyl elements); sylmap[k] is the
+ index in str of the last character of syllable k */
+ unsigned acc_syl; /* Number of the accented syllable
+ (1-based, from the last syllable) */
+ unsigned acc_pos; /* Number of the accented character
+ (0-based, from str[0]) */
+};
+
+scm_t_bits _elstr_tag;
+
+/* Compute the syllable map and accent position of ELSTR from its str/len
+   fields and store them in sylmap, nsyl, acc_pos and acc_syl.
+
+   The map is allocated with calloc, because _elstr_free releases it with
+   free() and _elstr_dup also allocates it with calloc; scm_gc_malloc memory
+   must not be passed to free().  On allocation failure scm_memory_error is
+   signalled.
+
+   A string without vowels (or an empty one) yields nsyl == 0; in that case
+   no map entry is written (previously sylmap[nsyl-1] was written with
+   nsyl == 0, i.e. far outside the array). */
+static void
+_elstr_syllabize(struct elstr *elstr)
+{
+    unsigned *sylmap;
+    unsigned i, nsyl = 0, accsyl = 0, accchr = 0;
+    int dstate = 0;
+
+    /* Request at least one element: calloc(0, ...) may return NULL */
+    sylmap = calloc(elstr->len ? elstr->len : 1, sizeof(sylmap[0]));
+    if (!sylmap)
+        scm_memory_error("_elstr_syllabize");
+
+    for (i = 0; i < elstr->len; i++) {
+        int nstate;
+
+        if (elchr_getaccent(elstr->str[i])) {
+            /* remember the syllable index and position of the accent */
+            accsyl = nsyl;
+            accchr = i;
+        }
+        nstate = elchr_diphthong(elstr->str[i], dstate);
+        if (nstate)
+            /* inside a potential diphthong: decide on the next char */;
+        else if (dstate)
+            /* the vowel group ended at the previous character */
+            sylmap[nsyl++] = i - 1;
+        else if (elchr_isvowel(elstr->str[i]))
+            sylmap[nsyl++] = i;
+        dstate = nstate;
+    }
+    if (dstate)
+        sylmap[nsyl++] = i - 1;
+    else if (nsyl > 0)
+        /* extend the last syllable to the end of the word */
+        sylmap[nsyl-1] = i - 1;
+    elstr->sylmap = sylmap;
+    elstr->nsyl = nsyl;
+    elstr->acc_pos = accchr;
+    elstr->acc_syl = nsyl - accsyl;
+}
+
+/* Create a new Elstr smob from the multibyte string INSTR.
+   Returns SCM_EOL if utf8_mbstr_to_wc reports a conversion failure.
+
+   The struct is allocated with malloc, because _elstr_free releases it
+   with free(); memory obtained from scm_gc_malloc must not be passed to
+   free().  On allocation failure the converted string is released and
+   scm_memory_error is signalled. */
+static SCM
+_elstr_alloc(const char *instr)
+{
+    struct elstr *elstr;
+    unsigned *wptr;
+    size_t wlen;
+
+    if (utf8_mbstr_to_wc(instr, &wptr, &wlen))
+        return SCM_EOL;
+
+    elstr = malloc(sizeof(*elstr));
+    if (!elstr) {
+        /* wptr is released with free(), as _elstr_free does for str */
+        free(wptr);
+        scm_memory_error("_elstr_alloc");
+    }
+    elstr->str = wptr;
+    elstr->len = wlen;
+
+    _elstr_syllabize(elstr);
+
+    SCM_RETURN_NEWSMOB(_elstr_tag, elstr);
+}
+
+/* Return a new Elstr smob holding a deep copy of ELSTR.
+
+   All three allocations use malloc/calloc, matching the free() calls in
+   _elstr_free (scm_gc_malloc memory must not be passed to free()).
+   Zero-length arrays are allocated with one element, because calloc(0, ...)
+   may return NULL, which would otherwise be mistaken for memory
+   exhaustion.  On allocation failure everything allocated so far is
+   released and scm_memory_error is signalled. */
+static SCM
+_elstr_dup(struct elstr *elstr)
+{
+    struct elstr *elnew;
+
+    elnew = malloc(sizeof(*elnew));
+    if (!elnew)
+        scm_memory_error("_elstr_dup");
+    elnew->str = calloc(elstr->len ? elstr->len : 1,
+                        sizeof(elnew->str[0]));
+    if (!elnew->str) {
+        free(elnew);
+        scm_memory_error("_elstr_dup");
+    }
+    elnew->sylmap = calloc(elstr->nsyl ? elstr->nsyl : 1,
+                           sizeof(elnew->sylmap[0]));
+    if (!elnew->sylmap) {
+        free(elnew->str);
+        free(elnew);
+        scm_memory_error("_elstr_dup");
+    }
+    memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len);
+    elnew->len = elstr->len;
+    elnew->nsyl = elstr->nsyl;
+    memcpy(elnew->sylmap, elstr->sylmap,
+           sizeof(elstr->sylmap[0]) * elstr->nsyl);
+    elnew->acc_syl = elstr->acc_syl;
+    elnew->acc_pos = elstr->acc_pos;
+    SCM_RETURN_NEWSMOB(_elstr_tag, elnew);
+}
+
+/* Smob "free" callback: release the syllable map, the character array and
+   the struct itself with free().  Always returns 0. */
+static scm_sizet
+_elstr_free(SCM smob)
+{
+    struct elstr *victim = (struct elstr *) SCM_CDR(smob);
+
+    free(victim->sylmap);
+    free(victim->str);
+    free(victim);
+    return 0;
+}
+
+/* Smob "print" callback.  Writes the word to PORT as
+       #<elstr ``syl-[syl]-syl''>
+   with syllables separated by "-" and the accented syllable enclosed in
+   brackets.  Characters that utf8_wctomb cannot encode are skipped.
+   Always returns 1. */
+static int
+_elstr_print(SCM smob, SCM port, scm_print_state *pstate)
+{
+    struct elstr *elstr = (struct elstr *) SCM_CDR(smob);
+    size_t i;
+    int j = 0, an;
+
+    scm_puts("#<elstr ``", port);
+    /* 0-based index (from the start) of the accented syllable */
+    an = elstr->nsyl - elstr->acc_syl;
+    if (an == 0)
+        scm_puts("[", port);
+    for (i = 0; i < elstr->len; i++) {
+        /* One byte more than the original 6, so that r[n] = 0 stays in
+           bounds even if utf8_wctomb (defined elsewhere) stores 6 bytes --
+           presumably its maximum, TODO confirm */
+        char r[7];
+        int n;
+
+        /* Never read past the nsyl valid entries of the syllable map */
+        if ((unsigned) j < elstr->nsyl &&
+            i == (size_t) elstr->sylmap[j] + 1) {
+            if (j == an)
+                scm_puts("]", port);
+            scm_puts("-", port);
+            if (++j == an)
+                scm_puts("[", port);
+        }
+        n = utf8_wctomb(r, elstr->str[i]);
+        if (n == -1)
+            continue;
+        r[n] = 0;
+        scm_puts(r, port);
+    }
+    if (j == an)
+        scm_puts("]", port);
+    scm_puts("''>", port);
+    return 1;
+}
+
+/* Register the "Elstr" smob type, store its tag in _elstr_tag and install
+   the free and print callbacks for it. */
+static void
+_elstr_init()
+{
+    scm_t_bits tag = scm_make_smob_type("Elstr", sizeof(struct elstr));
+
+    _elstr_tag = tag;
+    scm_set_smob_free(tag, _elstr_free);
+    scm_set_smob_print(tag, _elstr_print);
+}
+
+/* Scheme procedure (string->elstr STRING).
+   Converts STRING to a C string with scm_to_locale_string and builds an
+   Elstr smob from it.  Signals a wrong-type error if STRING is not a
+   string and a misc-error if _elstr_alloc rejects the input.
+   The failure marker returned by _elstr_alloc (SCM_EOL) is tested with
+   scm_is_null: SCM is an opaque type and must not be compared with ==.
+   NOTE(review): the C string is in the locale encoding, while the decoder
+   is named utf8_mbstr_to_wc -- confirm this is intended for non-UTF-8
+   locales. */
+SCM_DEFINE_PUBLIC(scm_string__elstr, "string->elstr", 1, 0, 0,
+                  (SCM string),
+"Create new ELSTR from STRING\n")
+#define FUNC_NAME s_scm_string__elstr
+{
+    char *str;
+    SCM scm;
+
+    SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME);
+    str = scm_to_locale_string(string);
+    scm = _elstr_alloc(str);
+    free(str);
+    if (scm_is_null(scm))
+        scm_misc_error(FUNC_NAME,
+                       "Invalid input string: ~S",
+                       scm_list_1(string));
+    return scm;
+}
+#undef FUNC_NAME
+
+/* True if S is a non-immediate object whose cell type equals the Elstr
+   smob tag.  S is evaluated twice. */
+#define scm_is_elstr(s) (!SCM_IMP(s) && SCM_CELL_TYPE(s) == _elstr_tag)
+
+/* Scheme procedure (elstr->string EL).
+   Encodes the characters of EL with utf8_wc_to_mbstr and returns them as
+   a Scheme string built with scm_from_locale_string.  Signals a wrong-type
+   error if EL is not an Elstr and a misc-error if the encoding fails. */
+SCM_DEFINE_PUBLIC(scm_elstr__string, "elstr->string", 1, 0, 0,
+                  (SCM el),
+"Convert EL to a STRING\n")
+#define FUNC_NAME s_scm_elstr__string
+{
+    char *mbs;
+    SCM result;
+    struct elstr *src;
+
+    SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+    src = (struct elstr*) SCM_CDR(el);
+    if (utf8_wc_to_mbstr(src->str, src->len, &mbs) != 0)
+        scm_misc_error(FUNC_NAME,
+                       "cannot convert elstr to Scheme",
+                       SCM_EOL);
+    result = scm_from_locale_string(mbs);
+    /* the temporary multibyte buffer is no longer needed */
+    free(mbs);
+    return result;
+}
+#undef FUNC_NAME
+
+/* Scheme procedure (elstr-length EL).
+   Returns the number of characters stored in EL.  Signals a wrong-type
+   error if EL is not an Elstr.
+   The length is a size_t, so it is converted with scm_from_size_t;
+   scm_from_uint would truncate it where size_t is wider than unsigned. */
+SCM_DEFINE_PUBLIC(scm_elstr_length, "elstr-length", 1, 0, 0,
+                  (SCM el),
+"Returns the number of characters in EL\n")
+#define FUNC_NAME s_scm_elstr_length
+{
+    struct elstr *elstr;
+
+    SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+    elstr = (struct elstr*) SCM_CDR(el);
+    return scm_from_size_t(elstr->len);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_number_of_syllables, "elstr-number-of-syllables",
+ 1, 0, 0,
+ (SCM el),
+"Returns the number of characters in EL\n")
+#define FUNC_NAME s_scm_elstr_number_of_syllables
+{
+ struct elstr *elstr;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);