From ceb837f01112d2cfde96ba9e6ddc9c9ccbd0d0a4 Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Sat, 4 Jun 2011 10:27:59 +0000 Subject: Implement new morphological functions. Move elmorph to scm/ellinika git-svn-id: file:///home/puszcza/svnroot/ellinika/trunk@554 941c8c0f-9102-463b-b60b-cd22ce0e6858 --- src/ellinika/Makefile.am | 44 +- src/ellinika/aorist.c | 73 ++ src/ellinika/elchr.c | 701 +++++++++++++++ src/ellinika/elmorph.c | 655 ++++++++++++++ src/ellinika/elmorph.h | 46 + src/ellinika/elmorph.scm4 | 5 + src/ellinika/utf8.c | 2149 +++++++++++++++++++++++++++++++++++++++++++++ src/ellinika/utf8.h | 71 ++ 8 files changed, 3742 insertions(+), 2 deletions(-) create mode 100644 src/ellinika/aorist.c create mode 100644 src/ellinika/elchr.c create mode 100644 src/ellinika/elmorph.c create mode 100644 src/ellinika/elmorph.h create mode 100644 src/ellinika/elmorph.scm4 create mode 100644 src/ellinika/utf8.c create mode 100644 src/ellinika/utf8.h (limited to 'src') diff --git a/src/ellinika/Makefile.am b/src/ellinika/Makefile.am index 136b44f..274eea8 100644 --- a/src/ellinika/Makefile.am +++ b/src/ellinika/Makefile.am @@ -15,7 +15,7 @@ # along with this program. If not, see . 
/*
 * Build the aorist stem of a verb from its present stem.
 *
 * WORD points to LEN code points (the stem without the ending).  The
 * final consonant (or consonant cluster) is replaced by the sigmatic
 * counterpart: ζ,θ,ν -> σ; γ,χ,κ,σκ,χν -> ξ; β,π,φ and the υ of
 * αυ/ευ -> ψ.  Any other final character is kept and σ is appended.
 *
 * On success stores a newly allocated array in *THEMA, its length in
 * *TLEN, and returns 0; the caller owns the array and releases it with
 * free().  Returns -1 if WORD is NULL or empty, or if memory cannot be
 * allocated; *THEMA and *TLEN are left untouched in that case.
 */
int
elmorph_thema_aoristoy(unsigned *word, size_t len,
		       unsigned **thema, size_t *tlen)
{
	unsigned ch, *pw;

	/* word[len-1] below would index before the array for len == 0 */
	if (!word || len == 0)
		return -1;

	switch (word[len-1]) {
	case 0x03B6: /* ζ */
		/* FIXME: This can produce ξ as well: αλλάζω => άλλαξα */
	case 0x03B8: /* θ */
		ch = 0x03C3; /* σ */
		break;

	case 0x03B3: /* γ */
	case 0x03C7: /* χ */
		ch = 0x03BE; /* ξ */
		break;

	case 0x03BA: /* κ */
		if (len > 1 && word[len-2] == 0x03C3 /* σκ */)
			len--;
		ch = 0x03BE; /* ξ */
		break;

	case 0x03BD: /* ν */
		if (len > 1 && word[len-2] == 0x03C7 /* χν */) {
			len--;
			ch = 0x03BE; /* ξ */
		} else
			ch = 0x03C3; /* σ */
		break;

	case 0x03B2: /* β */
	case 0x03C0: /* π */
	case 0x03C6: /* φ */
		ch = 0x03C8; /* ψ */
		break;

	case 0x03CD: /* ύ */
	case 0x03C5: /* υ FIXME: This assumes the word has been deaccentized */
		if (len > 1 && (word[len-2] == 0x03B1 /* αύ */ ||
				word[len-2] == 0x03B5 /* εύ */)) {
			ch = 0x03C8; /* ψ */
			break;
		}
		/* fall through: a plain υ is treated like any other vowel */

	default:
		/* Keep the whole stem and append σ */
		len++;
		ch = 0x03C3; /* σ */
	}

	/* len is now the length of the result; its last slot receives ch,
	   the preceding len-1 code points are copied from the input. */
	pw = calloc(len, sizeof(pw[0]));
	if (!pw)
		return -1;
	memcpy(pw, word, sizeof(word[0]) * (len - 1));
	pw[len-1] = ch;

	*thema = pw;
	*tlen = len;
	return 0;
}
a corresponding uppercase + letter if flags & CHF_LOWER and a corresponding lowerrcase + letter if flags & CHF_UPPER */ + unsigned numval; /* Numeric value */ + unsigned accented[3]; /* For vowels - corresponding accented variant */ + unsigned deaccent; /* For accented vowels with diaeresis - corresponding + non-accented character */ +}; + +/* See http://www.unicode.org/charts/PDF/Unicode-5.1/U51-0370.pdf */ +struct char_info_st el_basic_ctype[] = { + { 0x0300, }, + { 0x0301, }, + { 0x0302, }, + { 0x0303, }, + { 0x0304, }, + { 0x0305, }, + { 0x0306, }, + { 0x0307, }, + { 0x0308, }, + { 0x0309, }, + { 0x030A, }, + { 0x030B, }, + { 0x030C, }, + { 0x030D, }, + { 0x030E, }, + { 0x030F, }, + { 0x0310, }, + { 0x0311, }, + { 0x0312, }, + { 0x0313, }, + { 0x0314, }, + { 0x0315, }, + { 0x0316, }, + { 0x0317, }, + { 0x0318, }, + { 0x0319, }, + { 0x031A, }, + { 0x031B, }, + { 0x031C, }, + { 0x031D, }, + { 0x031E, }, + { 0x031F, }, + { 0x0320, }, + { 0x0321, }, + { 0x0322, }, + { 0x0323, }, + { 0x0324, }, + { 0x0325, }, + { 0x0326, }, + { 0x0327, }, + { 0x0328, }, + { 0x0329, }, + { 0x032A, }, + { 0x032B, }, + { 0x032C, }, + { 0x032D, }, + { 0x032E, }, + { 0x032F, }, + { 0x0330, }, + { 0x0331, }, + { 0x0332, }, + { 0x0333, }, + { 0x0334, }, + { 0x0335, }, + { 0x0336, }, + { 0x0337, }, + { 0x0338, }, + { 0x0339, }, + { 0x033A, }, + { 0x033B, }, + { 0x033C, }, + { 0x033D, }, + { 0x033E, }, + { 0x033F, }, + { 0x0340, }, + { 0x0341, }, + { 0x0342, }, + { 0x0343, }, + { 0x0344, }, + { 0x0345, }, + { 0x0346, }, + { 0x0347, }, + { 0x0348, }, + { 0x0349, }, + { 0x034A, }, + { 0x034B, }, + { 0x034C, }, + { 0x034D, }, + { 0x034E, }, + { 0x034F, }, + { 0x0350, }, + { 0x0351, }, + { 0x0352, }, + { 0x0353, }, + { 0x0354, }, + { 0x0355, }, + { 0x0356, }, + { 0x0357, }, + { 0x0358, }, + { 0x0359, }, + { 0x035A, }, + { 0x035B, }, + { 0x035C, }, + { 0x035D, }, + { 0x035E, }, + { 0x035F, }, + { 0x0360, }, + { 0x0361, }, + { 0x0362, }, + { 0x0363, }, + { 0x0364, }, + { 0x0365, }, + { 0x0366, }, 
+ { 0x0367, }, + { 0x0368, }, + { 0x0369, }, + { 0x036A, }, + { 0x036B, }, + { 0x036C, }, + { 0x036D, }, + { 0x036E, }, + { 0x036F, }, + { 0x0370, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER, 0, 0x0371 }, /* CAPITAL HETTA */ + { 0x0371, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER, 0, 0x0370 }, /* SMALL HETA */ + { 0x0372, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER, 0, 0x0373 }, /* CAPITAL SAMPI */ + { 0x0373, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER, 0, 0x0372 }, /* SMALL SAMPI */ + { 0x0374, CHF_MODIFIER|CHF_UPPER, 0, 0x0375 }, /* NUMERAL SIGN = dexia keraia */ + { 0x0375, CHF_MODIFIER|CHF_LOWER, 0, 0x0374 }, /* aristeri keraia */ + { 0x0376, CHF_ARCHAIC|CHF_SEMIVOWEL|CHF_UPPER, 0, 0x0377}, /* CAPITAL PAMPHYLIAN DIGAMMA */ + { 0x0377, CHF_ARCHAIC|CHF_SEMIVOWEL|CHF_LOWER, 0, 0x0376}, /* SMALL PAMPHYLIAN DIGAMMA */ + { 0x0378, }, + { 0x0379, }, + { 0x037A, CHF_ARCHAIC|CHF_MODIFIER }, /* YPOGEGRAMMENI */ + { 0x037B, CHF_SYMBOL, 0, 0x03FD }, /* SMALL REVERSED LUNATE SIGMA */ + { 0x037C, CHF_SYMBOL, 0, 0x03FE }, /* SMALL DOTTED LUNATE SIGMA */ + { 0x037D, CHF_SYMBOL, 0, 0x03FF }, /* SMALL REVERSED DOTTED LUNATE SIGMA */ + { 0x037E, CHF_PUNCT }, /* erotimatiko */ + { 0x037F, }, + { 0x0380, }, + { 0x0381, }, + { 0x0382, }, + { 0x0383, }, + { 0x0384, CHF_MODIFIER }, /* Oxeia */ + { 0x0385, CHF_MODIFIER }, /* dialytika */ + { 0x0386, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0391, 0x03AC }, /* Ά */ + { 0x0387, CHF_PUNCT }, /* ano teleia */ + { 0x0388, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0395, 0x03AD }, /* Έ */ + { 0x0389, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x0397, 0x03AE }, /* Ή */ + { 0x038A, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x0399, 0x03AF }, /* Ί */ + { 0x038B, }, + { 0x038C, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x039F, 0x03CC }, /* Ό */ + { 0x038D, }, + { 0x038E, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x03A5, 0x03CD }, /* Ύ */ + { 0x038F, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x03A9, 0x03CE }, /* Ώ */ + { 0x0390, CHF_VOWEL|CHF_LOWER|CHF_TREMA|CHF_OXEIA, 0x03B9, 0, 0, 0, 0, 0x03CA }, /* ΐ */ + { 0x0391, 
CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1, 0, 0x03B1, 1, 0x0386 }, /* Α */ + { 0x0392, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B2, 2 }, /* Β */ + { 0x0393, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B3, 3 }, /* Γ */ + { 0x0394, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B4, 4 }, /* Δ */ + { 0x0395, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1, 0, 0x03B5, 5, 0x0388 }, /* Ε */ + { 0x0396, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B6, 7 }, /* Ζ */ + { 0x0397, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B7, 8, 0x0389 }, /* Η */ + { 0x0398, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B8, 9 }, /* Θ */ + { 0x0399, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH2, 0, 0x03B9, 10, 0x038A }, /* Ι */ + { 0x039A, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BA, 20 }, /* Κ */ + { 0x039B, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BB, 30 }, /* Λ */ + { 0x039C, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BC, 40 }, /* Μ */ + { 0x039D, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BD, 50 }, /* Ν */ + { 0x039E, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BE, 60 }, /* Ξ */ + { 0x039F, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03BF, 70, 0x038C }, /* Ο */ + { 0x03A0, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C0, 80 }, /* Π */ + { 0x03A1, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C1, 100 }, /* Ρ */ + { 0x03A2, }, + { 0x03A3, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C3, 200 }, /* Σ */ + { 0x03A4, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C4, 300 }, /* Τ */ + { 0x03A5, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1|CHF_DIPH2, 0, 0x03C5, 400, 0x038E }, /* Υ */ + { 0x03A6, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C6, 500 }, /* Φ */ + { 0x03A7, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C7, 600 }, /* Χ */ + { 0x03A8, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C8, 700 }, /* Ψ */ + { 0x03A9, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03C9, 800, 0x038F }, /* Ω */ + { 0x03AA, CHF_VOWEL|CHF_UPPER|CHF_TREMA|CHF_DIPH2, 0x0399, 0x03CA }, /* Ϊ */ + { 0x03AB, CHF_VOWEL|CHF_UPPER|CHF_TREMA, 0x03A5, 0x03CB }, /* 
Ϋ */ + { 0x03AC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B1, 0x0386 }, /* ά */ + { 0x03AD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B4, 0x0388 }, /* έ */ + { 0x03AE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03B7, 0x0389 }, /* ή */ + { 0x03AF, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03B9, 0x038A }, /* ί */ + { 0x03B0, CHF_VOWEL|CHF_OXEIA|CHF_TREMA, 0x03C5, 0, 0, 0, 0, 0x03CB }, /* ΰ */ + { 0x03B1, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1, 0, 0x0391, 1, 0x03AC }, /* α */ + { 0x03B2, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0392, 2 }, /* β */ + { 0x03B3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0393, 3 }, /* γ */ + { 0x03B4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0394, 4 }, /* δ */ + { 0x03B5, CHF_CONSONANT|CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1, 0, 0x0395, 5, 0x03AD }, /* ε */ + { 0x03B6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0396, 7 }, /* ζ */ + { 0x03B7, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1|CHF_DIPH2, 0, 0x0397, 8, 0x03AE }, /* η */ + { 0x03B8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0398, 9 }, /* θ */ + { 0x03B9, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0399, 10, 0x03AF }, /* ι */ + { 0x03BA, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039A, 20 }, /* κ */ + { 0x03BB, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039B, 30 }, /* λ */ + { 0x03BC, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039C, 40 }, /* μ */ + { 0x03BD, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039D, 50 }, /* ν */ + { 0x03BE, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039E, 60 }, /* ξ */ + + { 0x03BF, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x039F, 70, 0x03CC }, /* ο */ + { 0x03C0, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A0, 80 }, /* π */ + { 0x03C1, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A1, 100 }, /* ρ */ + { 0x03C2, CHF_CONSONANT|CHF_LOWER, 0, 0x03A3 }, /* ς */ + { 0x03C3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A3, 200 }, /* σ */ + { 0x03C4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A4, 300 }, /* τ */ + { 0x03C5, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH2, 0, 0x03A5, 400, 0x03CD }, 
/* υ */ + { 0x03C6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A6, 500 }, /* φ */ + { 0x03C7, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A7, 600 }, /* χ */ + { 0x03C8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A8, 700 }, /* ψ */ + { 0x03C9, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x03A9, 800, 0x03CE }, /* ω */ + { 0x03CA, CHF_VOWEL|CHF_LOWER|CHF_TREMA|CHF_DIPH2, 0x03B9, 0x03AA, 0, 0x0390 }, /* ϊ */ + { 0x03CB, CHF_VOWEL|CHF_LOWER|CHF_TREMA, 0x03C5, 0x03AB, 0, 0x03B0 }, /* ϋ */ + { 0x03CC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03BF, 0x038C }, /* ό */ + { 0x03CD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03C5, 0x038E }, /* ύ */ + { 0x03CE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03CE, 0x038F }, /* ώ */ + { 0x03CF, CHF_SYMBOL|CHF_UPPER, 0x03D7 }, /* KAI */ + { 0x03D0, CHF_CONSONANT|CHF_LOWER, 0, 0x0392 }, /* curled beta */ + { 0x03D1, CHF_CONSONANT|CHF_LOWER, 0, 0x0398 }, /* script theta */ + { 0x03D2, CHF_VOWEL|CHF_UPPER, }, /* capital ypsilon with hook */ + { 0x03D3, CHF_VOWEL|CHF_OXEIA, 0x03D2 }, /* capital ypsilon with acute & hook */ + { 0x03D4, CHF_VOWEL|CHF_TREMA, 0x03D2 }, /* capital ypsilon with diaeresis & hook */ + { 0x03D5, CHF_CONSONANT|CHF_LOWER, 0, 0x03A6 }, /* phi */ + { 0x03D6, CHF_CONSONANT|CHF_LOWER, 0, 0x03A0 }, /* pi */ + { 0x03D7, CHF_SYMBOL|CHF_LOWER, 0, 0x03CF }, /* kai */ + { 0x03D8, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER, 0, 0x03D9 }, /* QOPPA */ + { 0x03D9, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER, 0, 0x03D8 }, /* qoppa */ + { 0x03DA, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03DB, 6 }, /* STIGMA */ + { 0x03DB, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03DA, 6 }, /* stigma */ + { 0x03DC, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03DD, 6 }, /* DIGAMMA */ + { 0x03DD, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03DC, 6 }, /* digamma */ + { 0x03DE, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03DF, 6 }, /* KOPPA */ + { 0x03DF, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03DE, 6 }, /* koppa */ + { 
0x03E0, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03E1, 900 }, /* SAMPI */ + { 0x03E1, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03E0, 900 }, /* sampi */ + { 0x03E2, }, + { 0x03E3, }, + { 0x03E4, }, + { 0x03E5, }, + { 0x03E6, }, + { 0x03E7, }, + { 0x03E8, }, + { 0x03E9, }, + { 0x03EA, }, + { 0x03EB, }, + { 0x03EC, }, + { 0x03ED, }, + { 0x03EE, }, + { 0x03EF, }, + { 0x03F0, CHF_CONSONANT|CHF_LOWER, 0, 0x039A }, /* kappa */ + { 0x03F1, CHF_CONSONANT|CHF_LOWER, 0, 0x03A1 }, /* tailed rho */ + { 0x03F2, CHF_CONSONANT, 0, 0x03F9 }, /* lunate sigma */ + { 0x03F3, CHF_SEMIVOWEL|CHF_LOWER, }, /* yot */ + { 0x03F4, CHF_CONSONANT|CHF_UPPER, 0, 0x03B8 }, /* THETA */ + { 0x03F5, CHF_SYMBOL|CHF_LOWER, 0, 0x0395 }, /* lunate epsilon */ + { 0x03F6, CHF_SYMBOL|CHF_LOWER, }, /* reversed lunate epsilon */ + { 0x03F7, }, + { 0x03F8, }, + { 0x03F9, CHF_CONSONANT|CHF_UPPER, 0, 0x03F2 }, /* LUNATE SIGMA */ + { 0x03FA, CHF_ARCHAIC|CHF_CONSONANT|CHF_UPPER, 0, 0x03FB }, /* SAN */ + { 0x03FB, CHF_ARCHAIC|CHF_CONSONANT|CHF_LOWER, 0, 0x03FA }, /* san */ + { 0x03FC, CHF_SYMBOL|CHF_CONSONANT|CHF_LOWER, }, /* rho with stroke */ + { 0x03FD, CHF_SYMBOL|CHF_CONSONANT|CHF_UPPER, 0, 0x037B}, /* CAPITAL REV. 
LUNATE SIGMA + antisigma */ + { 0x03FE, CHF_SYMBOL|CHF_CONSONANT|CHF_UPPER, 0, 0x037C }, /* CAPITAL DOTTED LUNATE SIGMA + sigma periestigmenon */ + { 0x03FF, CHF_SYMBOL|CHF_CONSONANT|CHF_UPPER, 0, 0x037D }, /* antisigma periestigmenon */ +}; + +/* FIXME: Implement http://www.unicode.org/charts/PDF/U1F00.pdf */ +struct char_info_st el_extended_ctype[] = { + { 0x1F00, }, + { 0x1F01, }, + { 0x1F02, }, + { 0x1F03, }, + { 0x1F04, }, + { 0x1F05, }, + { 0x1F06, }, + { 0x1F07, }, + { 0x1F08, }, + { 0x1F09, }, + { 0x1F0A, }, + { 0x1F0B, }, + { 0x1F0C, }, + { 0x1F0D, }, + { 0x1F0E, }, + { 0x1F0F, }, + { 0x1F10, }, + { 0x1F11, }, + { 0x1F12, }, + { 0x1F13, }, + { 0x1F14, }, + { 0x1F15, }, + { 0x1F16, }, + { 0x1F17, }, + { 0x1F18, }, + { 0x1F19, }, + { 0x1F1A, }, + { 0x1F1B, }, + { 0x1F1C, }, + { 0x1F1D, }, + { 0x1F1E, }, + { 0x1F1F, }, + { 0x1F20, }, + { 0x1F21, }, + { 0x1F22, }, + { 0x1F23, }, + { 0x1F24, }, + { 0x1F25, }, + { 0x1F26, }, + { 0x1F27, }, + { 0x1F28, }, + { 0x1F29, }, + { 0x1F2A, }, + { 0x1F2B, }, + { 0x1F2C, }, + { 0x1F2D, }, + { 0x1F2E, }, + { 0x1F2F, }, + { 0x1F30, }, + { 0x1F31, }, + { 0x1F32, }, + { 0x1F33, }, + { 0x1F34, }, + { 0x1F35, }, + { 0x1F36, }, + { 0x1F37, }, + { 0x1F38, }, + { 0x1F39, }, + { 0x1F3A, }, + { 0x1F3B, }, + { 0x1F3C, }, + { 0x1F3D, }, + { 0x1F3E, }, + { 0x1F3F, }, + { 0x1F40, }, + { 0x1F41, }, + { 0x1F42, }, + { 0x1F43, }, + { 0x1F44, }, + { 0x1F45, }, + { 0x1F46, }, + { 0x1F47, }, + { 0x1F48, }, + { 0x1F49, }, + { 0x1F4A, }, + { 0x1F4B, }, + { 0x1F4C, }, + { 0x1F4D, }, + { 0x1F4E, }, + { 0x1F4F, }, + { 0x1F50, }, + { 0x1F51, }, + { 0x1F52, }, + { 0x1F53, }, + { 0x1F54, }, + { 0x1F55, }, + { 0x1F56, }, + { 0x1F57, }, + { 0x1F58, }, + { 0x1F59, }, + { 0x1F5A, }, + { 0x1F5B, }, + { 0x1F5C, }, + { 0x1F5D, }, + { 0x1F5E, }, + { 0x1F5F, }, + { 0x1F60, }, + { 0x1F61, }, + { 0x1F62, }, + { 0x1F63, }, + { 0x1F64, }, + { 0x1F65, }, + { 0x1F66, }, + { 0x1F67, }, + { 0x1F68, }, + { 0x1F69, }, + { 0x1F6A, }, + { 0x1F6B, }, + { 0x1F6C, }, + { 
0x1F6D, }, + { 0x1F6E, }, + { 0x1F6F, }, + { 0x1F70, }, + { 0x1F71, }, + { 0x1F72, }, + { 0x1F73, }, + { 0x1F74, }, + { 0x1F75, }, + { 0x1F76, }, + { 0x1F77, }, + { 0x1F78, }, + { 0x1F79, }, + { 0x1F7A, }, + { 0x1F7B, }, + { 0x1F7C, }, + { 0x1F7D, }, + { 0x1F7E, }, + { 0x1F7F, }, + { 0x1F80, }, + { 0x1F81, }, + { 0x1F82, }, + { 0x1F83, }, + { 0x1F84, }, + { 0x1F85, }, + { 0x1F86, }, + { 0x1F87, }, + { 0x1F88, }, + { 0x1F89, }, + { 0x1F8A, }, + { 0x1F8B, }, + { 0x1F8C, }, + { 0x1F8D, }, + { 0x1F8E, }, + { 0x1F8F, }, + { 0x1F90, }, + { 0x1F91, }, + { 0x1F92, }, + { 0x1F93, }, + { 0x1F94, }, + { 0x1F95, }, + { 0x1F96, }, + { 0x1F97, }, + { 0x1F98, }, + { 0x1F99, }, + { 0x1F9A, }, + { 0x1F9B, }, + { 0x1F9C, }, + { 0x1F9D, }, + { 0x1F9E, }, + { 0x1F9F, }, + { 0x1FA0, }, + { 0x1FA1, }, + { 0x1FA2, }, + { 0x1FA3, }, + { 0x1FA4, }, + { 0x1FA5, }, + { 0x1FA6, }, + { 0x1FA7, }, + { 0x1FA8, }, + { 0x1FA9, }, + { 0x1FAA, }, + { 0x1FAB, }, + { 0x1FAC, }, + { 0x1FAD, }, + { 0x1FAE, }, + { 0x1FAF, }, + { 0x1FB0, }, + { 0x1FB1, }, + { 0x1FB2, }, + { 0x1FB3, }, + { 0x1FB4, }, + { 0x1FB5, }, + { 0x1FB6, }, + { 0x1FB7, }, + { 0x1FB8, }, + { 0x1FB9, }, + { 0x1FBA, }, + { 0x1FBB, }, + { 0x1FBC, }, + { 0x1FBD, }, + { 0x1FBE, }, + { 0x1FBF, }, + { 0x1FC0, }, + { 0x1FC1, }, + { 0x1FC2, }, + { 0x1FC3, }, + { 0x1FC4, }, + { 0x1FC5, }, + { 0x1FC6, }, + { 0x1FC7, }, + { 0x1FC8, }, + { 0x1FC9, }, + { 0x1FCA, }, + { 0x1FCB, }, + { 0x1FCC, }, + { 0x1FCD, }, + { 0x1FCE, }, + { 0x1FCF, }, + { 0x1FD0, }, + { 0x1FD1, }, + { 0x1FD2, }, + { 0x1FD3, }, + { 0x1FD4, }, + { 0x1FD5, }, + { 0x1FD6, }, + { 0x1FD7, }, + { 0x1FD8, }, + { 0x1FD9, }, + { 0x1FDA, }, + { 0x1FDB, }, + { 0x1FDC, }, + { 0x1FDD, }, + { 0x1FDE, }, + { 0x1FDF, }, + { 0x1FE0, }, + { 0x1FE1, }, + { 0x1FE2, }, + { 0x1FE3, }, + { 0x1FE4, }, + { 0x1FE5, }, + { 0x1FE6, }, + { 0x1FE7, }, + { 0x1FE8, }, + { 0x1FE9, }, + { 0x1FEA, }, + { 0x1FEB, }, + { 0x1FEC, }, + { 0x1FED, }, + { 0x1FEE, }, + { 0x1FEF, }, + { 0x1FF0, }, + { 0x1FF1, }, + { 
0x1FF2, }, + { 0x1FF3, }, + { 0x1FF4, }, + { 0x1FF5, }, + { 0x1FF6, }, + { 0x1FF7, }, + { 0x1FF8, }, + { 0x1FF9, }, + { 0x1FFA, }, + { 0x1FFB, }, + { 0x1FFC, }, + { 0x1FFD, }, + { 0x1FFE, }, + { 0x1FFF, } +}; + +static struct char_info_st * +elchr_info(unsigned ch) +{ + if (ch >= 0x0300 && ch <= 0x03FF) + return el_basic_ctype + ch - 0x0300; + else if (ch >= 0x1F00 && ch <= 0x1FFF) + return el_extended_ctype + ch - 0x1F00; + return NULL; +} + +int +elchr_flags(unsigned ch) +{ + struct char_info_st *ci = elchr_info(ch); + return ci ? ci->flags : 0; +} + +int +elchr_isupper(unsigned ch) +{ + return elchr_flags(ch) & CHF_UPPER; +} + +int +elchr_islower(unsigned ch) +{ + return elchr_flags(ch) & CHF_LOWER; +} + +int +elchr_getaccent(unsigned ch) +{ + return elchr_flags(ch) & CHF_ACCENT_MASK; +} + +int +elchr_istrema(unsigned ch) +{ + return elchr_flags(ch) & CHF_TREMA; +} + + +int +elchr_isvowel(unsigned ch) +{ + return elchr_flags(ch) & CHF_VOWEL; +} + +int +elchr_isconsonant(unsigned ch) +{ + return elchr_flags(ch) & CHF_CONSONANT; +} + +int +elchr_issemivowel(unsigned ch) +{ + return elchr_flags(ch) & CHF_SEMIVOWEL; +} + +int +elchr_ispunct(unsigned ch) +{ + return elchr_flags(ch) & CHF_PUNCT; +} + +int +elchr_issymbol(unsigned ch) +{ + return elchr_flags(ch) & CHF_SYMBOL; +} + +int +elchr_ismodifier(unsigned ch) +{ + return elchr_flags(ch) & CHF_MODIFIER; +} + +int +elchr_isarchaic(unsigned ch) +{ + return elchr_flags(ch) & CHF_ARCHAIC; +} + +int +elchr_isnumeric(unsigned ch) +{ + return elchr_flags(ch) & CHF_NUMERIC; +} + +unsigned +elchr_numeric_value(unsigned ch) +{ + struct char_info_st *ci = elchr_info(ch); + return (ci && (ci->flags & CHF_NUMERIC)) ? ci->numval: 0; +} + +unsigned +elchr_toupper(unsigned ch) +{ + struct char_info_st *ci = elchr_info(ch); + return (ci && (ci->flags & CHF_LOWER)) ? ci->trans: ch; +} + +unsigned +elchr_tolower(unsigned ch) +{ + struct char_info_st *ci = elchr_info(ch); + return (ci && (ci->flags & CHF_UPPER)) ? 
ci->trans : ch; +} + +unsigned +elchr_base(unsigned ch) +{ + struct char_info_st *ci = elchr_info(ch); + return (ci && (ci->flags & CHF_ACCENT_MASK) && ci->base) ? ci->base : ch; +} + +unsigned +elchr_deaccent(unsigned ch) +{ + struct char_info_st *ci = elchr_info(ch); + if (ci && (ci->flags & CHF_ACCENT_MASK)) + return ci->deaccent ? ci->deaccent : ci->base ? ci->base : ch; + return ch; +} + +unsigned +elchr_accent(unsigned ch, int acc) +{ + struct char_info_st *ci = elchr_info(ch); + return (ci && ci->accented[acc-1]) ? ci->accented[acc-1] : ch; +} + +int +elchr_diphthong(unsigned ch, int state) +{ + struct char_info_st *ci = elchr_info(ch); + + if (!ci || !(ci->flags & CHF_VOWEL)) + return 0; + switch (state) { + case 0: + if (ci->flags & CHF_DIPH1) + state = 1; + break; + case 1: + if (ci->flags & CHF_DIPH2) + state = 2; + break; + default: + state = 0; + } + return state; +} diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c new file mode 100644 index 0000000..5234eda --- /dev/null +++ b/src/ellinika/elmorph.c @@ -0,0 +1,655 @@ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "utf8.h" +#include "elmorph.h" + +struct elstr { + unsigned *str; /* UTF-8 string */ + size_t len; /* Its length */ + unsigned nsyl; /* Number of syllables. 
*/ + unsigned *sylmap; /* Syllable map (nsyl elements) */ + unsigned acc_syl; /* Number of the accented syllable + (1-based, from the last syllable) */ + unsigned acc_pos; /* Number of the accented character + (0-based, from str[0]) */ +}; + +scm_t_bits _elstr_tag; + +static void +_elstr_syllabize(struct elstr *elstr) +{ + unsigned *sylmap; + unsigned i, nsyl = 0, accsyl = 0, accchr = 0; + int dstate = 0; + int acc = 0; + + sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, "syllable map"); + + for (i = 0; i < elstr->len; i++) { + int nstate; + + if (elchr_getaccent(elstr->str[i])) { + accsyl = nsyl; + accchr = i; + } + nstate = elchr_diphthong(elstr->str[i], dstate); + if (nstate) + /* skip */; + else if (dstate) + sylmap[nsyl++] = i - 1; + else if (elchr_isvowel(elstr->str[i])) + sylmap[nsyl++] = i; + dstate = nstate; + } + if (dstate) + sylmap[nsyl++] = i - 1; + else + sylmap[nsyl-1] = i - 1; + elstr->sylmap = sylmap; + elstr->nsyl = nsyl; + elstr->acc_pos = accchr; + elstr->acc_syl = nsyl - accsyl; +} + +static SCM +_elstr_alloc(const char *instr) +{ + struct elstr *elstr; + unsigned *wptr; + size_t wlen; + + if (utf8_mbstr_to_wc(instr, &wptr, &wlen)) + return SCM_EOL; + + elstr = scm_gc_malloc(sizeof(*elstr), "Elstr"); + elstr->str = wptr; + elstr->len = wlen; + + _elstr_syllabize(elstr); + + SCM_RETURN_NEWSMOB(_elstr_tag, elstr); +} + +static SCM +_elstr_dup(struct elstr *elstr) +{ + struct elstr *elnew; + + elnew = scm_gc_malloc(sizeof(*elstr), "Elstr"); + elnew->str = calloc(elstr->len, sizeof(elnew->str[0])); + if (!elnew->str) + scm_memory_error("_elstr_dup"); + elnew->sylmap = calloc(elstr->nsyl, sizeof(elnew->sylmap[0])); + if (!elnew->sylmap) { + free(elnew->str); + scm_memory_error("_elstr_dup"); + } + memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len); + elnew->len = elstr->len; + elnew->nsyl = elstr->nsyl; + memcpy(elnew->sylmap, elstr->sylmap, + sizeof(elstr->sylmap[0]) * elstr->nsyl); + elnew->acc_syl = elstr->acc_syl; + 
elnew->acc_pos = elstr->acc_pos; + SCM_RETURN_NEWSMOB(_elstr_tag, elnew); +} + +static scm_sizet +_elstr_free(SCM smob) +{ + struct elstr *elstr = (struct elstr *) SCM_CDR(smob); + free(elstr->str); + free(elstr->sylmap); + free(elstr); + return 0; +} + +static int +_elstr_print(SCM smob, SCM port, scm_print_state *pstate) +{ + struct elstr *elstr = (struct elstr *) SCM_CDR(smob); + int i, j, an; + char *s; + + scm_puts("#nsyl - elstr->acc_syl; + if (an == 0) + scm_puts("[", port); + for (i = j = 0; i < elstr->len; i++) { + char r[6]; + int n; + + if (i == elstr->sylmap[j] + 1) { + if (j == an) + scm_puts("]", port); + scm_puts("-", port); + if (++j == an) + scm_puts("[", port); + } + n = utf8_wctomb(r, elstr->str[i]); + if (n == -1) + continue; + r[n] = 0; + scm_puts(r, port); + } + if (j == an) + scm_puts("]", port); + scm_puts("''>", port); + return 1; +} + +static void +_elstr_init() +{ + _elstr_tag = scm_make_smob_type("Elstr", sizeof(struct elstr)); + scm_set_smob_free(_elstr_tag, _elstr_free); + scm_set_smob_print(_elstr_tag, _elstr_print); +} + +SCM_DEFINE_PUBLIC(scm_string__elstr, "string->elstr", 1, 0, 0, + (SCM string), +"Create new ELSTR from STRING\n") +#define FUNC_NAME s_scm_string__elstr +{ + char *str; + SCM scm; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + str = scm_to_locale_string(string); + scm = _elstr_alloc(str); + free(str); + if (scm == SCM_EOL) + scm_misc_error(FUNC_NAME, + "Invalid input string: ~S", + scm_list_1(string)); + return scm; +} +#undef FUNC_NAME + +#define scm_is_elstr(s) (!SCM_IMP(s) && SCM_CELL_TYPE(s) == _elstr_tag) + +SCM_DEFINE_PUBLIC(scm_elstr__string, "elstr->string", 1, 0, 0, + (SCM el), +"Convert EL to a STRING\n") +#define FUNC_NAME s_scm_elstr__string +{ + struct elstr *elstr; + char *s; + SCM scm; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + if (utf8_wc_to_mbstr(elstr->str, elstr->len, &s)) + scm_misc_error(FUNC_NAME, + "cannot convert 
elstr to Scheme", + SCM_EOL); + scm = scm_from_locale_string(s); + free(s); + return scm; +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_length, "elstr-length", 1, 0, 0, + (SCM el), +"Returns the number of characters in EL\n") +#define FUNC_NAME s_scm_elstr_length +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->len); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_number_of_syllables, "elstr-number-of-syllables", + 1, 0, 0, + (SCM el), +"Returns the number of characters in EL\n") +#define FUNC_NAME s_scm_elstr_number_of_syllables +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->nsyl); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_syllable_prop, "elstr-syllable-prop", + 2, 0, 0, + (SCM el, SCM n), +"Returns properties of the syllable N in EL\n") +#define FUNC_NAME s_scm_elstr_syllable_prop +{ + struct elstr *elstr; + unsigned num, start; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); + num = scm_to_uint(n); + if (num > elstr->nsyl) + scm_misc_error(FUNC_NAME, + "cannot get syllable #~S: not enough syllables: ~S", + scm_list_2(el, n)); + num = elstr->nsyl - num; + if (num == 0) + start = 0; + else + start = elstr->sylmap[num - 1] + 1; + + return scm_cons(scm_from_uint(start), + scm_from_uint(elstr->sylmap[num])); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_accent_position, "elstr-accent-position", 1, 0, 0, + (SCM el), +"Return position of the accented character in EL\n") +#define FUNC_NAME s_scm_elstr_accent_position +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->acc_pos); +} +#undef FUNC_NAME + 
+SCM_DEFINE_PUBLIC(scm_elstr_accented_syllable, "elstr-accented-syllable", + 1, 0, 0, + (SCM el), +"Return position of the accented syllable in EL\n") +#define FUNC_NAME s_scm_elstr_accented_syllable +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->acc_syl); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_syllable, "elstr-syllable", + 2, 0, 0, + (SCM el, SCM n), +"Return Nth syllable in EL\n") +#define FUNC_NAME s_scm_elstr_accented_syllable +{ + struct elstr *elstr; + char *s; + SCM scm; + unsigned num, start; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); + num = scm_to_uint(n); + if (num > elstr->nsyl) + scm_misc_error(FUNC_NAME, + "cannot get syllable #~S: not enough syllables: ~S", + scm_list_2(el, n)); + num = elstr->nsyl - num; + if (num == 0) + start = 0; + else + start = elstr->sylmap[num - 1] + 1; + if (utf8_wc_to_mbstr(elstr->str + start, + elstr->sylmap[num] - start + 1, + &s)) + scm_misc_error(FUNC_NAME, + "cannot convert elstr to Scheme", + SCM_EOL); + scm = scm_from_locale_string(s); + free(s); + return scm; +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_character, "elstr-character", + 2, 0, 0, + (SCM el, SCM n), +"Return Nth character in EL\n") +#define FUNC_NAME s_scm_elstr_character +{ + struct elstr *elstr; + unsigned num; + char r[6]; + int len; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); + num = scm_to_uint(n); + if (num >= elstr->len) + scm_misc_error(FUNC_NAME, + "cannot get character #~S: not enough characters: ~S", + scm_list_2(el, n)); + len = utf8_wctomb(r, elstr->str[num]); + if (len <= 0) + scm_misc_error(FUNC_NAME, + "cannot convert elchr to Scheme", + SCM_EOL); + r[len] = 0; + return 
scm_from_locale_string(r); +} +#undef FUNC_NAME + +static SCM +_elstr_chgcase(SCM el, void (*chgfun)(unsigned *, size_t), + int destructive, const char *func_name) +{ + struct elstr *elstr; + SCM scm; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + elstr = (struct elstr*) SCM_CDR(el); + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + chgfun(elstr->str, elstr->len); + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_toupper, "elstr-toupper", + 1, 0, 0, + (SCM el), +"Convert EL to upper case\n") +#define FUNC_NAME s_scm_elstr_toupper +{ + return _elstr_chgcase(el, utf8_wc_strnupper, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_tolower, "elstr-tolower", + 1, 0, 0, + (SCM el), +"Convert EL to lower case\n") +#define FUNC_NAME s_scm_elstr_tolower +{ + return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_toupper_x, "elstr-toupper!", + 1, 0, 0, + (SCM el), +"Convert EL to upper case (destructive)\n") +#define FUNC_NAME s_scm_elstr_toupper_x +{ + return _elstr_chgcase(el, utf8_wc_strnupper, 1, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_tolower_x, "elstr-tolower!", + 1, 0, 0, + (SCM el), +"Convert EL to lower case (destructive)\n") +#define FUNC_NAME s_scm_elstr_tolower_x +{ + return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME); +} +#undef FUNC_NAME + +static SCM +_elstr_deaccent(SCM el, int destructive, const char *func_name) +{ + struct elstr *elstr; + unsigned i; + SCM scm; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + elstr = (struct elstr*) SCM_CDR(el); + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + for (i = 0; i < elstr->len; i++) + elstr->str[i] = elchr_deaccent(elstr->str[i]); + elstr->acc_pos = 0; + elstr->acc_syl = 0; + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_deaccent, 
"elstr-deaccent", + 1, 0, 0, + (SCM el), +"Remove all accents from EL\n") +#define FUNC_NAME s_scm_elstr_deaccent +{ + return _elstr_deaccent(el, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_deaccent_x, "elstr-deaccent!", + 1, 0, 0, + (SCM el), +"Remove all accents from EL (desctructive)\n") +#define FUNC_NAME s_scm_elstr_deaccent_x +{ + return _elstr_deaccent(el, 1, FUNC_NAME); +} +#undef FUNC_NAME + +static SCM +_elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name) +{ + struct elstr *elstr; + unsigned i; + unsigned acc_num, num, len, start; + SCM scm; + int dstate; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name); + elstr = (struct elstr*) SCM_CDR(el); + num = scm_to_uint(n); + if (num > elstr->nsyl) + scm_misc_error(func_name, + "cannot get syllable #~S: not enough syllables: ~S", + scm_list_2(el, n)); + acc_num = elstr->nsyl - num; + if (acc_num == 0) + start = 0; + else + start = elstr->sylmap[acc_num - 1] + 1; + + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + + /* Clear all accents */ + for (i = 0; i < elstr->len; i++) + elstr->str[i] = elchr_deaccent(elstr->str[i]); + len = elstr->sylmap[acc_num] - start + 1; + dstate = 0; + for (i = start; i <= start + len; i++) { + int nstate; + + if (!elchr_isvowel(elstr->str[i])) { + if (dstate) { + --i; + break; + } + continue; + } + nstate = elchr_diphthong(elstr->str[i], dstate); + if (!nstate) + break; + dstate = nstate; + } + elstr->str[i] = elchr_accent(elstr->str[i], CHF_OXEIA); + elstr->acc_syl = num; + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_set_accent, "elstr-set-accent", + 2, 0, 0, + (SCM el, SCM n), +"Set accent on Nth syllable of EL\n") +{ + return _elstr_set_accent(el, n, 0, s_scm_elstr_set_accent); +} + +SCM_DEFINE_PUBLIC(scm_elstr_set_accent_x, "elstr-set-accent!", + 2, 0, 0, + (SCM el, SCM n), +"Set accent on Nth 
syllable of EL (destructive)\n") +{ + return _elstr_set_accent(el, n, 1, s_scm_elstr_set_accent_x); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask", + 2, 0, 0, + (SCM el, SCM n), +"Returns properties of the Nth char in EL, as a bitmask\n") +#define FUNC_NAME s_scm_elstr_char_prop_bitmask +{ + struct elstr *elstr; + unsigned num; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + num = scm_to_uint(n); + if (num >= elstr->len) + scm_misc_error(FUNC_NAME, + "cannot get character #~S: not enough characters: ~S", + scm_list_2(el, n)); + return scm_from_uint(elchr_flags(elstr->str[num])); +} +#undef FUNC_NAME + +static struct deftab { + unsigned val; + char *sym; +} deftab[] = { + { CHF_OXEIA, "elmorph:oxeia" }, + { CHF_PERISPWMENH, "elmorph:perispwmenh" }, + { CHF_BAREIA, "elmorph:bareia" }, + { CHF_ACCENT_MASK, "elmorph:accent-mask" }, + { CHF_TREMA, "elmorph:trema" }, + { CHF_VOWEL, "elmorph:vowel" }, + { CHF_CONSONANT, "elmorph:consonant" }, + { CHF_SEMIVOWEL, "elmorph:semivowel" }, + { CHF_PUNCT, "elmorph:punct" }, + { CHF_SYMBOL, "elmorph:symbol" }, + { CHF_MODIFIER, "elmorph:modifier" }, + { CHF_ARCHAIC, "elmorph:archaic" }, + { CHF_LOWER, "elmorph:lower" }, + { CHF_UPPER, "elmorph:upper" }, + { CHF_NUMERIC, "elmorph:numeric" }, + + { CHF_DIPH1, "elmorph:diph1" }, + { CHF_DIPH2, "elmorph:diph2" } +}; + +SCM_DEFINE_PUBLIC(scm_utf8_toupper, "utf8-toupper", 1, 0, 0, + (SCM string), +"Convert STRING to uppercase\n") +#define FUNC_NAME s_scm_utf8_toupper +{ + char *str; + SCM scm; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + str = scm_to_locale_string(string); + if (utf8_toupper(str, strlen(str))) + scm_misc_error(FUNC_NAME, + "cannot convert to upper case: ~S", + scm_list_1(string)); + scm = scm_from_locale_string(str); + free(str); + return scm; +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_utf8_tolower, "utf8-tolower", 1, 0, 0, + (SCM string), 
+"Convert STRING to lowercase\n") +#define FUNC_NAME s_scm_utf8_tolower +{ + char *str; + SCM scm; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + str = scm_to_locale_string(string); + if (utf8_tolower(str, strlen(str))) + scm_misc_error(FUNC_NAME, + "cannot convert to lower case: ~S", + scm_list_1(string)); + scm = scm_from_locale_string(str); + free(str); + return scm; +} +#undef FUNC_NAME + +static SCM +_elstr_thema_aoristoy(SCM el, int destructive, const char *func_name) +{ + struct elstr *elstr; + SCM scm; + unsigned *wc; + size_t wclen; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + elstr = (struct elstr*) SCM_CDR(el); + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + if (elmorph_thema_aoristoy(elstr->str, elstr->len, &wc, &wclen)) + scm_memory_error(func_name); + free(elstr->str); + elstr->str = wc; + elstr->len = wclen; + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy, "elstr-thema-aoristoy", 1, 0, 0, + (SCM thema), +"Convert THEMA, which must be a root of present. to an aorist root\n") +#define FUNC_NAME s_scm_elstr_thema_aoristoy +{ + return _elstr_thema_aoristoy(thema, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0, + (SCM thema), +"Convert THEMA, which must be a root of present. 
to an aorist root (destructive)\n") +#define FUNC_NAME s_scm_elstr_thema_aoristoy_x +{ + return _elstr_thema_aoristoy(thema, 1, FUNC_NAME); +} +#undef FUNC_NAME + + +void +scm_init_ellinika_elmorph_module() +{ + int i; + + _elstr_init(); + for (i = 0; i < sizeof(deftab)/sizeof(deftab[0]); i++) { + scm_c_define(deftab[i].sym, scm_from_uint(deftab[i].val)); + scm_c_export(deftab[i].sym, NULL); + } +#include "elmorph.x" +} diff --git a/src/ellinika/elmorph.h b/src/ellinika/elmorph.h new file mode 100644 index 0000000..d91f513 --- /dev/null +++ b/src/ellinika/elmorph.h @@ -0,0 +1,46 @@ +#define CHF_OXEIA 1 +#define CHF_PERISPWMENH 2 +#define CHF_BAREIA 3 + +#define CHF_ACCENT_MASK 0x000f + +#define CHF_TREMA 0x0010 + +#define CHF_VOWEL 0x00020 +#define CHF_CONSONANT 0x00040 +#define CHF_SEMIVOWEL 0x00080 +#define CHF_PUNCT 0x00100 +#define CHF_SYMBOL 0x00200 +#define CHF_MODIFIER 0x00400 +#define CHF_ARCHAIC 0x00800 +#define CHF_LOWER 0x01000 +#define CHF_UPPER 0x02000 +#define CHF_NUMERIC 0x04000 + +#define CHF_DIPH1 0x10000 +#define CHF_DIPH2 0x20000 + +int elchr_flags(unsigned ch); +int elchr_isupper(unsigned ch); +int elchr_islower(unsigned ch); +int elchr_getaccent(unsigned ch); +int elchr_istrema(unsigned ch); +int elchr_isvowel(unsigned ch); +int elchr_isconsonant(unsigned ch); +int elchr_issemivowel(unsigned ch); +int elchr_ispunct(unsigned ch); +int elchr_issymbol(unsigned ch); +int elchr_ismodifier(unsigned ch); +int elchr_isarchaic(unsigned ch); +int elchr_isnumeric(unsigned ch); +unsigned elchr_numeric_value(unsigned ch); +unsigned elchr_toupper(unsigned ch); +unsigned elchr_tolower(unsigned ch); +unsigned elchr_base(unsigned ch); +unsigned elchr_deaccent(unsigned ch); +unsigned elchr_accent(unsigned ch, int acc); +int elchr_diphthong(unsigned ch, int state); + + +int elmorph_thema_aoristoy(unsigned *word, size_t len, + unsigned **thema, size_t *tlen); diff --git a/src/ellinika/elmorph.scm4 b/src/ellinika/elmorph.scm4 new file mode 100644 index 
0000000..64c471e --- /dev/null +++ b/src/ellinika/elmorph.scm4 @@ -0,0 +1,5 @@ +(define-module (ellinika elmorph)) + +(load-extension + "LIBDIR/libguile-elmorph-v-VERSION" + "scm_init_ellinika_elmorph_module") diff --git a/src/ellinika/utf8.c b/src/ellinika/utf8.c new file mode 100644 index 0000000..952af07 --- /dev/null +++ b/src/ellinika/utf8.c @@ -0,0 +1,2149 @@ +/* This file is part of GNU Dico + Copyright (C) 2007, 2008, 2010 Sergey Poznyakoff + + GNU Dico is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GNU Dico is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GNU Dico. If not, see . 
*/ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include "utf8.h" + +struct unicase_info_st { + unsigned toupper; + unsigned tolower; + unsigned sort; +}; + +typedef struct unicase_info_st MY_UNICASE_INFO; + +/* UTF tables written by Alexander Barkov */ +static MY_UNICASE_INFO plane00[] = { + {0x0000, 0x0000, 0x0000}, {0x0001, 0x0001, 0x0001}, + {0x0002, 0x0002, 0x0002}, {0x0003, 0x0003, 0x0003}, + {0x0004, 0x0004, 0x0004}, {0x0005, 0x0005, 0x0005}, + {0x0006, 0x0006, 0x0006}, {0x0007, 0x0007, 0x0007}, + {0x0008, 0x0008, 0x0008}, {0x0009, 0x0009, 0x0009}, + {0x000A, 0x000A, 0x000A}, {0x000B, 0x000B, 0x000B}, + {0x000C, 0x000C, 0x000C}, {0x000D, 0x000D, 0x000D}, + {0x000E, 0x000E, 0x000E}, {0x000F, 0x000F, 0x000F}, + {0x0010, 0x0010, 0x0010}, {0x0011, 0x0011, 0x0011}, + {0x0012, 0x0012, 0x0012}, {0x0013, 0x0013, 0x0013}, + {0x0014, 0x0014, 0x0014}, {0x0015, 0x0015, 0x0015}, + {0x0016, 0x0016, 0x0016}, {0x0017, 0x0017, 0x0017}, + {0x0018, 0x0018, 0x0018}, {0x0019, 0x0019, 0x0019}, + {0x001A, 0x001A, 0x001A}, {0x001B, 0x001B, 0x001B}, + {0x001C, 0x001C, 0x001C}, {0x001D, 0x001D, 0x001D}, + {0x001E, 0x001E, 0x001E}, {0x001F, 0x001F, 0x001F}, + {0x0020, 0x0020, 0x0020}, {0x0021, 0x0021, 0x0021}, + {0x0022, 0x0022, 0x0022}, {0x0023, 0x0023, 0x0023}, + {0x0024, 0x0024, 0x0024}, {0x0025, 0x0025, 0x0025}, + {0x0026, 0x0026, 0x0026}, {0x0027, 0x0027, 0x0027}, + {0x0028, 0x0028, 0x0028}, {0x0029, 0x0029, 0x0029}, + {0x002A, 0x002A, 0x002A}, {0x002B, 0x002B, 0x002B}, + {0x002C, 0x002C, 0x002C}, {0x002D, 0x002D, 0x002D}, + {0x002E, 0x002E, 0x002E}, {0x002F, 0x002F, 0x002F}, + {0x0030, 0x0030, 0x0030}, {0x0031, 0x0031, 0x0031}, + {0x0032, 0x0032, 0x0032}, {0x0033, 0x0033, 0x0033}, + {0x0034, 0x0034, 0x0034}, {0x0035, 0x0035, 0x0035}, + {0x0036, 0x0036, 0x0036}, {0x0037, 0x0037, 0x0037}, + {0x0038, 0x0038, 0x0038}, {0x0039, 0x0039, 0x0039}, + {0x003A, 0x003A, 0x003A}, {0x003B, 0x003B, 0x003B}, + {0x003C, 0x003C, 0x003C}, {0x003D, 0x003D, 
0x003D}, + {0x003E, 0x003E, 0x003E}, {0x003F, 0x003F, 0x003F}, + {0x0040, 0x0040, 0x0040}, {0x0041, 0x0061, 0x0041}, + {0x0042, 0x0062, 0x0042}, {0x0043, 0x0063, 0x0043}, + {0x0044, 0x0064, 0x0044}, {0x0045, 0x0065, 0x0045}, + {0x0046, 0x0066, 0x0046}, {0x0047, 0x0067, 0x0047}, + {0x0048, 0x0068, 0x0048}, {0x0049, 0x0069, 0x0049}, + {0x004A, 0x006A, 0x004A}, {0x004B, 0x006B, 0x004B}, + {0x004C, 0x006C, 0x004C}, {0x004D, 0x006D, 0x004D}, + {0x004E, 0x006E, 0x004E}, {0x004F, 0x006F, 0x004F}, + {0x0050, 0x0070, 0x0050}, {0x0051, 0x0071, 0x0051}, + {0x0052, 0x0072, 0x0052}, {0x0053, 0x0073, 0x0053}, + {0x0054, 0x0074, 0x0054}, {0x0055, 0x0075, 0x0055}, + {0x0056, 0x0076, 0x0056}, {0x0057, 0x0077, 0x0057}, + {0x0058, 0x0078, 0x0058}, {0x0059, 0x0079, 0x0059}, + {0x005A, 0x007A, 0x005A}, {0x005B, 0x005B, 0x005B}, + {0x005C, 0x005C, 0x005C}, {0x005D, 0x005D, 0x005D}, + {0x005E, 0x005E, 0x005E}, {0x005F, 0x005F, 0x005F}, + {0x0060, 0x0060, 0x0060}, {0x0041, 0x0061, 0x0041}, + {0x0042, 0x0062, 0x0042}, {0x0043, 0x0063, 0x0043}, + {0x0044, 0x0064, 0x0044}, {0x0045, 0x0065, 0x0045}, + {0x0046, 0x0066, 0x0046}, {0x0047, 0x0067, 0x0047}, + {0x0048, 0x0068, 0x0048}, {0x0049, 0x0069, 0x0049}, + {0x004A, 0x006A, 0x004A}, {0x004B, 0x006B, 0x004B}, + {0x004C, 0x006C, 0x004C}, {0x004D, 0x006D, 0x004D}, + {0x004E, 0x006E, 0x004E}, {0x004F, 0x006F, 0x004F}, + {0x0050, 0x0070, 0x0050}, {0x0051, 0x0071, 0x0051}, + {0x0052, 0x0072, 0x0052}, {0x0053, 0x0073, 0x0053}, + {0x0054, 0x0074, 0x0054}, {0x0055, 0x0075, 0x0055}, + {0x0056, 0x0076, 0x0056}, {0x0057, 0x0077, 0x0057}, + {0x0058, 0x0078, 0x0058}, {0x0059, 0x0079, 0x0059}, + {0x005A, 0x007A, 0x005A}, {0x007B, 0x007B, 0x007B}, + {0x007C, 0x007C, 0x007C}, {0x007D, 0x007D, 0x007D}, + {0x007E, 0x007E, 0x007E}, {0x007F, 0x007F, 0x007F}, + {0x0080, 0x0080, 0x0080}, {0x0081, 0x0081, 0x0081}, + {0x0082, 0x0082, 0x0082}, {0x0083, 0x0083, 0x0083}, + {0x0084, 0x0084, 0x0084}, {0x0085, 0x0085, 0x0085}, + {0x0086, 0x0086, 0x0086}, {0x0087, 0x0087, 
0x0087}, + {0x0088, 0x0088, 0x0088}, {0x0089, 0x0089, 0x0089}, + {0x008A, 0x008A, 0x008A}, {0x008B, 0x008B, 0x008B}, + {0x008C, 0x008C, 0x008C}, {0x008D, 0x008D, 0x008D}, + {0x008E, 0x008E, 0x008E}, {0x008F, 0x008F, 0x008F}, + {0x0090, 0x0090, 0x0090}, {0x0091, 0x0091, 0x0091}, + {0x0092, 0x0092, 0x0092}, {0x0093, 0x0093, 0x0093}, + {0x0094, 0x0094, 0x0094}, {0x0095, 0x0095, 0x0095}, + {0x0096, 0x0096, 0x0096}, {0x0097, 0x0097, 0x0097}, + {0x0098, 0x0098, 0x0098}, {0x0099, 0x0099, 0x0099}, + {0x009A, 0x009A, 0x009A}, {0x009B, 0x009B, 0x009B}, + {0x009C, 0x009C, 0x009C}, {0x009D, 0x009D, 0x009D}, + {0x009E, 0x009E, 0x009E}, {0x009F, 0x009F, 0x009F}, + {0x00A0, 0x00A0, 0x00A0}, {0x00A1, 0x00A1, 0x00A1}, + {0x00A2, 0x00A2, 0x00A2}, {0x00A3, 0x00A3, 0x00A3}, + {0x00A4, 0x00A4, 0x00A4}, {0x00A5, 0x00A5, 0x00A5}, + {0x00A6, 0x00A6, 0x00A6}, {0x00A7, 0x00A7, 0x00A7}, + {0x00A8, 0x00A8, 0x00A8}, {0x00A9, 0x00A9, 0x00A9}, + {0x00AA, 0x00AA, 0x00AA}, {0x00AB, 0x00AB, 0x00AB}, + {0x00AC, 0x00AC, 0x00AC}, {0x00AD, 0x00AD, 0x00AD}, + {0x00AE, 0x00AE, 0x00AE}, {0x00AF, 0x00AF, 0x00AF}, + {0x00B0, 0x00B0, 0x00B0}, {0x00B1, 0x00B1, 0x00B1}, + {0x00B2, 0x00B2, 0x00B2}, {0x00B3, 0x00B3, 0x00B3}, + {0x00B4, 0x00B4, 0x00B4}, {0x039C, 0x00B5, 0x039C}, + {0x00B6, 0x00B6, 0x00B6}, {0x00B7, 0x00B7, 0x00B7}, + {0x00B8, 0x00B8, 0x00B8}, {0x00B9, 0x00B9, 0x00B9}, + {0x00BA, 0x00BA, 0x00BA}, {0x00BB, 0x00BB, 0x00BB}, + {0x00BC, 0x00BC, 0x00BC}, {0x00BD, 0x00BD, 0x00BD}, + {0x00BE, 0x00BE, 0x00BE}, {0x00BF, 0x00BF, 0x00BF}, + {0x00C0, 0x00E0, 0x0041}, {0x00C1, 0x00E1, 0x0041}, + {0x00C2, 0x00E2, 0x0041}, {0x00C3, 0x00E3, 0x0041}, + {0x00C4, 0x00E4, 0x0041}, {0x00C5, 0x00E5, 0x0041}, + {0x00C6, 0x00E6, 0x00C6}, {0x00C7, 0x00E7, 0x0043}, + {0x00C8, 0x00E8, 0x0045}, {0x00C9, 0x00E9, 0x0045}, + {0x00CA, 0x00EA, 0x0045}, {0x00CB, 0x00EB, 0x0045}, + {0x00CC, 0x00EC, 0x0049}, {0x00CD, 0x00ED, 0x0049}, + {0x00CE, 0x00EE, 0x0049}, {0x00CF, 0x00EF, 0x0049}, + {0x00D0, 0x00F0, 0x00D0}, {0x00D1, 0x00F1, 
0x004E}, + {0x00D2, 0x00F2, 0x004F}, {0x00D3, 0x00F3, 0x004F}, + {0x00D4, 0x00F4, 0x004F}, {0x00D5, 0x00F5, 0x004F}, + {0x00D6, 0x00F6, 0x004F}, {0x00D7, 0x00D7, 0x00D7}, + {0x00D8, 0x00F8, 0x00D8}, {0x00D9, 0x00F9, 0x0055}, + {0x00DA, 0x00FA, 0x0055}, {0x00DB, 0x00FB, 0x0055}, + {0x00DC, 0x00FC, 0x0055}, {0x00DD, 0x00FD, 0x0059}, + {0x00DE, 0x00FE, 0x00DE}, {0x00DF, 0x00DF, 0x00DF}, + {0x00C0, 0x00E0, 0x0041}, {0x00C1, 0x00E1, 0x0041}, + {0x00C2, 0x00E2, 0x0041}, {0x00C3, 0x00E3, 0x0041}, + {0x00C4, 0x00E4, 0x0041}, {0x00C5, 0x00E5, 0x0041}, + {0x00C6, 0x00E6, 0x00C6}, {0x00C7, 0x00E7, 0x0043}, + {0x00C8, 0x00E8, 0x0045}, {0x00C9, 0x00E9, 0x0045}, + {0x00CA, 0x00EA, 0x0045}, {0x00CB, 0x00EB, 0x0045}, + {0x00CC, 0x00EC, 0x0049}, {0x00CD, 0x00ED, 0x0049}, + {0x00CE, 0x00EE, 0x0049}, {0x00CF, 0x00EF, 0x0049}, + {0x00D0, 0x00F0, 0x00D0}, {0x00D1, 0x00F1, 0x004E}, + {0x00D2, 0x00F2, 0x004F}, {0x00D3, 0x00F3, 0x004F}, + {0x00D4, 0x00F4, 0x004F}, {0x00D5, 0x00F5, 0x004F}, + {0x00D6, 0x00F6, 0x004F}, {0x00F7, 0x00F7, 0x00F7}, + {0x00D8, 0x00F8, 0x00D8}, {0x00D9, 0x00F9, 0x0055}, + {0x00DA, 0x00FA, 0x0055}, {0x00DB, 0x00FB, 0x0055}, + {0x00DC, 0x00FC, 0x0055}, {0x00DD, 0x00FD, 0x0059}, + {0x00DE, 0x00FE, 0x00DE}, {0x0178, 0x00FF, 0x0059} +}; + +static MY_UNICASE_INFO plane01[] = { + {0x0100, 0x0101, 0x0041}, {0x0100, 0x0101, 0x0041}, + {0x0102, 0x0103, 0x0041}, {0x0102, 0x0103, 0x0041}, + {0x0104, 0x0105, 0x0041}, {0x0104, 0x0105, 0x0041}, + {0x0106, 0x0107, 0x0043}, {0x0106, 0x0107, 0x0043}, + {0x0108, 0x0109, 0x0043}, {0x0108, 0x0109, 0x0043}, + {0x010A, 0x010B, 0x0043}, {0x010A, 0x010B, 0x0043}, + {0x010C, 0x010D, 0x0043}, {0x010C, 0x010D, 0x0043}, + {0x010E, 0x010F, 0x0044}, {0x010E, 0x010F, 0x0044}, + {0x0110, 0x0111, 0x0110}, {0x0110, 0x0111, 0x0110}, + {0x0112, 0x0113, 0x0045}, {0x0112, 0x0113, 0x0045}, + {0x0114, 0x0115, 0x0045}, {0x0114, 0x0115, 0x0045}, + {0x0116, 0x0117, 0x0045}, {0x0116, 0x0117, 0x0045}, + {0x0118, 0x0119, 0x0045}, {0x0118, 0x0119, 0x0045}, + 
{0x011A, 0x011B, 0x0045}, {0x011A, 0x011B, 0x0045}, + {0x011C, 0x011D, 0x0047}, {0x011C, 0x011D, 0x0047}, + {0x011E, 0x011F, 0x0047}, {0x011E, 0x011F, 0x0047}, + {0x0120, 0x0121, 0x0047}, {0x0120, 0x0121, 0x0047}, + {0x0122, 0x0123, 0x0047}, {0x0122, 0x0123, 0x0047}, + {0x0124, 0x0125, 0x0048}, {0x0124, 0x0125, 0x0048}, + {0x0126, 0x0127, 0x0126}, {0x0126, 0x0127, 0x0126}, + {0x0128, 0x0129, 0x0049}, {0x0128, 0x0129, 0x0049}, + {0x012A, 0x012B, 0x0049}, {0x012A, 0x012B, 0x0049}, + {0x012C, 0x012D, 0x0049}, {0x012C, 0x012D, 0x0049}, + {0x012E, 0x012F, 0x0049}, {0x012E, 0x012F, 0x0049}, + {0x0130, 0x0069, 0x0049}, {0x0049, 0x0131, 0x0049}, + {0x0132, 0x0133, 0x0132}, {0x0132, 0x0133, 0x0132}, + {0x0134, 0x0135, 0x004A}, {0x0134, 0x0135, 0x004A}, + {0x0136, 0x0137, 0x004B}, {0x0136, 0x0137, 0x004B}, + {0x0138, 0x0138, 0x0138}, {0x0139, 0x013A, 0x004C}, + {0x0139, 0x013A, 0x004C}, {0x013B, 0x013C, 0x004C}, + {0x013B, 0x013C, 0x004C}, {0x013D, 0x013E, 0x004C}, + {0x013D, 0x013E, 0x004C}, {0x013F, 0x0140, 0x013F}, + {0x013F, 0x0140, 0x013F}, {0x0141, 0x0142, 0x0141}, + {0x0141, 0x0142, 0x0141}, {0x0143, 0x0144, 0x004E}, + {0x0143, 0x0144, 0x004E}, {0x0145, 0x0146, 0x004E}, + {0x0145, 0x0146, 0x004E}, {0x0147, 0x0148, 0x004E}, + {0x0147, 0x0148, 0x004E}, {0x0149, 0x0149, 0x0149}, + {0x014A, 0x014B, 0x014A}, {0x014A, 0x014B, 0x014A}, + {0x014C, 0x014D, 0x004F}, {0x014C, 0x014D, 0x004F}, + {0x014E, 0x014F, 0x004F}, {0x014E, 0x014F, 0x004F}, + {0x0150, 0x0151, 0x004F}, {0x0150, 0x0151, 0x004F}, + {0x0152, 0x0153, 0x0152}, {0x0152, 0x0153, 0x0152}, + {0x0154, 0x0155, 0x0052}, {0x0154, 0x0155, 0x0052}, + {0x0156, 0x0157, 0x0052}, {0x0156, 0x0157, 0x0052}, + {0x0158, 0x0159, 0x0052}, {0x0158, 0x0159, 0x0052}, + {0x015A, 0x015B, 0x0053}, {0x015A, 0x015B, 0x0053}, + {0x015C, 0x015D, 0x0053}, {0x015C, 0x015D, 0x0053}, + {0x015E, 0x015F, 0x0053}, {0x015E, 0x015F, 0x0053}, + {0x0160, 0x0161, 0x0053}, {0x0160, 0x0161, 0x0053}, + {0x0162, 0x0163, 0x0054}, {0x0162, 0x0163, 0x0054}, + 
{0x0164, 0x0165, 0x0054}, {0x0164, 0x0165, 0x0054}, + {0x0166, 0x0167, 0x0166}, {0x0166, 0x0167, 0x0166}, + {0x0168, 0x0169, 0x0055}, {0x0168, 0x0169, 0x0055}, + {0x016A, 0x016B, 0x0055}, {0x016A, 0x016B, 0x0055}, + {0x016C, 0x016D, 0x0055}, {0x016C, 0x016D, 0x0055}, + {0x016E, 0x016F, 0x0055}, {0x016E, 0x016F, 0x0055}, + {0x0170, 0x0171, 0x0055}, {0x0170, 0x0171, 0x0055}, + {0x0172, 0x0173, 0x0055}, {0x0172, 0x0173, 0x0055}, + {0x0174, 0x0175, 0x0057}, {0x0174, 0x0175, 0x0057}, + {0x0176, 0x0177, 0x0059}, {0x0176, 0x0177, 0x0059}, + {0x0178, 0x00FF, 0x0059}, {0x0179, 0x017A, 0x005A}, + {0x0179, 0x017A, 0x005A}, {0x017B, 0x017C, 0x005A}, + {0x017B, 0x017C, 0x005A}, {0x017D, 0x017E, 0x005A}, + {0x017D, 0x017E, 0x005A}, {0x0053, 0x017F, 0x0053}, + {0x0180, 0x0180, 0x0180}, {0x0181, 0x0253, 0x0181}, + {0x0182, 0x0183, 0x0182}, {0x0182, 0x0183, 0x0182}, + {0x0184, 0x0185, 0x0184}, {0x0184, 0x0185, 0x0184}, + {0x0186, 0x0254, 0x0186}, {0x0187, 0x0188, 0x0187}, + {0x0187, 0x0188, 0x0187}, {0x0189, 0x0256, 0x0189}, + {0x018A, 0x0257, 0x018A}, {0x018B, 0x018C, 0x018B}, + {0x018B, 0x018C, 0x018B}, {0x018D, 0x018D, 0x018D}, + {0x018E, 0x01DD, 0x018E}, {0x018F, 0x0259, 0x018F}, + {0x0190, 0x025B, 0x0190}, {0x0191, 0x0192, 0x0191}, + {0x0191, 0x0192, 0x0191}, {0x0193, 0x0260, 0x0193}, + {0x0194, 0x0263, 0x0194}, {0x01F6, 0x0195, 0x01F6}, + {0x0196, 0x0269, 0x0196}, {0x0197, 0x0268, 0x0197}, + {0x0198, 0x0199, 0x0198}, {0x0198, 0x0199, 0x0198}, + {0x019A, 0x019A, 0x019A}, {0x019B, 0x019B, 0x019B}, + {0x019C, 0x026F, 0x019C}, {0x019D, 0x0272, 0x019D}, + {0x019E, 0x019E, 0x019E}, {0x019F, 0x0275, 0x019F}, + {0x01A0, 0x01A1, 0x004F}, {0x01A0, 0x01A1, 0x004F}, + {0x01A2, 0x01A3, 0x01A2}, {0x01A2, 0x01A3, 0x01A2}, + {0x01A4, 0x01A5, 0x01A4}, {0x01A4, 0x01A5, 0x01A4}, + {0x01A6, 0x0280, 0x01A6}, {0x01A7, 0x01A8, 0x01A7}, + {0x01A7, 0x01A8, 0x01A7}, {0x01A9, 0x0283, 0x01A9}, + {0x01AA, 0x01AA, 0x01AA}, {0x01AB, 0x01AB, 0x01AB}, + {0x01AC, 0x01AD, 0x01AC}, {0x01AC, 0x01AD, 0x01AC}, + 
{0x01AE, 0x0288, 0x01AE}, {0x01AF, 0x01B0, 0x0055}, + {0x01AF, 0x01B0, 0x0055}, {0x01B1, 0x028A, 0x01B1}, + {0x01B2, 0x028B, 0x01B2}, {0x01B3, 0x01B4, 0x01B3}, + {0x01B3, 0x01B4, 0x01B3}, {0x01B5, 0x01B6, 0x01B5}, + {0x01B5, 0x01B6, 0x01B5}, {0x01B7, 0x0292, 0x01B7}, + {0x01B8, 0x01B9, 0x01B8}, {0x01B8, 0x01B9, 0x01B8}, + {0x01BA, 0x01BA, 0x01BA}, {0x01BB, 0x01BB, 0x01BB}, + {0x01BC, 0x01BD, 0x01BC}, {0x01BC, 0x01BD, 0x01BC}, + {0x01BE, 0x01BE, 0x01BE}, {0x01F7, 0x01BF, 0x01F7}, + {0x01C0, 0x01C0, 0x01C0}, {0x01C1, 0x01C1, 0x01C1}, + {0x01C2, 0x01C2, 0x01C2}, {0x01C3, 0x01C3, 0x01C3}, + {0x01C4, 0x01C6, 0x01C4}, {0x01C4, 0x01C6, 0x01C4}, + {0x01C4, 0x01C6, 0x01C4}, {0x01C7, 0x01C9, 0x01C7}, + {0x01C7, 0x01C9, 0x01C7}, {0x01C7, 0x01C9, 0x01C7}, + {0x01CA, 0x01CC, 0x01CA}, {0x01CA, 0x01CC, 0x01CA}, + {0x01CA, 0x01CC, 0x01CA}, {0x01CD, 0x01CE, 0x0041}, + {0x01CD, 0x01CE, 0x0041}, {0x01CF, 0x01D0, 0x0049}, + {0x01CF, 0x01D0, 0x0049}, {0x01D1, 0x01D2, 0x004F}, + {0x01D1, 0x01D2, 0x004F}, {0x01D3, 0x01D4, 0x0055}, + {0x01D3, 0x01D4, 0x0055}, {0x01D5, 0x01D6, 0x0055}, + {0x01D5, 0x01D6, 0x0055}, {0x01D7, 0x01D8, 0x0055}, + {0x01D7, 0x01D8, 0x0055}, {0x01D9, 0x01DA, 0x0055}, + {0x01D9, 0x01DA, 0x0055}, {0x01DB, 0x01DC, 0x0055}, + {0x01DB, 0x01DC, 0x0055}, {0x018E, 0x01DD, 0x018E}, + {0x01DE, 0x01DF, 0x0041}, {0x01DE, 0x01DF, 0x0041}, + {0x01E0, 0x01E1, 0x0041}, {0x01E0, 0x01E1, 0x0041}, + {0x01E2, 0x01E3, 0x00C6}, {0x01E2, 0x01E3, 0x00C6}, + {0x01E4, 0x01E5, 0x01E4}, {0x01E4, 0x01E5, 0x01E4}, + {0x01E6, 0x01E7, 0x0047}, {0x01E6, 0x01E7, 0x0047}, + {0x01E8, 0x01E9, 0x004B}, {0x01E8, 0x01E9, 0x004B}, + {0x01EA, 0x01EB, 0x004F}, {0x01EA, 0x01EB, 0x004F}, + {0x01EC, 0x01ED, 0x004F}, {0x01EC, 0x01ED, 0x004F}, + {0x01EE, 0x01EF, 0x01B7}, {0x01EE, 0x01EF, 0x01B7}, + {0x01F0, 0x01F0, 0x004A}, {0x01F1, 0x01F3, 0x01F1}, + {0x01F1, 0x01F3, 0x01F1}, {0x01F1, 0x01F3, 0x01F1}, + {0x01F4, 0x01F5, 0x0047}, {0x01F4, 0x01F5, 0x0047}, + {0x01F6, 0x0195, 0x01F6}, {0x01F7, 0x01BF, 0x01F7}, + 
{0x01F8, 0x01F9, 0x004E}, {0x01F8, 0x01F9, 0x004E}, + {0x01FA, 0x01FB, 0x0041}, {0x01FA, 0x01FB, 0x0041}, + {0x01FC, 0x01FD, 0x00C6}, {0x01FC, 0x01FD, 0x00C6}, + {0x01FE, 0x01FF, 0x00D8}, {0x01FE, 0x01FF, 0x00D8} +}; + +static MY_UNICASE_INFO plane02[] = { + {0x0200, 0x0201, 0x0041}, {0x0200, 0x0201, 0x0041}, + {0x0202, 0x0203, 0x0041}, {0x0202, 0x0203, 0x0041}, + {0x0204, 0x0205, 0x0045}, {0x0204, 0x0205, 0x0045}, + {0x0206, 0x0207, 0x0045}, {0x0206, 0x0207, 0x0045}, + {0x0208, 0x0209, 0x0049}, {0x0208, 0x0209, 0x0049}, + {0x020A, 0x020B, 0x0049}, {0x020A, 0x020B, 0x0049}, + {0x020C, 0x020D, 0x004F}, {0x020C, 0x020D, 0x004F}, + {0x020E, 0x020F, 0x004F}, {0x020E, 0x020F, 0x004F}, + {0x0210, 0x0211, 0x0052}, {0x0210, 0x0211, 0x0052}, + {0x0212, 0x0213, 0x0052}, {0x0212, 0x0213, 0x0052}, + {0x0214, 0x0215, 0x0055}, {0x0214, 0x0215, 0x0055}, + {0x0216, 0x0217, 0x0055}, {0x0216, 0x0217, 0x0055}, + {0x0218, 0x0219, 0x0053}, {0x0218, 0x0219, 0x0053}, + {0x021A, 0x021B, 0x0054}, {0x021A, 0x021B, 0x0054}, + {0x021C, 0x021D, 0x021C}, {0x021C, 0x021D, 0x021C}, + {0x021E, 0x021F, 0x0048}, {0x021E, 0x021F, 0x0048}, + {0x0220, 0x0220, 0x0220}, {0x0221, 0x0221, 0x0221}, + {0x0222, 0x0223, 0x0222}, {0x0222, 0x0223, 0x0222}, + {0x0224, 0x0225, 0x0224}, {0x0224, 0x0225, 0x0224}, + {0x0226, 0x0227, 0x0041}, {0x0226, 0x0227, 0x0041}, + {0x0228, 0x0229, 0x0045}, {0x0228, 0x0229, 0x0045}, + {0x022A, 0x022B, 0x004F}, {0x022A, 0x022B, 0x004F}, + {0x022C, 0x022D, 0x004F}, {0x022C, 0x022D, 0x004F}, + {0x022E, 0x022F, 0x004F}, {0x022E, 0x022F, 0x004F}, + {0x0230, 0x0231, 0x004F}, {0x0230, 0x0231, 0x004F}, + {0x0232, 0x0233, 0x0059}, {0x0232, 0x0233, 0x0059}, + {0x0234, 0x0234, 0x0234}, {0x0235, 0x0235, 0x0235}, + {0x0236, 0x0236, 0x0236}, {0x0237, 0x0237, 0x0237}, + {0x0238, 0x0238, 0x0238}, {0x0239, 0x0239, 0x0239}, + {0x023A, 0x023A, 0x023A}, {0x023B, 0x023B, 0x023B}, + {0x023C, 0x023C, 0x023C}, {0x023D, 0x023D, 0x023D}, + {0x023E, 0x023E, 0x023E}, {0x023F, 0x023F, 0x023F}, + {0x0240, 
0x0240, 0x0240}, {0x0241, 0x0241, 0x0241}, + {0x0242, 0x0242, 0x0242}, {0x0243, 0x0243, 0x0243}, + {0x0244, 0x0244, 0x0244}, {0x0245, 0x0245, 0x0245}, + {0x0246, 0x0246, 0x0246}, {0x0247, 0x0247, 0x0247}, + {0x0248, 0x0248, 0x0248}, {0x0249, 0x0249, 0x0249}, + {0x024A, 0x024A, 0x024A}, {0x024B, 0x024B, 0x024B}, + {0x024C, 0x024C, 0x024C}, {0x024D, 0x024D, 0x024D}, + {0x024E, 0x024E, 0x024E}, {0x024F, 0x024F, 0x024F}, + {0x0250, 0x0250, 0x0250}, {0x0251, 0x0251, 0x0251}, + {0x0252, 0x0252, 0x0252}, {0x0181, 0x0253, 0x0181}, + {0x0186, 0x0254, 0x0186}, {0x0255, 0x0255, 0x0255}, + {0x0189, 0x0256, 0x0189}, {0x018A, 0x0257, 0x018A}, + {0x0258, 0x0258, 0x0258}, {0x018F, 0x0259, 0x018F}, + {0x025A, 0x025A, 0x025A}, {0x0190, 0x025B, 0x0190}, + {0x025C, 0x025C, 0x025C}, {0x025D, 0x025D, 0x025D}, + {0x025E, 0x025E, 0x025E}, {0x025F, 0x025F, 0x025F}, + {0x0193, 0x0260, 0x0193}, {0x0261, 0x0261, 0x0261}, + {0x0262, 0x0262, 0x0262}, {0x0194, 0x0263, 0x0194}, + {0x0264, 0x0264, 0x0264}, {0x0265, 0x0265, 0x0265}, + {0x0266, 0x0266, 0x0266}, {0x0267, 0x0267, 0x0267}, + {0x0197, 0x0268, 0x0197}, {0x0196, 0x0269, 0x0196}, + {0x026A, 0x026A, 0x026A}, {0x026B, 0x026B, 0x026B}, + {0x026C, 0x026C, 0x026C}, {0x026D, 0x026D, 0x026D}, + {0x026E, 0x026E, 0x026E}, {0x019C, 0x026F, 0x019C}, + {0x0270, 0x0270, 0x0270}, {0x0271, 0x0271, 0x0271}, + {0x019D, 0x0272, 0x019D}, {0x0273, 0x0273, 0x0273}, + {0x0274, 0x0274, 0x0274}, {0x019F, 0x0275, 0x019F}, + {0x0276, 0x0276, 0x0276}, {0x0277, 0x0277, 0x0277}, + {0x0278, 0x0278, 0x0278}, {0x0279, 0x0279, 0x0279}, + {0x027A, 0x027A, 0x027A}, {0x027B, 0x027B, 0x027B}, + {0x027C, 0x027C, 0x027C}, {0x027D, 0x027D, 0x027D}, + {0x027E, 0x027E, 0x027E}, {0x027F, 0x027F, 0x027F}, + {0x01A6, 0x0280, 0x01A6}, {0x0281, 0x0281, 0x0281}, + {0x0282, 0x0282, 0x0282}, {0x01A9,