diff options
-rw-r--r-- | configure.ac | 3 | ||||
-rw-r--r-- | scm/Makefile.am | 7 | ||||
-rw-r--r-- | scm/elmorph.c (renamed from scm/mod.c) | 0 | ||||
-rw-r--r-- | scm/elmorph.h (renamed from scm/elchr.h) | 0 | ||||
-rw-r--r-- | src/ellinika/Makefile.am | 44 | ||||
-rw-r--r-- | src/ellinika/aorist.c | 73 | ||||
-rw-r--r-- | src/ellinika/elchr.c (renamed from scm/elchr.c) | 63 | ||||
-rw-r--r-- | src/ellinika/elmorph.c | 655 | ||||
-rw-r--r-- | src/ellinika/elmorph.h | 46 | ||||
-rw-r--r-- | src/ellinika/elmorph.scm4 | 5 | ||||
-rw-r--r-- | src/ellinika/utf8.c (renamed from scm/utf8.c) | 16 | ||||
-rw-r--r-- | src/ellinika/utf8.h (renamed from scm/utf8.h) | 3 |
12 files changed, 882 insertions, 33 deletions
diff --git a/configure.ac b/configure.ac index 0f83e04..e4f4f7c 100644 --- a/configure.ac +++ b/configure.ac @@ -22,7 +22,8 @@ AC_INIT(ellinika, 1.99.99, [gray+ellinika@gnu.org.ua]) AC_CONFIG_SRCDIR(src/cgi-bin/dict.scm4) AC_CONFIG_AUX_DIR([build-aux]) AC_CANONICAL_SYSTEM -AM_INIT_AUTOMAKE(no-exeext) +AM_INIT_AUTOMAKE([1.11 no-exeext]) +AM_CONFIG_HEADER([config.h]) ## * Checks for programs. AC_PROG_CC diff --git a/scm/Makefile.am b/scm/Makefile.am index 4ae4480..f2669f4 100644 --- a/scm/Makefile.am +++ b/scm/Makefile.am @@ -34,10 +34,3 @@ neatrans: $(srcdir)/neatrans.scm dictrans.sed sed -f dictrans.sed $(srcdir)/neatrans.scm > $@ chmod +x $@ -lib_LTLIBRARIES=libelchr.la - -libelchr_la_SOURCES = \ - utf8.c\ - elchr.c\ - mod.c - diff --git a/scm/mod.c b/scm/elmorph.c index 87598d9..87598d9 100644 --- a/scm/mod.c +++ b/scm/elmorph.c diff --git a/scm/elchr.h b/scm/elmorph.h index 6bc19ca..6bc19ca 100644 --- a/scm/elchr.h +++ b/scm/elmorph.h diff --git a/src/ellinika/Makefile.am b/src/ellinika/Makefile.am index 136b44f..274eea8 100644 --- a/src/ellinika/Makefile.am +++ b/src/ellinika/Makefile.am @@ -15,7 +15,7 @@ # along with this program. If not, see <http://www.gnu.org/licenses/>. guiledir=$(GUILE_SITE)/$(PACKAGE) -guile_DATA=xlat.scm cgi.scm i18n.scm config.scm dico.scm +guile_DATA=xlat.scm cgi.scm i18n.scm config.scm dico.scm elmorph.scm cgi.m4: Makefile echo 'divert(-1)' > $@ @@ -31,13 +31,53 @@ cgi.m4: Makefile echo 'define([SYSCONFDIR],$(sysconfdir))' >> $@ echo 'define([LOCALEDIR],$(datadir)/locale)' >> $@ echo 'define([HTMLDIR],$(HTMLDIR))' >> $@ + echo 'define([VERSION],$(VERSION))' >> $@ + echo 'define([LIBDIR],$(pkglibdir))' >> $@ echo 'divert(0)dnl' >> $@ echo '@AUTOGENERATED@' >> $@ -SUFFIXES = .scm4 .scm +SUFFIXES = .scm4 .scm .x .scm4.scm: m4 cgi.m4 $< > $@ cgi.scm: cgi.scm4 cgi.m4 config.scm: config.scm4 cgi.m4 +elmorph.scm: elmorph.scm4 cgi.m4 + +pkglib_LTLIBRARIES=libelmorph.la + +libelmorph_la_SOURCES = \ + aorist.c\ + utf8.c\ + elchr.c\ + elmorph.c\ + elmorph.h + +DOT_X_FILES = elmorph.x + +BUILT_SOURCES = $(DOT_X_FILES) + +DISTCLEANFILES = $(DOT_X_FILES) + +snarfcppopts = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) + +.c.x: + AWK=$(AWK) \ + guile-snarf -o $@ $< $(snarfcppopts) + +pkglibnames=elmorph + +install-data-hook: + here=`pwd`; \ + cd $(DESTDIR)$(pkglibdir);\ + for name in $(pkglibnames); do \ + if test -f lib$$name.la; then \ + dlname=`sed -n 's/dlname='\''\(.*\)'\''/\1/p' lib$$name.la`; \ + test -z "$$dlname" && dlname='lib$$name.so'; \ + $(LN_S) -f "$$dlname" libguile-$$name-v-$(VERSION).so; \ + fi; \ + done; \ + cd $$here + + diff --git a/src/ellinika/aorist.c b/src/ellinika/aorist.c new file mode 100644 index 0000000..995fce8 --- /dev/null +++ b/src/ellinika/aorist.c @@ -0,0 +1,73 @@ +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif +#include <errno.h> +#include <stdlib.h> +#include <libguile.h> +#include "utf8.h" +#include "elmorph.h" + +int +elmorph_thema_aoristoy(unsigned *word, size_t len, + unsigned **thema, size_t *tlen) +{ + unsigned ch, *pw; + + switch (word[len-1]) { + case 0x03B6: /* ζ */ + /* FIXME: This can produce ξ as well: αλλάζω => άλλαξα */ + case 0x03B8: /* θ */ + ch = 0x03C3; /* σ */ + break; + + case 0x03B3: /* γ */ + case 0x03C7: /* χ */ + ch = 0x03BE; /* ξ */ + break; + + case 0x03BA: /* κ */ + if (len > 1 && word[len-2] == 0x03C3 /* σκ */) + len--; + ch = 0x03BE; /* ξ */ + break; + + case 0x03BD: /* ν */ + if (len > 1 && word[len-2] == 0x03C7 /* χν */) { + len--; + ch = 0x03BE; /* ξ */ + } else + ch = 0x03C3; /* σ */ + break; + + case 0x03B2: /* β */ + case 0x03C0: /* π */ + case 0x03C6: /* φ */ + ch = 0x03C8; /* ψ */ + break; + + case 0x03CD: /* ύ */ + case 0x03C5: /* υ FIXME: This assumes the word has been deaccentized */ + if (len > 1 && (word[len-2] == 0x03B1 /* αύ */ || + word[len-2] == 0x03B5 /* εύ */)) { + ch = 0x03C8; /* ψ */ + break; + } + + default: + len++; + ch = 0x03C3; /* σ */ + } + + pw = calloc(len, sizeof(pw[0])); + if (!pw) + return -1; + memcpy(pw, word, sizeof(word[0]) * (len - 1)); + pw[len-1] = ch; + + *thema = pw; + *tlen = len; + return 0; +} + + + diff --git a/scm/elchr.c b/src/ellinika/elchr.c index fa271a1..9b4e7ad 100644 --- a/scm/elchr.c +++ b/src/ellinika/elchr.c @@ -5,7 +5,7 @@ #include <stdlib.h> #include <libguile.h> #include "utf8.h" -#include "elchr.h" +#include "elmorph.h" struct char_info_st { unsigned ch; /* Characters */ @@ -160,23 +160,23 @@ struct char_info_st el_basic_ctype[] = { { 0x0386, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0391, 0x03AC }, /* Ά */ { 0x0387, CHF_PUNCT }, /* ano teleia */ { 0x0388, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0395, 0x03AD }, /* Έ */ - { 0x0389, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0397, 0x03AE }, /* Ή */ - { 0x038A, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0399, 0x03AF }, /* Ί */ + { 0x0389, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x0397, 0x03AE }, /* Ή */ + { 0x038A, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x0399, 0x03AF }, /* Ί */ { 0x038B, }, { 0x038C, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x039F, 0x03CC }, /* Ό */ { 0x038D, }, - { 0x038E, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x03A5, 0x03CD }, /* Ύ */ + { 0x038E, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x03A5, 0x03CD }, /* Ύ */ { 0x038F, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x03A9, 0x03CE }, /* Ώ */ { 0x0390, CHF_VOWEL|CHF_LOWER|CHF_TREMA|CHF_OXEIA, 0x03B9, 0, 0, 0, 0, 0x03CA }, /* ΐ */ - { 0x0391, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B1, 1, 0x0386 }, /* Α */ + { 0x0391, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1, 0, 0x03B1, 1, 0x0386 }, /* Α */ { 0x0392, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B2, 2 }, /* Β */ { 0x0393, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B3, 3 }, /* Γ */ { 0x0394, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B4, 4 }, /* Δ */ - { 0x0395, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B5, 5, 0x0388 }, /* Ε */ + { 0x0395, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1, 0, 0x03B5, 5, 0x0388 }, /* Ε */ { 0x0396, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B6, 7 }, /* Ζ */ { 0x0397, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B7, 8, 0x0389 }, /* Η */ { 0x0398, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B8, 9 }, /* Θ */ - { 0x0399, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B9, 10, 0x038A }, /* Ι */ + { 0x0399, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH2, 0, 0x03B9, 10, 0x038A }, /* Ι */ { 0x039A, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BA, 20 }, /* Κ */ { 0x039B, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BB, 30 }, /* Λ */ { 0x039C, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BC, 40 }, /* Μ */ @@ -188,47 +188,48 @@ struct char_info_st el_basic_ctype[] = { { 0x03A2, }, { 0x03A3, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C3, 200 }, /* Σ */ { 0x03A4, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C4, 300 }, /* Τ */ - { 0x03A5, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03C5, 400, 0x038E }, /* Υ */ + { 0x03A5, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1|CHF_DIPH2, 0, 0x03C5, 400, 0x038E }, /* Υ */ { 0x03A6, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C6, 500 }, /* Φ */ { 0x03A7, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C7, 600 }, /* Χ */ { 0x03A8, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C8, 700 }, /* Ψ */ { 0x03A9, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03C9, 800, 0x038F }, /* Ω */ - { 0x03AA, CHF_VOWEL|CHF_UPPER|CHF_TREMA, 0x0399, 0x03CA }, /* Ϊ */ + { 0x03AA, CHF_VOWEL|CHF_UPPER|CHF_TREMA|CHF_DIPH2, 0x0399, 0x03CA }, /* Ϊ */ { 0x03AB, CHF_VOWEL|CHF_UPPER|CHF_TREMA, 0x03A5, 0x03CB }, /* Ϋ */ { 0x03AC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B1, 0x0386 }, /* ά */ { 0x03AD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B4, 0x0388 }, /* έ */ - { 0x03AE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B7, 0x0389 }, /* ή */ - { 0x03AF, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B9, 0x038A }, /* ί */ + { 0x03AE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03B7, 0x0389 }, /* ή */ + { 0x03AF, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03B9, 0x038A }, /* ί */ { 0x03B0, CHF_VOWEL|CHF_OXEIA|CHF_TREMA, 0x03C5, 0, 0, 0, 0, 0x03CB }, /* ΰ */ - { 0x03B1, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0391, 1, 0x03AC }, /* α */ + { 0x03B1, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1, 0, 0x0391, 1, 0x03AC }, /* α */ { 0x03B2, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0392, 2 }, /* β */ { 0x03B3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0393, 3 }, /* γ */ { 0x03B4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0394, 4 }, /* δ */ - { 0x03B5, CHF_CONSONANT|CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0395, 5, 0x03AD }, /* ε */ + { 0x03B5, CHF_CONSONANT|CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1, 0, 0x0395, 5, 0x03AD }, /* ε */ { 0x03B6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0396, 7 }, /* ζ */ - { 0x03B7, CHF_CONSONANT|CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0397, 8, 0x03AE }, /* η */ + { 0x03B7, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1|CHF_DIPH2, 0, 0x0397, 8, 0x03AE }, /* η */ { 0x03B8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0398, 9 }, /* θ */ - { 0x03B9, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0399, 10, 0x03AF }, /* ι */ + { 0x03B9, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0399, 10, 0x03AF }, /* ι */ { 0x03BA, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039A, 20 }, /* κ */ { 0x03BB, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039B, 30 }, /* λ */ { 0x03BC, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039C, 40 }, /* μ */ { 0x03BD, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039D, 50 }, /* ν */ { 0x03BE, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039E, 60 }, /* ξ */ - { 0x03BF, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x039F, 70, 0x03CC }, /* ο */ + + { 0x03BF, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x039F, 70, 0x03CC }, /* ο */ { 0x03C0, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A0, 80 }, /* π */ { 0x03C1, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A1, 100 }, /* ρ */ { 0x03C2, CHF_CONSONANT|CHF_LOWER, 0, 0x03A3 }, /* ς */ { 0x03C3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A3, 200 }, /* σ */ { 0x03C4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A4, 300 }, /* τ */ - { 0x03C5, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x03A5, 400, 0x03CD }, /* υ */ + { 0x03C5, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH2, 0, 0x03A5, 400, 0x03CD }, /* υ */ { 0x03C6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A6, 500 }, /* φ */ { 0x03C7, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A7, 600 }, /* χ */ { 0x03C8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A8, 700 }, /* ψ */ { 0x03C9, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x03A9, 800, 0x03CE }, /* ω */ - { 0x03CA, CHF_VOWEL|CHF_LOWER|CHF_TREMA, 0x03B9, 0x03AA, 0, 0x0390 }, /* ϊ */ + { 0x03CA, CHF_VOWEL|CHF_LOWER|CHF_TREMA|CHF_DIPH2, 0x03B9, 0x03AA, 0, 0x0390 }, /* ϊ */ { 0x03CB, CHF_VOWEL|CHF_LOWER|CHF_TREMA, 0x03C5, 0x03AB, 0, 0x03B0 }, /* ϋ */ { 0x03CC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03BF, 0x038C }, /* ό */ - { 0x03CD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03C5, 0x038E }, /* ύ */ + { 0x03CD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03C5, 0x038E }, /* ύ */ { 0x03CE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03CE, 0x038F }, /* ώ */ { 0x03CF, CHF_SYMBOL|CHF_UPPER, 0x03D7 }, /* KAI */ { 0x03D0, CHF_CONSONANT|CHF_LOWER, 0, 0x0392 }, /* curled beta */ @@ -674,9 +675,27 @@ unsigned elchr_accent(unsigned ch, int acc) { struct char_info_st *ci = elchr_info(ch); - return (ci && (ci->flags & CHF_ACCENT_MASK) && ci->accented[acc-1]) ? - ci->accented[acc-1] : ch; + return (ci && ci->accented[acc-1]) ? ci->accented[acc-1] : ch; } +int +elchr_diphthong(unsigned ch, int state) +{ + struct char_info_st *ci = elchr_info(ch); - + if (!ci || !(ci->flags & CHF_VOWEL)) + return 0; + switch (state) { + case 0: + if (ci->flags & CHF_DIPH1) + state = 1; + break; + case 1: + if (ci->flags & CHF_DIPH2) + state = 2; + break; + default: + state = 0; + } + return state; +} diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c new file mode 100644 index 0000000..5234eda --- /dev/null +++ b/src/ellinika/elmorph.c @@ -0,0 +1,655 @@ +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif +#include <errno.h> +#include <stdlib.h> +#include <libguile.h> +#include "utf8.h" +#include "elmorph.h" + +struct elstr { + unsigned *str; /* UTF-8 string */ + size_t len; /* Its length */ + unsigned nsyl; /* Number of syllables. */ + unsigned *sylmap; /* Syllable map (nsyl elements) */ + unsigned acc_syl; /* Number of the accented syllable + (1-based, from the last syllable) */ + unsigned acc_pos; /* Number of the accented character + (0-based, from str[0]) */ +}; + +scm_t_bits _elstr_tag; + +static void +_elstr_syllabize(struct elstr *elstr) +{ + unsigned *sylmap; + unsigned i, nsyl = 0, accsyl = 0, accchr = 0; + int dstate = 0; + int acc = 0; + + sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, "syllable map"); + + for (i = 0; i < elstr->len; i++) { + int nstate; + + if (elchr_getaccent(elstr->str[i])) { + accsyl = nsyl; + accchr = i; + } + nstate = elchr_diphthong(elstr->str[i], dstate); + if (nstate) + /* skip */; + else if (dstate) + sylmap[nsyl++] = i - 1; + else if (elchr_isvowel(elstr->str[i])) + sylmap[nsyl++] = i; + dstate = nstate; + } + if (dstate) + sylmap[nsyl++] = i - 1; + else + sylmap[nsyl-1] = i - 1; + elstr->sylmap = sylmap; + elstr->nsyl = nsyl; + elstr->acc_pos = accchr; + elstr->acc_syl = nsyl - accsyl; +} + +static SCM +_elstr_alloc(const char *instr) +{ + struct elstr *elstr; + unsigned *wptr; + size_t wlen; + + if (utf8_mbstr_to_wc(instr, &wptr, &wlen)) + return SCM_EOL; + + elstr = scm_gc_malloc(sizeof(*elstr), "Elstr"); + elstr->str = wptr; + elstr->len = wlen; + + _elstr_syllabize(elstr); + + SCM_RETURN_NEWSMOB(_elstr_tag, elstr); +} + +static SCM +_elstr_dup(struct elstr *elstr) +{ + struct elstr *elnew; + + elnew = scm_gc_malloc(sizeof(*elstr), "Elstr"); + elnew->str = calloc(elstr->len, sizeof(elnew->str[0])); + if (!elnew->str) + scm_memory_error("_elstr_dup"); + elnew->sylmap = calloc(elstr->nsyl, sizeof(elnew->sylmap[0])); + if (!elnew->sylmap) { + free(elnew->str); + scm_memory_error("_elstr_dup"); + } + memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len); + elnew->len = elstr->len; + elnew->nsyl = elstr->nsyl; + memcpy(elnew->sylmap, elstr->sylmap, + sizeof(elstr->sylmap[0]) * elstr->nsyl); + elnew->acc_syl = elstr->acc_syl; + elnew->acc_pos = elstr->acc_pos; + SCM_RETURN_NEWSMOB(_elstr_tag, elnew); +} + +static scm_sizet +_elstr_free(SCM smob) +{ + struct elstr *elstr = (struct elstr *) SCM_CDR(smob); + free(elstr->str); + free(elstr->sylmap); + free(elstr); + return 0; +} + +static int +_elstr_print(SCM smob, SCM port, scm_print_state *pstate) +{ + struct elstr *elstr = (struct elstr *) SCM_CDR(smob); + int i, j, an; + char *s; + + scm_puts("#<elstr ``", port); + an = elstr->nsyl - elstr->acc_syl; + if (an == 0) + scm_puts("[", port); + for (i = j = 0; i < elstr->len; i++) { + char r[6]; + int n; + + if (i == elstr->sylmap[j] + 1) { + if (j == an) + scm_puts("]", port); + scm_puts("-", port); + if (++j == an) + scm_puts("[", port); + } + n = utf8_wctomb(r, elstr->str[i]); + if (n == -1) + continue; + r[n] = 0; + scm_puts(r, port); + } + if (j == an) + scm_puts("]", port); + scm_puts("''>", port); + return 1; +} + +static void +_elstr_init() +{ + _elstr_tag = scm_make_smob_type("Elstr", sizeof(struct elstr)); + scm_set_smob_free(_elstr_tag, _elstr_free); + scm_set_smob_print(_elstr_tag, _elstr_print); +} + +SCM_DEFINE_PUBLIC(scm_string__elstr, "string->elstr", 1, 0, 0, + (SCM string), +"Create new ELSTR from STRING\n") +#define FUNC_NAME s_scm_string__elstr +{ + char *str; + SCM scm; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + str = scm_to_locale_string(string); + scm = _elstr_alloc(str); + free(str); + if (scm == SCM_EOL) + scm_misc_error(FUNC_NAME, + "Invalid input string: ~S", + scm_list_1(string)); + return scm; +} +#undef FUNC_NAME + +#define scm_is_elstr(s) (!SCM_IMP(s) && SCM_CELL_TYPE(s) == _elstr_tag) + +SCM_DEFINE_PUBLIC(scm_elstr__string, "elstr->string", 1, 0, 0, + (SCM el), +"Convert EL to a STRING\n") +#define FUNC_NAME s_scm_elstr__string +{ + struct elstr *elstr; + char *s; + SCM scm; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + if (utf8_wc_to_mbstr(elstr->str, elstr->len, &s)) + scm_misc_error(FUNC_NAME, + "cannot convert elstr to Scheme", + SCM_EOL); + scm = scm_from_locale_string(s); + free(s); + return scm; +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_length, "elstr-length", 1, 0, 0, + (SCM el), +"Returns the number of characters in EL\n") +#define FUNC_NAME s_scm_elstr_length +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->len); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_number_of_syllables, "elstr-number-of-syllables", + 1, 0, 0, + (SCM el), +"Returns the number of characters in EL\n") +#define FUNC_NAME s_scm_elstr_number_of_syllables +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->nsyl); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_syllable_prop, "elstr-syllable-prop", + 2, 0, 0, + (SCM el, SCM n), +"Returns properties of the syllable N in EL\n") +#define FUNC_NAME s_scm_elstr_syllable_prop +{ + struct elstr *elstr; + unsigned num, start; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); + num = scm_to_uint(n); + if (num > elstr->nsyl) + scm_misc_error(FUNC_NAME, + "cannot get syllable #~S: not enough syllables: ~S", + scm_list_2(el, n)); + num = elstr->nsyl - num; + if (num == 0) + start = 0; + else + start = elstr->sylmap[num - 1] + 1; + + return scm_cons(scm_from_uint(start), + scm_from_uint(elstr->sylmap[num])); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_accent_position, "elstr-accent-position", 1, 0, 0, + (SCM el), +"Return position of the accented character in EL\n") +#define FUNC_NAME s_scm_elstr_accent_position +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->acc_pos); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_accented_syllable, "elstr-accented-syllable", + 1, 0, 0, + (SCM el), +"Return position of the accented syllable in EL\n") +#define FUNC_NAME s_scm_elstr_accented_syllable +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->acc_syl); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_syllable, "elstr-syllable", + 2, 0, 0, + (SCM el, SCM n), +"Return Nth syllable in EL\n") +#define FUNC_NAME s_scm_elstr_accented_syllable +{ + struct elstr *elstr; + char *s; + SCM scm; + unsigned num, start; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); + num = scm_to_uint(n); + if (num > elstr->nsyl) + scm_misc_error(FUNC_NAME, + "cannot get syllable #~S: not enough syllables: ~S", + scm_list_2(el, n)); + num = elstr->nsyl - num; + if (num == 0) + start = 0; + else + start = elstr->sylmap[num - 1] + 1; + if (utf8_wc_to_mbstr(elstr->str + start, + elstr->sylmap[num] - start + 1, + &s)) + scm_misc_error(FUNC_NAME, + "cannot convert elstr to Scheme", + SCM_EOL); + scm = scm_from_locale_string(s); + free(s); + return scm; +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_character, "elstr-character", + 2, 0, 0, + (SCM el, SCM n), +"Return Nth character in EL\n") +#define FUNC_NAME s_scm_elstr_character +{ + struct elstr *elstr; + unsigned num; + char r[6]; + int len; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); + num = scm_to_uint(n); + if (num >= elstr->len) + scm_misc_error(FUNC_NAME, + "cannot get character #~S: not enough characters: ~S", + scm_list_2(el, n)); + len = utf8_wctomb(r, elstr->str[num]); + if (len <= 0) + scm_misc_error(FUNC_NAME, + "cannot convert elchr to Scheme", + SCM_EOL); + r[len] = 0; + return scm_from_locale_string(r); +} +#undef FUNC_NAME + +static SCM +_elstr_chgcase(SCM el, void (*chgfun)(unsigned *, size_t), + int destructive, const char *func_name) +{ + struct elstr *elstr; + SCM scm; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + elstr = (struct elstr*) SCM_CDR(el); + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + chgfun(elstr->str, elstr->len); + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_toupper, "elstr-toupper", + 1, 0, 0, + (SCM el), +"Convert EL to upper case\n") +#define FUNC_NAME s_scm_elstr_toupper +{ + return _elstr_chgcase(el, utf8_wc_strnupper, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_tolower, "elstr-tolower", + 1, 0, 0, + (SCM el), +"Convert EL to lower case\n") +#define FUNC_NAME s_scm_elstr_tolower +{ + return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_toupper_x, "elstr-toupper!", + 1, 0, 0, + (SCM el), +"Convert EL to upper case (destructive)\n") +#define FUNC_NAME s_scm_elstr_toupper_x +{ + return _elstr_chgcase(el, utf8_wc_strnupper, 1, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_tolower_x, "elstr-tolower!", + 1, 0, 0, + (SCM el), +"Convert EL to lower case (destructive)\n") +#define FUNC_NAME s_scm_elstr_tolower_x +{ + return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME); +} +#undef FUNC_NAME + +static SCM +_elstr_deaccent(SCM el, int destructive, const char *func_name) +{ + struct elstr *elstr; + unsigned i; + SCM scm; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + elstr = (struct elstr*) SCM_CDR(el); + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + for (i = 0; i < elstr->len; i++) + elstr->str[i] = elchr_deaccent(elstr->str[i]); + elstr->acc_pos = 0; + elstr->acc_syl = 0; + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_deaccent, "elstr-deaccent", + 1, 0, 0, + (SCM el), +"Remove all accents from EL\n") +#define FUNC_NAME s_scm_elstr_deaccent +{ + return _elstr_deaccent(el, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_deaccent_x, "elstr-deaccent!", + 1, 0, 0, + (SCM el), +"Remove all accents from EL (desctructive)\n") +#define FUNC_NAME s_scm_elstr_deaccent_x +{ + return _elstr_deaccent(el, 1, FUNC_NAME); +} +#undef FUNC_NAME + +static SCM +_elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name) +{ + struct elstr *elstr; + unsigned i; + unsigned acc_num, num, len, start; + SCM scm; + int dstate; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name); + elstr = (struct elstr*) SCM_CDR(el); + num = scm_to_uint(n); + if (num > elstr->nsyl) + scm_misc_error(func_name, + "cannot get syllable #~S: not enough syllables: ~S", + scm_list_2(el, n)); + acc_num = elstr->nsyl - num; + if (acc_num == 0) + start = 0; + else + start = elstr->sylmap[acc_num - 1] + 1; + + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + + /* Clear all accents */ + for (i = 0; i < elstr->len; i++) + elstr->str[i] = elchr_deaccent(elstr->str[i]); + len = elstr->sylmap[acc_num] - start + 1; + dstate = 0; + for (i = start; i <= start + len; i++) { + int nstate; + + if (!elchr_isvowel(elstr->str[i])) { + if (dstate) { + --i; + break; + } + continue; + } + nstate = elchr_diphthong(elstr->str[i], dstate); + if (!nstate) + break; + dstate = nstate; + } + elstr->str[i] = elchr_accent(elstr->str[i], CHF_OXEIA); + elstr->acc_syl = num; + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_set_accent, "elstr-set-accent", + 2, 0, 0, + (SCM el, SCM n), +"Set accent on Nth syllable of EL\n") +{ + return _elstr_set_accent(el, n, 0, s_scm_elstr_set_accent); +} + +SCM_DEFINE_PUBLIC(scm_elstr_set_accent_x, "elstr-set-accent!", + 2, 0, 0, + (SCM el, SCM n), +"Set accent on Nth syllable of EL (destructive)\n") +{ + return _elstr_set_accent(el, n, 1, s_scm_elstr_set_accent_x); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask", + 2, 0, 0, + (SCM el, SCM n), +"Returns properties of the Nth char in EL, as a bitmask\n") +#define FUNC_NAME s_scm_elstr_char_prop_bitmask +{ + struct elstr *elstr; + unsigned num; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + num = scm_to_uint(n); + if (num >= elstr->len) + scm_misc_error(FUNC_NAME, + "cannot get character #~S: not enough characters: ~S", + scm_list_2(el, n)); + return scm_from_uint(elchr_flags(elstr->str[num])); +} +#undef FUNC_NAME + +static struct deftab { + unsigned val; + char *sym; +} deftab[] = { + { CHF_OXEIA, "elmorph:oxeia" }, + { CHF_PERISPWMENH, "elmorph:perispwmenh" }, + { CHF_BAREIA, "elmorph:bareia" }, + { CHF_ACCENT_MASK, "elmorph:accent-mask" }, + { CHF_TREMA, "elmorph:trema" }, + { CHF_VOWEL, "elmorph:vowel" }, + { CHF_CONSONANT, "elmorph:consonant" }, + { CHF_SEMIVOWEL, "elmorph:semivowel" }, + { CHF_PUNCT, "elmorph:punct" }, + { CHF_SYMBOL, "elmorph:symbol" }, + { CHF_MODIFIER, "elmorph:modifier" }, + { CHF_ARCHAIC, "elmorph:archaic" }, + { CHF_LOWER, "elmorph:lower" }, + { CHF_UPPER, "elmorph:upper" }, + { CHF_NUMERIC, "elmorph:numeric" }, + + { CHF_DIPH1, "elmorph:diph1" }, + { CHF_DIPH2, "elmorph:diph2" } +}; + +SCM_DEFINE_PUBLIC(scm_utf8_toupper, "utf8-toupper", 1, 0, 0, + (SCM string), +"Convert STRING to uppercase\n") +#define FUNC_NAME s_scm_utf8_toupper +{ + char *str; + SCM scm; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + str = scm_to_locale_string(string); + if (utf8_toupper(str, strlen(str))) + scm_misc_error(FUNC_NAME, + "cannot convert to upper case: ~S", + scm_list_1(string)); + scm = scm_from_locale_string(str); + free(str); + return scm; +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_utf8_tolower, "utf8-tolower", 1, 0, 0, + (SCM string), +"Convert STRING to lowercase\n") +#define FUNC_NAME s_scm_utf8_tolower +{ + char *str; + SCM scm; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + str = scm_to_locale_string(string); + if (utf8_tolower(str, strlen(str))) + scm_misc_error(FUNC_NAME, + "cannot convert to lower case: ~S", + scm_list_1(string)); + scm = scm_from_locale_string(str); + free(str); + return scm; +} +#undef FUNC_NAME + +static SCM +_elstr_thema_aoristoy(SCM el, int destructive, const char *func_name) +{ + struct elstr *elstr; + SCM scm; + unsigned *wc; + size_t wclen; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + elstr = (struct elstr*) SCM_CDR(el); + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + if (elmorph_thema_aoristoy(elstr->str, elstr->len, &wc, &wclen)) + scm_memory_error(func_name); + free(elstr->str); + elstr->str = wc; + elstr->len = wclen; + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy, "elstr-thema-aoristoy", 1, 0, 0, + (SCM thema), +"Convert THEMA, which must be a root of present. to an aorist root\n") +#define FUNC_NAME s_scm_elstr_thema_aoristoy +{ + return _elstr_thema_aoristoy(thema, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0, + (SCM thema), +"Convert THEMA, which must be a root of present. to an aorist root (destructive)\n") +#define FUNC_NAME s_scm_elstr_thema_aoristoy_x +{ + return _elstr_thema_aoristoy(thema, 1, FUNC_NAME); +} +#undef FUNC_NAME + + +void +scm_init_ellinika_elmorph_module() +{ + int i; + + _elstr_init(); + for (i = 0; i < sizeof(deftab)/sizeof(deftab[0]); i++) { + scm_c_define(deftab[i].sym, scm_from_uint(deftab[i].val)); + scm_c_export(deftab[i].sym, NULL); + } +#include "elmorph.x" +} diff --git a/src/ellinika/elmorph.h b/src/ellinika/elmorph.h new file mode 100644 index 0000000..d91f513 --- /dev/null +++ b/src/ellinika/elmorph.h @@ -0,0 +1,46 @@ +#define CHF_OXEIA 1 +#define CHF_PERISPWMENH 2 +#define CHF_BAREIA 3 + +#define CHF_ACCENT_MASK 0x000f + +#define CHF_TREMA 0x0010 + +#define CHF_VOWEL 0x00020 +#define CHF_CONSONANT 0x00040 +#define CHF_SEMIVOWEL 0x00080 +#define CHF_PUNCT 0x00100 +#define CHF_SYMBOL 0x00200 +#define CHF_MODIFIER 0x00400 +#define CHF_ARCHAIC 0x00800 +#define CHF_LOWER 0x01000 +#define CHF_UPPER 0x02000 +#define CHF_NUMERIC 0x04000 + +#define CHF_DIPH1 0x10000 +#define CHF_DIPH2 0x20000 + +int elchr_flags(unsigned ch); +int elchr_isupper(unsigned ch); +int elchr_islower(unsigned ch); +int elchr_getaccent(unsigned ch); +int elchr_istrema(unsigned ch); +int elchr_isvowel(unsigned ch); +int elchr_isconsonant(unsigned ch); +int elchr_issemivowel(unsigned ch); +int elchr_ispunct(unsigned ch); +int elchr_issymbol(unsigned ch); +int elchr_ismodifier(unsigned ch); +int elchr_isarchaic(unsigned ch); +int elchr_isnumeric(unsigned ch); +unsigned elchr_numeric_value(unsigned ch); +unsigned elchr_toupper(unsigned ch); +unsigned elchr_tolower(unsigned ch); +unsigned elchr_base(unsigned ch); +unsigned elchr_deaccent(unsigned ch); +unsigned elchr_accent(unsigned ch, int acc); +int elchr_diphthong(unsigned ch, int state); + + +int elmorph_thema_aoristoy(unsigned *word, size_t len, + unsigned **thema, size_t *tlen); diff --git a/src/ellinika/elmorph.scm4 b/src/ell |