aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--configure.ac3
-rw-r--r--scm/Makefile.am7
-rw-r--r--scm/elmorph.c (renamed from scm/mod.c)0
-rw-r--r--scm/elmorph.h (renamed from scm/elchr.h)0
-rw-r--r--src/ellinika/Makefile.am44
-rw-r--r--src/ellinika/aorist.c73
-rw-r--r--src/ellinika/elchr.c (renamed from scm/elchr.c)63
-rw-r--r--src/ellinika/elmorph.c655
-rw-r--r--src/ellinika/elmorph.h46
-rw-r--r--src/ellinika/elmorph.scm45
-rw-r--r--src/ellinika/utf8.c (renamed from scm/utf8.c)16
-rw-r--r--src/ellinika/utf8.h (renamed from scm/utf8.h)3
12 files changed, 882 insertions, 33 deletions
diff --git a/configure.ac b/configure.ac
index 0f83e04..e4f4f7c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -22,7 +22,8 @@ AC_INIT(ellinika, 1.99.99, [gray+ellinika@gnu.org.ua])
AC_CONFIG_SRCDIR(src/cgi-bin/dict.scm4)
AC_CONFIG_AUX_DIR([build-aux])
AC_CANONICAL_SYSTEM
-AM_INIT_AUTOMAKE(no-exeext)
+AM_INIT_AUTOMAKE([1.11 no-exeext])
+AM_CONFIG_HEADER([config.h])
## * Checks for programs.
AC_PROG_CC
diff --git a/scm/Makefile.am b/scm/Makefile.am
index 4ae4480..f2669f4 100644
--- a/scm/Makefile.am
+++ b/scm/Makefile.am
@@ -34,10 +34,3 @@ neatrans: $(srcdir)/neatrans.scm dictrans.sed
sed -f dictrans.sed $(srcdir)/neatrans.scm > $@
chmod +x $@
-lib_LTLIBRARIES=libelchr.la
-
-libelchr_la_SOURCES = \
- utf8.c\
- elchr.c\
- mod.c
-
diff --git a/scm/mod.c b/scm/elmorph.c
index 87598d9..87598d9 100644
--- a/scm/mod.c
+++ b/scm/elmorph.c
diff --git a/scm/elchr.h b/scm/elmorph.h
index 6bc19ca..6bc19ca 100644
--- a/scm/elchr.h
+++ b/scm/elmorph.h
diff --git a/src/ellinika/Makefile.am b/src/ellinika/Makefile.am
index 136b44f..274eea8 100644
--- a/src/ellinika/Makefile.am
+++ b/src/ellinika/Makefile.am
@@ -15,7 +15,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
guiledir=$(GUILE_SITE)/$(PACKAGE)
-guile_DATA=xlat.scm cgi.scm i18n.scm config.scm dico.scm
+guile_DATA=xlat.scm cgi.scm i18n.scm config.scm dico.scm elmorph.scm
cgi.m4: Makefile
echo 'divert(-1)' > $@
@@ -31,13 +31,53 @@ cgi.m4: Makefile
echo 'define([SYSCONFDIR],$(sysconfdir))' >> $@
echo 'define([LOCALEDIR],$(datadir)/locale)' >> $@
echo 'define([HTMLDIR],$(HTMLDIR))' >> $@
+ echo 'define([VERSION],$(VERSION))' >> $@
+ echo 'define([LIBDIR],$(pkglibdir))' >> $@
echo 'divert(0)dnl' >> $@
echo '@AUTOGENERATED@' >> $@
-SUFFIXES = .scm4 .scm
+SUFFIXES = .scm4 .scm .x
.scm4.scm:
m4 cgi.m4 $< > $@
cgi.scm: cgi.scm4 cgi.m4
config.scm: config.scm4 cgi.m4
+elmorph.scm: elmorph.scm4 cgi.m4
+
+pkglib_LTLIBRARIES=libelmorph.la
+
+libelmorph_la_SOURCES = \
+ aorist.c\
+ utf8.c\
+ elchr.c\
+ elmorph.c\
+ elmorph.h
+
+DOT_X_FILES = elmorph.x
+
+BUILT_SOURCES = $(DOT_X_FILES)
+
+DISTCLEANFILES = $(DOT_X_FILES)
+
+snarfcppopts = $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+
+.c.x:
+ AWK=$(AWK) \
+ guile-snarf -o $@ $< $(snarfcppopts)
+
+pkglibnames=elmorph
+
+install-data-hook:
+ here=`pwd`; \
+ cd $(DESTDIR)$(pkglibdir);\
+ for name in $(pkglibnames); do \
+ if test -f lib$$name.la; then \
+ dlname=`sed -n 's/dlname='\''\(.*\)'\''/\1/p' lib$$name.la`; \
+ test -z "$$dlname" && dlname='lib$$name.so'; \
+ $(LN_S) -f "$$dlname" libguile-$$name-v-$(VERSION).so; \
+ fi; \
+ done; \
+ cd $$here
+
+
diff --git a/src/ellinika/aorist.c b/src/ellinika/aorist.c
new file mode 100644
index 0000000..995fce8
--- /dev/null
+++ b/src/ellinika/aorist.c
@@ -0,0 +1,73 @@
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#include <errno.h>
+#include <stdlib.h>
+#include <libguile.h>
+#include "utf8.h"
+#include "elmorph.h"
+
+int
+elmorph_thema_aoristoy(unsigned *word, size_t len,
+ unsigned **thema, size_t *tlen)
+{
+ unsigned ch, *pw;
+
+ switch (word[len-1]) {
+ case 0x03B6: /* ζ */
+ /* FIXME: This can produce ξ as well: αλλάζω => άλλαξα */
+ case 0x03B8: /* θ */
+ ch = 0x03C3; /* σ */
+ break;
+
+ case 0x03B3: /* γ */
+ case 0x03C7: /* χ */
+ ch = 0x03BE; /* ξ */
+ break;
+
+ case 0x03BA: /* κ */
+ if (len > 1 && word[len-2] == 0x03C3 /* σκ */)
+ len--;
+ ch = 0x03BE; /* ξ */
+ break;
+
+ case 0x03BD: /* ν */
+ if (len > 1 && word[len-2] == 0x03C7 /* χν */) {
+ len--;
+ ch = 0x03BE; /* ξ */
+ } else
+ ch = 0x03C3; /* σ */
+ break;
+
+ case 0x03B2: /* β */
+ case 0x03C0: /* π */
+ case 0x03C6: /* φ */
+ ch = 0x03C8; /* ψ */
+ break;
+
+ case 0x03CD: /* ύ */
+ case 0x03C5: /* υ FIXME: This assumes the word has been deaccentized */
+ if (len > 1 && (word[len-2] == 0x03B1 /* αύ */ ||
+ word[len-2] == 0x03B5 /* εύ */)) {
+ ch = 0x03C8; /* ψ */
+ break;
+ }
+
+ default:
+ len++;
+ ch = 0x03C3; /* σ */
+ }
+
+ pw = calloc(len, sizeof(pw[0]));
+ if (!pw)
+ return -1;
+ memcpy(pw, word, sizeof(word[0]) * (len - 1));
+ pw[len-1] = ch;
+
+ *thema = pw;
+ *tlen = len;
+ return 0;
+}
+
+
+
diff --git a/scm/elchr.c b/src/ellinika/elchr.c
index fa271a1..9b4e7ad 100644
--- a/scm/elchr.c
+++ b/src/ellinika/elchr.c
@@ -5,7 +5,7 @@
#include <stdlib.h>
#include <libguile.h>
#include "utf8.h"
-#include "elchr.h"
+#include "elmorph.h"
struct char_info_st {
unsigned ch; /* Characters */
@@ -160,23 +160,23 @@ struct char_info_st el_basic_ctype[] = {
{ 0x0386, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0391, 0x03AC }, /* Ά */
{ 0x0387, CHF_PUNCT }, /* ano teleia */
{ 0x0388, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0395, 0x03AD }, /* Έ */
- { 0x0389, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0397, 0x03AE }, /* Ή */
- { 0x038A, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x0399, 0x03AF }, /* Ί */
+ { 0x0389, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x0397, 0x03AE }, /* Ή */
+ { 0x038A, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x0399, 0x03AF }, /* Ί */
{ 0x038B, },
{ 0x038C, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x039F, 0x03CC }, /* Ό */
{ 0x038D, },
- { 0x038E, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x03A5, 0x03CD }, /* Ύ */
+ { 0x038E, CHF_VOWEL|CHF_UPPER|CHF_OXEIA|CHF_DIPH2, 0x03A5, 0x03CD }, /* Ύ */
{ 0x038F, CHF_VOWEL|CHF_UPPER|CHF_OXEIA, 0x03A9, 0x03CE }, /* Ώ */
{ 0x0390, CHF_VOWEL|CHF_LOWER|CHF_TREMA|CHF_OXEIA, 0x03B9, 0, 0, 0, 0, 0x03CA }, /* ΐ */
- { 0x0391, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B1, 1, 0x0386 }, /* Α */
+ { 0x0391, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1, 0, 0x03B1, 1, 0x0386 }, /* Α */
{ 0x0392, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B2, 2 }, /* Β */
{ 0x0393, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B3, 3 }, /* Γ */
{ 0x0394, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B4, 4 }, /* Δ */
- { 0x0395, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B5, 5, 0x0388 }, /* Ε */
+ { 0x0395, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1, 0, 0x03B5, 5, 0x0388 }, /* Ε */
{ 0x0396, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B6, 7 }, /* Ζ */
{ 0x0397, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B7, 8, 0x0389 }, /* Η */
{ 0x0398, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03B8, 9 }, /* Θ */
- { 0x0399, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03B9, 10, 0x038A }, /* Ι */
+ { 0x0399, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH2, 0, 0x03B9, 10, 0x038A }, /* Ι */
{ 0x039A, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BA, 20 }, /* Κ */
{ 0x039B, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BB, 30 }, /* Λ */
{ 0x039C, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03BC, 40 }, /* Μ */
@@ -188,47 +188,48 @@ struct char_info_st el_basic_ctype[] = {
{ 0x03A2, },
{ 0x03A3, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C3, 200 }, /* Σ */
{ 0x03A4, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C4, 300 }, /* Τ */
- { 0x03A5, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03C5, 400, 0x038E }, /* Υ */
+ { 0x03A5, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC|CHF_DIPH1|CHF_DIPH2, 0, 0x03C5, 400, 0x038E }, /* Υ */
{ 0x03A6, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C6, 500 }, /* Φ */
{ 0x03A7, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C7, 600 }, /* Χ */
{ 0x03A8, CHF_CONSONANT|CHF_UPPER|CHF_NUMERIC, 0, 0x03C8, 700 }, /* Ψ */
{ 0x03A9, CHF_VOWEL|CHF_UPPER|CHF_NUMERIC, 0, 0x03C9, 800, 0x038F }, /* Ω */
- { 0x03AA, CHF_VOWEL|CHF_UPPER|CHF_TREMA, 0x0399, 0x03CA }, /* Ϊ */
+ { 0x03AA, CHF_VOWEL|CHF_UPPER|CHF_TREMA|CHF_DIPH2, 0x0399, 0x03CA }, /* Ϊ */
{ 0x03AB, CHF_VOWEL|CHF_UPPER|CHF_TREMA, 0x03A5, 0x03CB }, /* Ϋ */
{ 0x03AC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B1, 0x0386 }, /* ά */
{ 0x03AD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B4, 0x0388 }, /* έ */
- { 0x03AE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B7, 0x0389 }, /* ή */
- { 0x03AF, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03B9, 0x038A }, /* ί */
+ { 0x03AE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03B7, 0x0389 }, /* ή */
+ { 0x03AF, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03B9, 0x038A }, /* ί */
{ 0x03B0, CHF_VOWEL|CHF_OXEIA|CHF_TREMA, 0x03C5, 0, 0, 0, 0, 0x03CB }, /* ΰ */
- { 0x03B1, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0391, 1, 0x03AC }, /* α */
+ { 0x03B1, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1, 0, 0x0391, 1, 0x03AC }, /* α */
{ 0x03B2, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0392, 2 }, /* β */
{ 0x03B3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0393, 3 }, /* γ */
{ 0x03B4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0394, 4 }, /* δ */
- { 0x03B5, CHF_CONSONANT|CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0395, 5, 0x03AD }, /* ε */
+ { 0x03B5, CHF_CONSONANT|CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1, 0, 0x0395, 5, 0x03AD }, /* ε */
{ 0x03B6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0396, 7 }, /* ζ */
- { 0x03B7, CHF_CONSONANT|CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0397, 8, 0x03AE }, /* η */
+ { 0x03B7, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH1|CHF_DIPH2, 0, 0x0397, 8, 0x03AE }, /* η */
{ 0x03B8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0398, 9 }, /* θ */
- { 0x03B9, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x0399, 10, 0x03AF }, /* ι */
+ { 0x03B9, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x0399, 10, 0x03AF }, /* ι */
{ 0x03BA, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039A, 20 }, /* κ */
{ 0x03BB, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039B, 30 }, /* λ */
{ 0x03BC, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039C, 40 }, /* μ */
{ 0x03BD, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039D, 50 }, /* ν */
{ 0x03BE, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x039E, 60 }, /* ξ */
- { 0x03BF, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x039F, 70, 0x03CC }, /* ο */
+
+ { 0x03BF, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x039F, 70, 0x03CC }, /* ο */
{ 0x03C0, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A0, 80 }, /* π */
{ 0x03C1, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A1, 100 }, /* ρ */
{ 0x03C2, CHF_CONSONANT|CHF_LOWER, 0, 0x03A3 }, /* ς */
{ 0x03C3, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A3, 200 }, /* σ */
{ 0x03C4, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A4, 300 }, /* τ */
- { 0x03C5, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x03A5, 400, 0x03CD }, /* υ */
+ { 0x03C5, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC|CHF_DIPH2, 0, 0x03A5, 400, 0x03CD }, /* υ */
{ 0x03C6, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A6, 500 }, /* φ */
{ 0x03C7, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A7, 600 }, /* χ */
{ 0x03C8, CHF_CONSONANT|CHF_LOWER|CHF_NUMERIC, 0, 0x03A8, 700 }, /* ψ */
{ 0x03C9, CHF_VOWEL|CHF_LOWER|CHF_NUMERIC, 0, 0x03A9, 800, 0x03CE }, /* ω */
- { 0x03CA, CHF_VOWEL|CHF_LOWER|CHF_TREMA, 0x03B9, 0x03AA, 0, 0x0390 }, /* ϊ */
+ { 0x03CA, CHF_VOWEL|CHF_LOWER|CHF_TREMA|CHF_DIPH2, 0x03B9, 0x03AA, 0, 0x0390 }, /* ϊ */
{ 0x03CB, CHF_VOWEL|CHF_LOWER|CHF_TREMA, 0x03C5, 0x03AB, 0, 0x03B0 }, /* ϋ */
{ 0x03CC, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03BF, 0x038C }, /* ό */
- { 0x03CD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03C5, 0x038E }, /* ύ */
+ { 0x03CD, CHF_VOWEL|CHF_LOWER|CHF_OXEIA|CHF_DIPH2, 0x03C5, 0x038E }, /* ύ */
{ 0x03CE, CHF_VOWEL|CHF_LOWER|CHF_OXEIA, 0x03CE, 0x038F }, /* ώ */
{ 0x03CF, CHF_SYMBOL|CHF_UPPER, 0x03D7 }, /* KAI */
{ 0x03D0, CHF_CONSONANT|CHF_LOWER, 0, 0x0392 }, /* curled beta */
@@ -674,9 +675,27 @@ unsigned
elchr_accent(unsigned ch, int acc)
{
struct char_info_st *ci = elchr_info(ch);
- return (ci && (ci->flags & CHF_ACCENT_MASK) && ci->accented[acc-1]) ?
- ci->accented[acc-1] : ch;
+ return (ci && ci->accented[acc-1]) ? ci->accented[acc-1] : ch;
}
+int
+elchr_diphthong(unsigned ch, int state)
+{
+ struct char_info_st *ci = elchr_info(ch);
-
+ if (!ci || !(ci->flags & CHF_VOWEL))
+ return 0;
+ switch (state) {
+ case 0:
+ if (ci->flags & CHF_DIPH1)
+ state = 1;
+ break;
+ case 1:
+ if (ci->flags & CHF_DIPH2)
+ state = 2;
+ break;
+ default:
+ state = 0;
+ }
+ return state;
+}
diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c
new file mode 100644
index 0000000..5234eda
--- /dev/null
+++ b/src/ellinika/elmorph.c
@@ -0,0 +1,655 @@
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#include <errno.h>
+#include <stdlib.h>
+#include <libguile.h>
+#include "utf8.h"
+#include "elmorph.h"
+
+struct elstr {
+ unsigned *str; /* UTF-8 string */
+ size_t len; /* Its length */
+ unsigned nsyl; /* Number of syllables. */
+ unsigned *sylmap; /* Syllable map (nsyl elements) */
+ unsigned acc_syl; /* Number of the accented syllable
+ (1-based, from the last syllable) */
+ unsigned acc_pos; /* Number of the accented character
+ (0-based, from str[0]) */
+};
+
+scm_t_bits _elstr_tag;
+
+static void
+_elstr_syllabize(struct elstr *elstr)
+{
+ unsigned *sylmap;
+ unsigned i, nsyl = 0, accsyl = 0, accchr = 0;
+ int dstate = 0;
+ int acc = 0;
+
+ sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, "syllable map");
+
+ for (i = 0; i < elstr->len; i++) {
+ int nstate;
+
+ if (elchr_getaccent(elstr->str[i])) {
+ accsyl = nsyl;
+ accchr = i;
+ }
+ nstate = elchr_diphthong(elstr->str[i], dstate);
+ if (nstate)
+ /* skip */;
+ else if (dstate)
+ sylmap[nsyl++] = i - 1;
+ else if (elchr_isvowel(elstr->str[i]))
+ sylmap[nsyl++] = i;
+ dstate = nstate;
+ }
+ if (dstate)
+ sylmap[nsyl++] = i - 1;
+ else
+ sylmap[nsyl-1] = i - 1;
+ elstr->sylmap = sylmap;
+ elstr->nsyl = nsyl;
+ elstr->acc_pos = accchr;
+ elstr->acc_syl = nsyl - accsyl;
+}
+
+static SCM
+_elstr_alloc(const char *instr)
+{
+ struct elstr *elstr;
+ unsigned *wptr;
+ size_t wlen;
+
+ if (utf8_mbstr_to_wc(instr, &wptr, &wlen))
+ return SCM_EOL;
+
+ elstr = scm_gc_malloc(sizeof(*elstr), "Elstr");
+ elstr->str = wptr;
+ elstr->len = wlen;
+
+ _elstr_syllabize(elstr);
+
+ SCM_RETURN_NEWSMOB(_elstr_tag, elstr);
+}
+
+static SCM
+_elstr_dup(struct elstr *elstr)
+{
+ struct elstr *elnew;
+
+ elnew = scm_gc_malloc(sizeof(*elstr), "Elstr");
+ elnew->str = calloc(elstr->len, sizeof(elnew->str[0]));
+ if (!elnew->str)
+ scm_memory_error("_elstr_dup");
+ elnew->sylmap = calloc(elstr->nsyl, sizeof(elnew->sylmap[0]));
+ if (!elnew->sylmap) {
+ free(elnew->str);
+ scm_memory_error("_elstr_dup");
+ }
+ memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len);
+ elnew->len = elstr->len;
+ elnew->nsyl = elstr->nsyl;
+ memcpy(elnew->sylmap, elstr->sylmap,
+ sizeof(elstr->sylmap[0]) * elstr->nsyl);
+ elnew->acc_syl = elstr->acc_syl;
+ elnew->acc_pos = elstr->acc_pos;
+ SCM_RETURN_NEWSMOB(_elstr_tag, elnew);
+}
+
+static scm_sizet
+_elstr_free(SCM smob)
+{
+ struct elstr *elstr = (struct elstr *) SCM_CDR(smob);
+ free(elstr->str);
+ free(elstr->sylmap);
+ free(elstr);
+ return 0;
+}
+
+static int
+_elstr_print(SCM smob, SCM port, scm_print_state *pstate)
+{
+ struct elstr *elstr = (struct elstr *) SCM_CDR(smob);
+ int i, j, an;
+ char *s;
+
+ scm_puts("#<elstr ``", port);
+ an = elstr->nsyl - elstr->acc_syl;
+ if (an == 0)
+ scm_puts("[", port);
+ for (i = j = 0; i < elstr->len; i++) {
+ char r[6];
+ int n;
+
+ if (i == elstr->sylmap[j] + 1) {
+ if (j == an)
+ scm_puts("]", port);
+ scm_puts("-", port);
+ if (++j == an)
+ scm_puts("[", port);
+ }
+ n = utf8_wctomb(r, elstr->str[i]);
+ if (n == -1)
+ continue;
+ r[n] = 0;
+ scm_puts(r, port);
+ }
+ if (j == an)
+ scm_puts("]", port);
+ scm_puts("''>", port);
+ return 1;
+}
+
+static void
+_elstr_init()
+{
+ _elstr_tag = scm_make_smob_type("Elstr", sizeof(struct elstr));
+ scm_set_smob_free(_elstr_tag, _elstr_free);
+ scm_set_smob_print(_elstr_tag, _elstr_print);
+}
+
+SCM_DEFINE_PUBLIC(scm_string__elstr, "string->elstr", 1, 0, 0,
+ (SCM string),
+"Create new ELSTR from STRING\n")
+#define FUNC_NAME s_scm_string__elstr
+{
+ char *str;
+ SCM scm;
+
+ SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME);
+ str = scm_to_locale_string(string);
+ scm = _elstr_alloc(str);
+ free(str);
+ if (scm == SCM_EOL)
+ scm_misc_error(FUNC_NAME,
+ "Invalid input string: ~S",
+ scm_list_1(string));
+ return scm;
+}
+#undef FUNC_NAME
+
+#define scm_is_elstr(s) (!SCM_IMP(s) && SCM_CELL_TYPE(s) == _elstr_tag)
+
+SCM_DEFINE_PUBLIC(scm_elstr__string, "elstr->string", 1, 0, 0,
+ (SCM el),
+"Convert EL to a STRING\n")
+#define FUNC_NAME s_scm_elstr__string
+{
+ struct elstr *elstr;
+ char *s;
+ SCM scm;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+ elstr = (struct elstr*) SCM_CDR(el);
+ if (utf8_wc_to_mbstr(elstr->str, elstr->len, &s))
+ scm_misc_error(FUNC_NAME,
+ "cannot convert elstr to Scheme",
+ SCM_EOL);
+ scm = scm_from_locale_string(s);
+ free(s);
+ return scm;
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_length, "elstr-length", 1, 0, 0,
+ (SCM el),
+"Returns the number of characters in EL\n")
+#define FUNC_NAME s_scm_elstr_length
+{
+ struct elstr *elstr;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+ elstr = (struct elstr*) SCM_CDR(el);
+ return scm_from_uint(elstr->len);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_number_of_syllables, "elstr-number-of-syllables",
+ 1, 0, 0,
+ (SCM el),
+"Returns the number of characters in EL\n")
+#define FUNC_NAME s_scm_elstr_number_of_syllables
+{
+ struct elstr *elstr;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+ elstr = (struct elstr*) SCM_CDR(el);
+ return scm_from_uint(elstr->nsyl);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_syllable_prop, "elstr-syllable-prop",
+ 2, 0, 0,
+ (SCM el, SCM n),
+"Returns properties of the syllable N in EL\n")
+#define FUNC_NAME s_scm_elstr_syllable_prop
+{
+ struct elstr *elstr;
+ unsigned num, start;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+ elstr = (struct elstr*) SCM_CDR(el);
+ SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
+ num = scm_to_uint(n);
+ if (num > elstr->nsyl)
+ scm_misc_error(FUNC_NAME,
+ "cannot get syllable #~S: not enough syllables: ~S",
+ scm_list_2(el, n));
+ num = elstr->nsyl - num;
+ if (num == 0)
+ start = 0;
+ else
+ start = elstr->sylmap[num - 1] + 1;
+
+ return scm_cons(scm_from_uint(start),
+ scm_from_uint(elstr->sylmap[num]));
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_accent_position, "elstr-accent-position", 1, 0, 0,
+ (SCM el),
+"Return position of the accented character in EL\n")
+#define FUNC_NAME s_scm_elstr_accent_position
+{
+ struct elstr *elstr;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+ elstr = (struct elstr*) SCM_CDR(el);
+ return scm_from_uint(elstr->acc_pos);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_accented_syllable, "elstr-accented-syllable",
+ 1, 0, 0,
+ (SCM el),
+"Return position of the accented syllable in EL\n")
+#define FUNC_NAME s_scm_elstr_accented_syllable
+{
+ struct elstr *elstr;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+ elstr = (struct elstr*) SCM_CDR(el);
+ return scm_from_uint(elstr->acc_syl);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_syllable, "elstr-syllable",
+ 2, 0, 0,
+ (SCM el, SCM n),
+"Return Nth syllable in EL\n")
+#define FUNC_NAME s_scm_elstr_accented_syllable
+{
+ struct elstr *elstr;
+ char *s;
+ SCM scm;
+ unsigned num, start;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+ elstr = (struct elstr*) SCM_CDR(el);
+ SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
+ num = scm_to_uint(n);
+ if (num > elstr->nsyl)
+ scm_misc_error(FUNC_NAME,
+ "cannot get syllable #~S: not enough syllables: ~S",
+ scm_list_2(el, n));
+ num = elstr->nsyl - num;
+ if (num == 0)
+ start = 0;
+ else
+ start = elstr->sylmap[num - 1] + 1;
+ if (utf8_wc_to_mbstr(elstr->str + start,
+ elstr->sylmap[num] - start + 1,
+ &s))
+ scm_misc_error(FUNC_NAME,
+ "cannot convert elstr to Scheme",
+ SCM_EOL);
+ scm = scm_from_locale_string(s);
+ free(s);
+ return scm;
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_character, "elstr-character",
+ 2, 0, 0,
+ (SCM el, SCM n),
+"Return Nth character in EL\n")
+#define FUNC_NAME s_scm_elstr_character
+{
+ struct elstr *elstr;
+ unsigned num;
+ char r[6];
+ int len;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+ elstr = (struct elstr*) SCM_CDR(el);
+ SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
+ num = scm_to_uint(n);
+ if (num >= elstr->len)
+ scm_misc_error(FUNC_NAME,
+ "cannot get character #~S: not enough characters: ~S",
+ scm_list_2(el, n));
+ len = utf8_wctomb(r, elstr->str[num]);
+ if (len <= 0)
+ scm_misc_error(FUNC_NAME,
+ "cannot convert elchr to Scheme",
+ SCM_EOL);
+ r[len] = 0;
+ return scm_from_locale_string(r);
+}
+#undef FUNC_NAME
+
+static SCM
+_elstr_chgcase(SCM el, void (*chgfun)(unsigned *, size_t),
+ int destructive, const char *func_name)
+{
+ struct elstr *elstr;
+ SCM scm;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
+ elstr = (struct elstr*) SCM_CDR(el);
+ if (destructive)
+ scm = SCM_UNSPECIFIED;
+ else {
+ scm = _elstr_dup(elstr);
+ elstr = (struct elstr*) SCM_CDR(scm);
+ }
+ chgfun(elstr->str, elstr->len);
+ return scm;
+}
+
+SCM_DEFINE_PUBLIC(scm_elstr_toupper, "elstr-toupper",
+ 1, 0, 0,
+ (SCM el),
+"Convert EL to upper case\n")
+#define FUNC_NAME s_scm_elstr_toupper
+{
+ return _elstr_chgcase(el, utf8_wc_strnupper, 0, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_tolower, "elstr-tolower",
+ 1, 0, 0,
+ (SCM el),
+"Convert EL to lower case\n")
+#define FUNC_NAME s_scm_elstr_tolower
+{
+ return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_toupper_x, "elstr-toupper!",
+ 1, 0, 0,
+ (SCM el),
+"Convert EL to upper case (destructive)\n")
+#define FUNC_NAME s_scm_elstr_toupper_x
+{
+ return _elstr_chgcase(el, utf8_wc_strnupper, 1, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_tolower_x, "elstr-tolower!",
+ 1, 0, 0,
+ (SCM el),
+"Convert EL to lower case (destructive)\n")
+#define FUNC_NAME s_scm_elstr_tolower_x
+{
+ return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+static SCM
+_elstr_deaccent(SCM el, int destructive, const char *func_name)
+{
+ struct elstr *elstr;
+ unsigned i;
+ SCM scm;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
+ elstr = (struct elstr*) SCM_CDR(el);
+ if (destructive)
+ scm = SCM_UNSPECIFIED;
+ else {
+ scm = _elstr_dup(elstr);
+ elstr = (struct elstr*) SCM_CDR(scm);
+ }
+ for (i = 0; i < elstr->len; i++)
+ elstr->str[i] = elchr_deaccent(elstr->str[i]);
+ elstr->acc_pos = 0;
+ elstr->acc_syl = 0;
+ return scm;
+}
+
+SCM_DEFINE_PUBLIC(scm_elstr_deaccent, "elstr-deaccent",
+ 1, 0, 0,
+ (SCM el),
+"Remove all accents from EL\n")
+#define FUNC_NAME s_scm_elstr_deaccent
+{
+ return _elstr_deaccent(el, 0, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_deaccent_x, "elstr-deaccent!",
+ 1, 0, 0,
+ (SCM el),
+"Remove all accents from EL (desctructive)\n")
+#define FUNC_NAME s_scm_elstr_deaccent_x
+{
+ return _elstr_deaccent(el, 1, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+static SCM
+_elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name)
+{
+ struct elstr *elstr;
+ unsigned i;
+ unsigned acc_num, num, len, start;
+ SCM scm;
+ int dstate;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
+ SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name);
+ elstr = (struct elstr*) SCM_CDR(el);
+ num = scm_to_uint(n);
+ if (num > elstr->nsyl)
+ scm_misc_error(func_name,
+ "cannot get syllable #~S: not enough syllables: ~S",
+ scm_list_2(el, n));
+ acc_num = elstr->nsyl - num;
+ if (acc_num == 0)
+ start = 0;
+ else
+ start = elstr->sylmap[acc_num - 1] + 1;
+
+ if (destructive)
+ scm = SCM_UNSPECIFIED;
+ else {
+ scm = _elstr_dup(elstr);
+ elstr = (struct elstr*) SCM_CDR(scm);
+ }
+
+ /* Clear all accents */
+ for (i = 0; i < elstr->len; i++)
+ elstr->str[i] = elchr_deaccent(elstr->str[i]);
+ len = elstr->sylmap[acc_num] - start + 1;
+ dstate = 0;
+ for (i = start; i <= start + len; i++) {
+ int nstate;
+
+ if (!elchr_isvowel(elstr->str[i])) {
+ if (dstate) {
+ --i;
+ break;
+ }
+ continue;
+ }
+ nstate = elchr_diphthong(elstr->str[i], dstate);
+ if (!nstate)
+ break;
+ dstate = nstate;
+ }
+ elstr->str[i] = elchr_accent(elstr->str[i], CHF_OXEIA);
+ elstr->acc_syl = num;
+ return scm;
+}
+
+SCM_DEFINE_PUBLIC(scm_elstr_set_accent, "elstr-set-accent",
+ 2, 0, 0,
+ (SCM el, SCM n),
+"Set accent on Nth syllable of EL\n")
+{
+ return _elstr_set_accent(el, n, 0, s_scm_elstr_set_accent);
+}
+
+SCM_DEFINE_PUBLIC(scm_elstr_set_accent_x, "elstr-set-accent!",
+ 2, 0, 0,
+ (SCM el, SCM n),
+"Set accent on Nth syllable of EL (destructive)\n")
+{
+ return _elstr_set_accent(el, n, 1, s_scm_elstr_set_accent_x);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask",
+ 2, 0, 0,
+ (SCM el, SCM n),
+"Returns properties of the Nth char in EL, as a bitmask\n")
+#define FUNC_NAME s_scm_elstr_char_prop_bitmask
+{
+ struct elstr *elstr;
+ unsigned num;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+ elstr = (struct elstr*) SCM_CDR(el);
+ num = scm_to_uint(n);
+ if (num >= elstr->len)
+ scm_misc_error(FUNC_NAME,
+ "cannot get character #~S: not enough characters: ~S",
+ scm_list_2(el, n));
+ return scm_from_uint(elchr_flags(elstr->str[num]));
+}
+#undef FUNC_NAME
+
+static struct deftab {
+ unsigned val;
+ char *sym;
+} deftab[] = {
+ { CHF_OXEIA, "elmorph:oxeia" },
+ { CHF_PERISPWMENH, "elmorph:perispwmenh" },
+ { CHF_BAREIA, "elmorph:bareia" },
+ { CHF_ACCENT_MASK, "elmorph:accent-mask" },
+ { CHF_TREMA, "elmorph:trema" },
+ { CHF_VOWEL, "elmorph:vowel" },
+ { CHF_CONSONANT, "elmorph:consonant" },
+ { CHF_SEMIVOWEL, "elmorph:semivowel" },
+ { CHF_PUNCT, "elmorph:punct" },
+ { CHF_SYMBOL, "elmorph:symbol" },
+ { CHF_MODIFIER, "elmorph:modifier" },
+ { CHF_ARCHAIC, "elmorph:archaic" },
+ { CHF_LOWER, "elmorph:lower" },
+ { CHF_UPPER, "elmorph:upper" },
+ { CHF_NUMERIC, "elmorph:numeric" },
+
+ { CHF_DIPH1, "elmorph:diph1" },
+ { CHF_DIPH2, "elmorph:diph2" }
+};
+
+SCM_DEFINE_PUBLIC(scm_utf8_toupper, "utf8-toupper", 1, 0, 0,
+ (SCM string),
+"Convert STRING to uppercase\n")
+#define FUNC_NAME s_scm_utf8_toupper
+{
+ char *str;
+ SCM scm;
+
+ SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME);
+ str = scm_to_locale_string(string);
+ if (utf8_toupper(str, strlen(str)))
+ scm_misc_error(FUNC_NAME,
+ "cannot convert to upper case: ~S",
+ scm_list_1(string));
+ scm = scm_from_locale_string(str);
+ free(str);
+ return scm;
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_utf8_tolower, "utf8-tolower", 1, 0, 0,
+ (SCM string),
+"Convert STRING to lowercase\n")
+#define FUNC_NAME s_scm_utf8_tolower
+{
+ char *str;
+ SCM scm;
+
+ SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME);
+ str = scm_to_locale_string(string);
+ if (utf8_tolower(str, strlen(str)))
+ scm_misc_error(FUNC_NAME,
+ "cannot convert to lower case: ~S",
+ scm_list_1(string));
+ scm = scm_from_locale_string(str);
+ free(str);
+ return scm;
+}
+#undef FUNC_NAME
+
+static SCM
+_elstr_thema_aoristoy(SCM el, int destructive, const char *func_name)
+{
+ struct elstr *elstr;
+ SCM scm;
+ unsigned *wc;
+ size_t wclen;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
+ elstr = (struct elstr*) SCM_CDR(el);
+ if (destructive)
+ scm = SCM_UNSPECIFIED;
+ else {
+ scm = _elstr_dup(elstr);
+ elstr = (struct elstr*) SCM_CDR(scm);
+ }
+ if (elmorph_thema_aoristoy(elstr->str, elstr->len, &wc, &wclen))
+ scm_memory_error(func_name);
+ free(elstr->str);
+ elstr->str = wc;
+ elstr->len = wclen;
+ return scm;
+}
+
+SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy, "elstr-thema-aoristoy", 1, 0, 0,
+ (SCM thema),
+"Convert THEMA, which must be a root of present. to an aorist root\n")
+#define FUNC_NAME s_scm_elstr_thema_aoristoy
+{
+ return _elstr_thema_aoristoy(thema, 0, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0,
+ (SCM thema),
+"Convert THEMA, which must be a root of present. to an aorist root (destructive)\n")
+#define FUNC_NAME s_scm_elstr_thema_aoristoy_x
+{
+ return _elstr_thema_aoristoy(thema, 1, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+
+void
+scm_init_ellinika_elmorph_module()
+{
+ int i;
+
+ _elstr_init();
+ for (i = 0; i < sizeof(deftab)/sizeof(deftab[0]); i++) {
+ scm_c_define(deftab[i].sym, scm_from_uint(deftab[i].val));
+ scm_c_export(deftab[i].sym, NULL);
+ }
+#include "elmorph.x"
+}
diff --git a/src/ellinika/elmorph.h b/src/ellinika/elmorph.h
new file mode 100644
index 0000000..d91f513
--- /dev/null
+++ b/src/ellinika/elmorph.h
@@ -0,0 +1,46 @@
+#define CHF_OXEIA 1
+#define CHF_PERISPWMENH 2
+#define CHF_BAREIA 3
+
+#define CHF_ACCENT_MASK 0x000f
+
+#define CHF_TREMA 0x0010
+
+#define CHF_VOWEL 0x00020
+#define CHF_CONSONANT 0x00040
+#define CHF_SEMIVOWEL 0x00080
+#define CHF_PUNCT 0x00100
+#define CHF_SYMBOL 0x00200
+#define CHF_MODIFIER 0x00400
+#define CHF_ARCHAIC 0x00800
+#define CHF_LOWER 0x01000
+#define CHF_UPPER 0x02000
+#define CHF_NUMERIC 0x04000
+
+#define CHF_DIPH1 0x10000
+#define CHF_DIPH2 0x20000
+
+int elchr_flags(unsigned ch);
+int elchr_isupper(unsigned ch);
+int elchr_islower(unsigned ch);
+int elchr_getaccent(unsigned ch);
+int elchr_istrema(unsigned ch);
+int elchr_isvowel(unsigned ch);
+int elchr_isconsonant(unsigned ch);
+int elchr_issemivowel(unsigned ch);
+int elchr_ispunct(unsigned ch);
+int elchr_issymbol(unsigned ch);
+int elchr_ismodifier(unsigned ch);
+int elchr_isarchaic(unsigned ch);
+int elchr_isnumeric(unsigned ch);
+unsigned elchr_numeric_value(unsigned ch);
+unsigned elchr_toupper(unsigned ch);
+unsigned elchr_tolower(unsigned ch);
+unsigned elchr_base(unsigned ch);
+unsigned elchr_deaccent(unsigned ch);
+unsigned elchr_accent(unsigned ch, int acc);
+int elchr_diphthong(unsigned ch, int state);
+
+
+int elmorph_thema_aoristoy(unsigned *word, size_t len,
+ unsigned **thema, size_t *tlen);
diff --git a/src/ellinika/elmorph.scm4 b/src/ell