diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2011-06-04 10:27:59 +0000 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2011-06-04 10:27:59 +0000 |
commit | ceb837f01112d2cfde96ba9e6ddc9c9ccbd0d0a4 (patch) | |
tree | 316933c27051392c5cd48b873ae0697cd389d52a /src/ellinika/elmorph.c | |
parent | 99076de629a6f5f2b654118cde3612f9ba05edf0 (diff) | |
download | ellinika-ceb837f01112d2cfde96ba9e6ddc9c9ccbd0d0a4.tar.gz ellinika-ceb837f01112d2cfde96ba9e6ddc9c9ccbd0d0a4.tar.bz2 |
Implement new morphological functions. Move elmorph to scm/ellinika
git-svn-id: file:///home/puszcza/svnroot/ellinika/trunk@554 941c8c0f-9102-463b-b60b-cd22ce0e6858
Diffstat (limited to 'src/ellinika/elmorph.c')
-rw-r--r-- | src/ellinika/elmorph.c | 655 |
1 files changed, 655 insertions, 0 deletions
diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c new file mode 100644 index 0000000..5234eda --- /dev/null +++ b/src/ellinika/elmorph.c @@ -0,0 +1,655 @@ +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif +#include <errno.h> +#include <stdlib.h> +#include <libguile.h> +#include "utf8.h" +#include "elmorph.h" + +struct elstr { + unsigned *str; /* UTF-8 string */ + size_t len; /* Its length */ + unsigned nsyl; /* Number of syllables. */ + unsigned *sylmap; /* Syllable map (nsyl elements) */ + unsigned acc_syl; /* Number of the accented syllable + (1-based, from the last syllable) */ + unsigned acc_pos; /* Number of the accented character + (0-based, from str[0]) */ +}; + +scm_t_bits _elstr_tag; + +static void +_elstr_syllabize(struct elstr *elstr) +{ + unsigned *sylmap; + unsigned i, nsyl = 0, accsyl = 0, accchr = 0; + int dstate = 0; + int acc = 0; + + sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, "syllable map"); + + for (i = 0; i < elstr->len; i++) { + int nstate; + + if (elchr_getaccent(elstr->str[i])) { + accsyl = nsyl; + accchr = i; + } + nstate = elchr_diphthong(elstr->str[i], dstate); + if (nstate) + /* skip */; + else if (dstate) + sylmap[nsyl++] = i - 1; + else if (elchr_isvowel(elstr->str[i])) + sylmap[nsyl++] = i; + dstate = nstate; + } + if (dstate) + sylmap[nsyl++] = i - 1; + else + sylmap[nsyl-1] = i - 1; + elstr->sylmap = sylmap; + elstr->nsyl = nsyl; + elstr->acc_pos = accchr; + elstr->acc_syl = nsyl - accsyl; +} + +static SCM +_elstr_alloc(const char *instr) +{ + struct elstr *elstr; + unsigned *wptr; + size_t wlen; + + if (utf8_mbstr_to_wc(instr, &wptr, &wlen)) + return SCM_EOL; + + elstr = scm_gc_malloc(sizeof(*elstr), "Elstr"); + elstr->str = wptr; + elstr->len = wlen; + + _elstr_syllabize(elstr); + + SCM_RETURN_NEWSMOB(_elstr_tag, elstr); +} + +static SCM +_elstr_dup(struct elstr *elstr) +{ + struct elstr *elnew; + + elnew = scm_gc_malloc(sizeof(*elstr), "Elstr"); + elnew->str = calloc(elstr->len, sizeof(elnew->str[0])); + if (!elnew->str) + scm_memory_error("_elstr_dup"); + elnew->sylmap = calloc(elstr->nsyl, sizeof(elnew->sylmap[0])); + if (!elnew->sylmap) { + free(elnew->str); + scm_memory_error("_elstr_dup"); + } + memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len); + elnew->len = elstr->len; + elnew->nsyl = elstr->nsyl; + memcpy(elnew->sylmap, elstr->sylmap, + sizeof(elstr->sylmap[0]) * elstr->nsyl); + elnew->acc_syl = elstr->acc_syl; + elnew->acc_pos = elstr->acc_pos; + SCM_RETURN_NEWSMOB(_elstr_tag, elnew); +} + +static scm_sizet +_elstr_free(SCM smob) +{ + struct elstr *elstr = (struct elstr *) SCM_CDR(smob); + free(elstr->str); + free(elstr->sylmap); + free(elstr); + return 0; +} + +static int +_elstr_print(SCM smob, SCM port, scm_print_state *pstate) +{ + struct elstr *elstr = (struct elstr *) SCM_CDR(smob); + int i, j, an; + char *s; + + scm_puts("#<elstr ``", port); + an = elstr->nsyl - elstr->acc_syl; + if (an == 0) + scm_puts("[", port); + for (i = j = 0; i < elstr->len; i++) { + char r[6]; + int n; + + if (i == elstr->sylmap[j] + 1) { + if (j == an) + scm_puts("]", port); + scm_puts("-", port); + if (++j == an) + scm_puts("[", port); + } + n = utf8_wctomb(r, elstr->str[i]); + if (n == -1) + continue; + r[n] = 0; + scm_puts(r, port); + } + if (j == an) + scm_puts("]", port); + scm_puts("''>", port); + return 1; +} + +static void +_elstr_init() +{ + _elstr_tag = scm_make_smob_type("Elstr", sizeof(struct elstr)); + scm_set_smob_free(_elstr_tag, _elstr_free); + scm_set_smob_print(_elstr_tag, _elstr_print); +} + +SCM_DEFINE_PUBLIC(scm_string__elstr, "string->elstr", 1, 0, 0, + (SCM string), +"Create new ELSTR from STRING\n") +#define FUNC_NAME s_scm_string__elstr +{ + char *str; + SCM scm; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + str = scm_to_locale_string(string); + scm = _elstr_alloc(str); + free(str); + if (scm == SCM_EOL) + scm_misc_error(FUNC_NAME, + "Invalid input string: ~S", + scm_list_1(string)); + return scm; +} +#undef FUNC_NAME + +#define scm_is_elstr(s) (!SCM_IMP(s) && SCM_CELL_TYPE(s) == _elstr_tag) + +SCM_DEFINE_PUBLIC(scm_elstr__string, "elstr->string", 1, 0, 0, + (SCM el), +"Convert EL to a STRING\n") +#define FUNC_NAME s_scm_elstr__string +{ + struct elstr *elstr; + char *s; + SCM scm; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + if (utf8_wc_to_mbstr(elstr->str, elstr->len, &s)) + scm_misc_error(FUNC_NAME, + "cannot convert elstr to Scheme", + SCM_EOL); + scm = scm_from_locale_string(s); + free(s); + return scm; +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_length, "elstr-length", 1, 0, 0, + (SCM el), +"Returns the number of characters in EL\n") +#define FUNC_NAME s_scm_elstr_length +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->len); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_number_of_syllables, "elstr-number-of-syllables", + 1, 0, 0, + (SCM el), +"Returns the number of characters in EL\n") +#define FUNC_NAME s_scm_elstr_number_of_syllables +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->nsyl); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_syllable_prop, "elstr-syllable-prop", + 2, 0, 0, + (SCM el, SCM n), +"Returns properties of the syllable N in EL\n") +#define FUNC_NAME s_scm_elstr_syllable_prop +{ + struct elstr *elstr; + unsigned num, start; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); + num = scm_to_uint(n); + if (num > elstr->nsyl) + scm_misc_error(FUNC_NAME, + "cannot get syllable #~S: not enough syllables: ~S", + scm_list_2(el, n)); + num = elstr->nsyl - num; + if (num == 0) + start = 0; + else + start = elstr->sylmap[num - 1] + 1; + + return scm_cons(scm_from_uint(start), + scm_from_uint(elstr->sylmap[num])); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_accent_position, "elstr-accent-position", 1, 0, 0, + (SCM el), +"Return position of the accented character in EL\n") +#define FUNC_NAME s_scm_elstr_accent_position +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->acc_pos); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_accented_syllable, "elstr-accented-syllable", + 1, 0, 0, + (SCM el), +"Return position of the accented syllable in EL\n") +#define FUNC_NAME s_scm_elstr_accented_syllable +{ + struct elstr *elstr; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + return scm_from_uint(elstr->acc_syl); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_syllable, "elstr-syllable", + 2, 0, 0, + (SCM el, SCM n), +"Return Nth syllable in EL\n") +#define FUNC_NAME s_scm_elstr_accented_syllable +{ + struct elstr *elstr; + char *s; + SCM scm; + unsigned num, start; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); + num = scm_to_uint(n); + if (num > elstr->nsyl) + scm_misc_error(FUNC_NAME, + "cannot get syllable #~S: not enough syllables: ~S", + scm_list_2(el, n)); + num = elstr->nsyl - num; + if (num == 0) + start = 0; + else + start = elstr->sylmap[num - 1] + 1; + if (utf8_wc_to_mbstr(elstr->str + start, + elstr->sylmap[num] - start + 1, + &s)) + scm_misc_error(FUNC_NAME, + "cannot convert elstr to Scheme", + SCM_EOL); + scm = scm_from_locale_string(s); + free(s); + return scm; +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_character, "elstr-character", + 2, 0, 0, + (SCM el, SCM n), +"Return Nth character in EL\n") +#define FUNC_NAME s_scm_elstr_character +{ + struct elstr *elstr; + unsigned num; + char r[6]; + int len; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); + num = scm_to_uint(n); + if (num >= elstr->len) + scm_misc_error(FUNC_NAME, + "cannot get character #~S: not enough characters: ~S", + scm_list_2(el, n)); + len = utf8_wctomb(r, elstr->str[num]); + if (len <= 0) + scm_misc_error(FUNC_NAME, + "cannot convert elchr to Scheme", + SCM_EOL); + r[len] = 0; + return scm_from_locale_string(r); +} +#undef FUNC_NAME + +static SCM +_elstr_chgcase(SCM el, void (*chgfun)(unsigned *, size_t), + int destructive, const char *func_name) +{ + struct elstr *elstr; + SCM scm; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + elstr = (struct elstr*) SCM_CDR(el); + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + chgfun(elstr->str, elstr->len); + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_toupper, "elstr-toupper", + 1, 0, 0, + (SCM el), +"Convert EL to upper case\n") +#define FUNC_NAME s_scm_elstr_toupper +{ + return _elstr_chgcase(el, utf8_wc_strnupper, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_tolower, "elstr-tolower", + 1, 0, 0, + (SCM el), +"Convert EL to lower case\n") +#define FUNC_NAME s_scm_elstr_tolower +{ + return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_toupper_x, "elstr-toupper!", + 1, 0, 0, + (SCM el), +"Convert EL to upper case (destructive)\n") +#define FUNC_NAME s_scm_elstr_toupper_x +{ + return _elstr_chgcase(el, utf8_wc_strnupper, 1, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_tolower_x, "elstr-tolower!", + 1, 0, 0, + (SCM el), +"Convert EL to lower case (destructive)\n") +#define FUNC_NAME s_scm_elstr_tolower_x +{ + return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME); +} +#undef FUNC_NAME + +static SCM +_elstr_deaccent(SCM el, int destructive, const char *func_name) +{ + struct elstr *elstr; + unsigned i; + SCM scm; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + elstr = (struct elstr*) SCM_CDR(el); + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + for (i = 0; i < elstr->len; i++) + elstr->str[i] = elchr_deaccent(elstr->str[i]); + elstr->acc_pos = 0; + elstr->acc_syl = 0; + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_deaccent, "elstr-deaccent", + 1, 0, 0, + (SCM el), +"Remove all accents from EL\n") +#define FUNC_NAME s_scm_elstr_deaccent +{ + return _elstr_deaccent(el, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_deaccent_x, "elstr-deaccent!", + 1, 0, 0, + (SCM el), +"Remove all accents from EL (desctructive)\n") +#define FUNC_NAME s_scm_elstr_deaccent_x +{ + return _elstr_deaccent(el, 1, FUNC_NAME); +} +#undef FUNC_NAME + +static SCM +_elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name) +{ + struct elstr *elstr; + unsigned i; + unsigned acc_num, num, len, start; + SCM scm; + int dstate; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name); + elstr = (struct elstr*) SCM_CDR(el); + num = scm_to_uint(n); + if (num > elstr->nsyl) + scm_misc_error(func_name, + "cannot get syllable #~S: not enough syllables: ~S", + scm_list_2(el, n)); + acc_num = elstr->nsyl - num; + if (acc_num == 0) + start = 0; + else + start = elstr->sylmap[acc_num - 1] + 1; + + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + + /* Clear all accents */ + for (i = 0; i < elstr->len; i++) + elstr->str[i] = elchr_deaccent(elstr->str[i]); + len = elstr->sylmap[acc_num] - start + 1; + dstate = 0; + for (i = start; i <= start + len; i++) { + int nstate; + + if (!elchr_isvowel(elstr->str[i])) { + if (dstate) { + --i; + break; + } + continue; + } + nstate = elchr_diphthong(elstr->str[i], dstate); + if (!nstate) + break; + dstate = nstate; + } + elstr->str[i] = elchr_accent(elstr->str[i], CHF_OXEIA); + elstr->acc_syl = num; + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_set_accent, "elstr-set-accent", + 2, 0, 0, + (SCM el, SCM n), +"Set accent on Nth syllable of EL\n") +{ + return _elstr_set_accent(el, n, 0, s_scm_elstr_set_accent); +} + +SCM_DEFINE_PUBLIC(scm_elstr_set_accent_x, "elstr-set-accent!", + 2, 0, 0, + (SCM el, SCM n), +"Set accent on Nth syllable of EL (destructive)\n") +{ + return _elstr_set_accent(el, n, 1, s_scm_elstr_set_accent_x); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask", + 2, 0, 0, + (SCM el, SCM n), +"Returns properties of the Nth char in EL, as a bitmask\n") +#define FUNC_NAME s_scm_elstr_char_prop_bitmask +{ + struct elstr *elstr; + unsigned num; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); + elstr = (struct elstr*) SCM_CDR(el); + num = scm_to_uint(n); + if (num >= elstr->len) + scm_misc_error(FUNC_NAME, + "cannot get character #~S: not enough characters: ~S", + scm_list_2(el, n)); + return scm_from_uint(elchr_flags(elstr->str[num])); +} +#undef FUNC_NAME + +static struct deftab { + unsigned val; + char *sym; +} deftab[] = { + { CHF_OXEIA, "elmorph:oxeia" }, + { CHF_PERISPWMENH, "elmorph:perispwmenh" }, + { CHF_BAREIA, "elmorph:bareia" }, + { CHF_ACCENT_MASK, "elmorph:accent-mask" }, + { CHF_TREMA, "elmorph:trema" }, + { CHF_VOWEL, "elmorph:vowel" }, + { CHF_CONSONANT, "elmorph:consonant" }, + { CHF_SEMIVOWEL, "elmorph:semivowel" }, + { CHF_PUNCT, "elmorph:punct" }, + { CHF_SYMBOL, "elmorph:symbol" }, + { CHF_MODIFIER, "elmorph:modifier" }, + { CHF_ARCHAIC, "elmorph:archaic" }, + { CHF_LOWER, "elmorph:lower" }, + { CHF_UPPER, "elmorph:upper" }, + { CHF_NUMERIC, "elmorph:numeric" }, + + { CHF_DIPH1, "elmorph:diph1" }, + { CHF_DIPH2, "elmorph:diph2" } +}; + +SCM_DEFINE_PUBLIC(scm_utf8_toupper, "utf8-toupper", 1, 0, 0, + (SCM string), +"Convert STRING to uppercase\n") +#define FUNC_NAME s_scm_utf8_toupper +{ + char *str; + SCM scm; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + str = scm_to_locale_string(string); + if (utf8_toupper(str, strlen(str))) + scm_misc_error(FUNC_NAME, + "cannot convert to upper case: ~S", + scm_list_1(string)); + scm = scm_from_locale_string(str); + free(str); + return scm; +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_utf8_tolower, "utf8-tolower", 1, 0, 0, + (SCM string), +"Convert STRING to lowercase\n") +#define FUNC_NAME s_scm_utf8_tolower +{ + char *str; + SCM scm; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + str = scm_to_locale_string(string); + if (utf8_tolower(str, strlen(str))) + scm_misc_error(FUNC_NAME, + "cannot convert to lower case: ~S", + scm_list_1(string)); + scm = scm_from_locale_string(str); + free(str); + return scm; +} +#undef FUNC_NAME + +static SCM +_elstr_thema_aoristoy(SCM el, int destructive, const char *func_name) +{ + struct elstr *elstr; + SCM scm; + unsigned *wc; + size_t wclen; + + SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); + elstr = (struct elstr*) SCM_CDR(el); + if (destructive) + scm = SCM_UNSPECIFIED; + else { + scm = _elstr_dup(elstr); + elstr = (struct elstr*) SCM_CDR(scm); + } + if (elmorph_thema_aoristoy(elstr->str, elstr->len, &wc, &wclen)) + scm_memory_error(func_name); + free(elstr->str); + elstr->str = wc; + elstr->len = wclen; + return scm; +} + +SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy, "elstr-thema-aoristoy", 1, 0, 0, + (SCM thema), +"Convert THEMA, which must be a root of present. to an aorist root\n") +#define FUNC_NAME s_scm_elstr_thema_aoristoy +{ + return _elstr_thema_aoristoy(thema, 0, FUNC_NAME); +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0, + (SCM thema), +"Convert THEMA, which must be a root of present. to an aorist root (destructive)\n") +#define FUNC_NAME s_scm_elstr_thema_aoristoy_x +{ + return _elstr_thema_aoristoy(thema, 1, FUNC_NAME); +} +#undef FUNC_NAME + + +void +scm_init_ellinika_elmorph_module() +{ + int i; + + _elstr_init(); + for (i = 0; i < sizeof(deftab)/sizeof(deftab[0]); i++) { + scm_c_define(deftab[i].sym, scm_from_uint(deftab[i].val)); + scm_c_export(deftab[i].sym, NULL); + } +#include "elmorph.x" +} |