From a1a5b7ddd6c3c0532c37551b24fd573a554ac181 Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Fri, 10 Jun 2011 23:04:53 +0300 Subject: Fix syllabification. * configure.ac: Add AC_PROG_YACC. * src/ellinika/phoneme.y: New file. * src/ellinika/yyrename: New file. * src/ellinika/syllabificator.c: New file. * src/ellinika/.gitignore: Update. * src/ellinika/elchr.c (char_info_st): Move to header. (el_basic_ctype): (elchr_info): Remove static qualifier. Return a pointer to const. (elchr_letter,elchr_phoneme): New functions. (elchr_diphthong): Remove. * src/ellinika/elmorph.c (elstr): New members. (_elstr_syllabize): Rewrite. (invalidate_maps): New static function. (_elstr_alloc): Initialize new fields, take function name as argument, for diagnostic purposes. (_elstr_print): Rewrite. (deftab): Update. (elstr-syllable-prop,elstr-syllable) (_elstr_set_accent,_elstr_set_accent_on_char): Rewrite. (elstr-char-phoneme,elstr->phonetic-map): New functions. * src/ellinika/elmorph.h (CHF_DIPH1,CHF_DIPH2): Remove. (CHF_DIPHTHONG): New flag. (PHON_.*): New constants. (phoneme,syllable): New structures. (char_info_st): New members. (elchr_info,elchr_letter) (elchr_phoneme,phoneme_map) (syllable_map): New protos. (elchr_diphthong): Remove protos. * src/ellinika/elmorph.scm4: Move public definitions to elmorph-public.scm; include it here. * src/ellinika/xlat.scm (ellinika:sounds-like): Rewrite as a wrapper over elstr->soundslike. Describe Milesian numbers. * style.css (img.ellinika-img): New class. * xml/lingua.conf.in (IMAGE): New tag. * xml/pl/alfabhta.xml: Describe Milesian numbers. Various fixes. * data/dbverb.struct: Fix a typo in flection. Use 'sub' theme for pas/sub/aor. * data/irregular-verbs.xml: Add more verbs. * scm/conjugator.scm: Various fixes. * scm/verbop.scm: Accept empty mood and voice declarations. 
--- src/ellinika/elmorph.c | 308 +++++++++++++++++++++++++++++++------------------ 1 file changed, 196 insertions(+), 112 deletions(-) (limited to 'src/ellinika/elmorph.c') diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c index 1831610..5a8acdf 100644 --- a/src/ellinika/elmorph.c +++ b/src/ellinika/elmorph.c @@ -26,55 +26,63 @@ struct elstr { unsigned *str; /* UTF-8 string */ size_t len; /* Its length */ + + struct phoneme *phoneme; /* Phonetical map*/ + unsigned phoneme_count; /* Number of phonemes */ + + struct syllable *sylmap; /* Syllable map (nsyl elements) */ unsigned nsyl; /* Number of syllables. */ - unsigned *sylmap; /* Syllable map (nsyl elements) */ unsigned acc_syl; /* Number of the accented syllable (1-based, from the last syllable) */ unsigned acc_pos; /* Number of the accented character (0-based, from str[0]) */ + }; scm_t_bits _elstr_tag; static void -_elstr_syllabize(struct elstr *elstr) -{ - unsigned *sylmap; - unsigned i, nsyl = 0, accchr = 0; - int accsyl = -1; - int dstate = 0; - int acc = 0; - - if (!elstr->sylmap) { - elstr->sylmap = calloc(elstr->len, sizeof(sylmap[0])); - if (!elstr->sylmap) - scm_memory_error("_elstr_syllabize"); - } - sylmap = elstr->sylmap; +_elstr_syllabize(struct elstr *elstr, const char *func_name) +{ + unsigned i; + + free(elstr->phoneme); + free(elstr->sylmap); - for (i = 0; i < elstr->len; i++) { - int nstate; - - if (elchr_getaccent(elstr->str[i])) { - accsyl = nsyl; - accchr = i; - } - nstate = elchr_diphthong(elstr->str[i], dstate); - if (nstate) - /* skip */; - else if (dstate) - sylmap[nsyl++] = i - 1; - else if (elchr_isvowel(elstr->str[i])) - sylmap[nsyl++] = i; - dstate = nstate; + if (phoneme_map(&elstr->phoneme, &elstr->phoneme_count, + elstr->str, elstr->len)) + scm_misc_error(func_name, + "cannot create phonetic map: ~S", + scm_from_int(errno)); + + if (syllable_map(&elstr->sylmap, &elstr->nsyl, + elstr->phoneme, elstr->phoneme_count)) + scm_misc_error(func_name, + "cannot create syllable 
map: ~S", + scm_from_int(errno)); + + for (i = elstr->nsyl; i > 0; i--) { + if (elstr->sylmap[elstr->nsyl - i].flags & CHF_ACCENT_MASK) + break; } - if (dstate) - sylmap[nsyl++] = i - 1; - else if (nsyl) - sylmap[nsyl-1] = i - 1; - elstr->nsyl = nsyl; - elstr->acc_pos = accchr; - elstr->acc_syl = (accsyl >= 0) ? nsyl - accsyl : 0; + elstr->acc_syl = i; + for (i = 0; i < elstr->len; i++) + if (elchr_getaccent(elstr->str[i])) + break; + elstr->acc_pos = i; +} + +static void +invalidate_maps(struct elstr *elstr) +{ + free(elstr->sylmap); + elstr->sylmap = NULL; + elstr->nsyl = 0; + free(elstr->phoneme); + elstr->phoneme = NULL; + elstr->phoneme_count = 0; + elstr->acc_pos = 0; + elstr->acc_syl = 0; } static SCM @@ -89,7 +97,7 @@ _elstr_alloc_empty(struct elstr **pelstr) } static SCM -_elstr_alloc(const char *instr, int syl) +_elstr_alloc(const char *instr, int syl, const char *func_name) { struct elstr *elstr; unsigned *wptr; @@ -105,8 +113,10 @@ _elstr_alloc(const char *instr, int syl) elstr->nsyl = 0; elstr->acc_syl = 0; elstr->acc_pos = 0; + elstr->phoneme = 0; + elstr->phoneme_count = 0; if (syl) - _elstr_syllabize(elstr); + _elstr_syllabize(elstr, func_name); SCM_RETURN_NEWSMOB(_elstr_tag, elstr); } @@ -120,19 +130,34 @@ _elstr_dup(struct elstr *elstr) elnew->str = calloc(elstr->len, sizeof(elnew->str[0])); if (!elnew->str) scm_memory_error("_elstr_dup"); + memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len); + elnew->len = elstr->len; + + if (elstr->phoneme) { + elnew->phoneme = calloc(elstr->phoneme_count, + sizeof(elnew->phoneme[0])); + if (!elnew->phoneme) { + free(elnew->str); + scm_memory_error("_elstr_dup"); + } + memcpy(elnew->phoneme, elstr->phoneme, + sizeof(elstr->phoneme[0]) * elstr->phoneme_count); + } else + elnew->phoneme = NULL; + elnew->phoneme_count = elstr->phoneme_count; + if (elstr->sylmap) { elnew->sylmap = calloc(elstr->nsyl, sizeof(elnew->sylmap[0])); if (!elnew->sylmap) { free(elnew->str); scm_memory_error("_elstr_dup"); } 
+ memcpy(elnew->sylmap, elstr->sylmap, + sizeof(elstr->sylmap[0]) * elstr->nsyl); } else elnew->sylmap = NULL; - memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len); - elnew->len = elstr->len; elnew->nsyl = elstr->nsyl; - memcpy(elnew->sylmap, elstr->sylmap, - sizeof(elstr->sylmap[0]) * elstr->nsyl); + elnew->acc_syl = elstr->acc_syl; elnew->acc_pos = elstr->acc_pos; SCM_RETURN_NEWSMOB(_elstr_tag, elnew); @@ -162,6 +187,7 @@ _elstr_free(SCM smob) struct elstr *elstr = (struct elstr *) SCM_CDR(smob); free(elstr->str); free(elstr->sylmap); + free(elstr->phoneme); scm_gc_free(elstr, sizeof(struct elstr), "elstr"); return 0; } @@ -170,34 +196,31 @@ static int _elstr_print(SCM smob, SCM port, scm_print_state *pstate) { struct elstr *elstr = (struct elstr *) SCM_CDR(smob); - int i, j, an; + int i, j; char *s; scm_puts("#sylmap) { scm_puts("``", port); - an = elstr->nsyl - elstr->acc_syl; - if (an == 0) - scm_puts("[", port); - for (i = j = 0; i < elstr->len; i++) { - char r[6]; - int n; - - if (i == elstr->sylmap[j] + 1) { - if (j == an) - scm_puts("]", port); + for (i = 0; i < elstr->nsyl; i++) { + size_t start = elstr->sylmap[i].char_start; + if (i) scm_puts("-", port); - if (++j == an) - scm_puts("[", port); + if (elstr->sylmap[i].flags & CHF_ACCENT_MASK) + scm_puts("[", port); + for (j = 0; j < elstr->sylmap[i].char_count; j++) { + char r[6]; + int n; + + n = utf8_wctomb(r, elstr->str[start+j]); + if (n == -1) + continue; + r[n] = 0; + scm_puts(r, port); } - n = utf8_wctomb(r, elstr->str[i]); - if (n == -1) - continue; - r[n] = 0; - scm_puts(r, port); + if (elstr->sylmap[i].flags & CHF_ACCENT_MASK) + scm_puts("]", port); } - if (j == an) - scm_puts("]", port); } else { scm_puts("[NS] ``", port); for (i = j = 0; i < elstr->len; i++) { @@ -238,7 +261,7 @@ force_elstr(struct elstr **ep, SCM scm, int sylopt, SCM_ASSERT(scm_is_string(scm), scm, arg, func_name); str = scm_to_locale_string(scm); - newscm = _elstr_alloc(str, sylopt); + newscm = 
_elstr_alloc(str, sylopt, func_name); free(str); if (newscm == SCM_EOL) scm_misc_error(func_name, @@ -336,13 +359,10 @@ SCM_DEFINE_PUBLIC(scm_elstr_syllable_prop, "elstr-syllable-prop", "cannot get syllable #~S: not enough syllables: ~S", scm_list_2(el, n)); num = elstr->nsyl - num; - if (num == 0) - start = 0; - else - start = elstr->sylmap[num - 1] + 1; - return scm_cons(scm_from_uint(start), - scm_from_uint(elstr->sylmap[num])); + return scm_list_3(scm_from_uint(elstr->sylmap[num].char_start), + scm_from_uint(elstr->sylmap[num].char_count), + scm_from_int(elstr->sylmap[num].flags)); } #undef FUNC_NAME @@ -388,12 +408,8 @@ SCM_DEFINE_PUBLIC(scm_elstr_syllable, "elstr-syllable", "cannot get syllable #~S: not enough syllables: ~S", scm_list_2(el, n)); num = elstr->nsyl - num; - if (num == 0) - start = 0; - else - start = elstr->sylmap[num - 1] + 1; - if (utf8_wc_to_mbstr(elstr->str + start, - elstr->sylmap[num] - start + 1, + if (utf8_wc_to_mbstr(elstr->str + elstr->sylmap[num].char_start, + elstr->sylmap[num].char_count, &s)) scm_misc_error(FUNC_NAME, "cannot convert elstr to Scheme", @@ -514,8 +530,7 @@ _elstr_deaccent(SCM el, int destructive, const char *func_name) } for (i = 0; i < elstr->len; i++) elstr->str[i] = elchr_deaccent(elstr->str[i]); - elstr->acc_pos = 0; - elstr->acc_syl = 0; + invalidate_maps(elstr); return scm; } @@ -544,9 +559,10 @@ _elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name) { struct elstr *elstr; unsigned i; - unsigned acc_num, num, len, start; + unsigned acc_num, num, start; SCM scm; - int dstate; + unsigned pos; + struct phoneme *phoneme = NULL; if (destructive) { SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); @@ -556,15 +572,11 @@ _elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name) SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name); num = scm_to_uint(n); - if (num > elstr->nsyl) + if (num == 0 | num > elstr->nsyl) scm_misc_error(func_name, "cannot set accent on syllable #~S: not 
enough syllables: ~S", scm_list_2(n, el)); acc_num = elstr->nsyl - num; - if (acc_num == 0) - start = 0; - else - start = elstr->sylmap[acc_num - 1] + 1; if (destructive) scm = SCM_UNSPECIFIED; @@ -576,25 +588,38 @@ _elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name) /* Clear all accents */ for (i = 0; i < elstr->len; i++) elstr->str[i] = elchr_deaccent(elstr->str[i]); - len = elstr->sylmap[acc_num] - start + 1; - dstate = 0; - for (i = start; i <= start + len; i++) { - int nstate; - - if (!elchr_isvowel(elstr->str[i])) { - if (dstate) { - --i; - break; - } - continue; - } - nstate = elchr_diphthong(elstr->str[i], dstate); - if (!nstate) + for (i = 0; i < elstr->nsyl; i++) + elstr->sylmap[i].flags &= ~CHF_ACCENT_MASK; + for (i = 0; i < elstr->phoneme_count; i++) + elstr->phoneme[i].flags &= ~CHF_ACCENT_MASK; + + start = elstr->sylmap[acc_num].phoneme_start; + pos = 0; + for (i = 0; i < elstr->sylmap[acc_num].phoneme_count; i++) { + struct phoneme *ph = elstr->phoneme + start + i; + if (ph->flags & CHF_CONSONANT) + /* skip */ ; + else if (ph->flags & CHF_DIPHTHONG) { + phoneme = ph; + pos = ph->start + 1; + break; + } else if (ph->flags & CHF_VOWEL) { + phoneme = ph; + pos = ph->start; break; - dstate = nstate; + } } - elstr->str[i] = elchr_accent(elstr->str[i], CHF_OXEIA); + if (!phoneme) + scm_misc_error(func_name, + "cannot set accent on syllable #~S of ~S: " + "INTERNAL ERROR", + scm_list_2(n, el)); + phoneme->flags |= CHF_OXEIA; + elstr->sylmap[acc_num].flags |= CHF_OXEIA; + elstr->str[pos] = elchr_accent(elstr->str[pos], CHF_OXEIA); + elstr->acc_syl = num; + elstr->acc_pos = pos; return scm; } @@ -652,7 +677,8 @@ _elstr_set_accent_on_char(SCM el, SCM n, int destructive, const char *func_name) elstr->str[i] = elchr_deaccent(elstr->str[i]); elstr->str[num] = elchr_accent(elstr->str[num], CHF_OXEIA); - _elstr_syllabize(elstr); + invalidate_maps(elstr); + _elstr_syllabize(elstr, func_name); return scm; } @@ -716,11 +742,31 @@ static struct 
deftab { { CHF_LOWER, "elmorph:lower" }, { CHF_UPPER, "elmorph:upper" }, { CHF_NUMERIC, "elmorph:numeric" }, - - { CHF_DIPH1, "elmorph:diph1" }, - { CHF_DIPH2, "elmorph:diph2" } + { CHF_DIPHTHONG, "elmorph:diphthong" }, }; - + +SCM_DEFINE_PUBLIC(scm_elstr_char_phoneme, "elstr-char-phoneme", + 2, 0, 0, + (SCM el, SCM n), +"Returns a phoneme code of the Nth char in EL\n") +#define FUNC_NAME s_scm_elstr_char_phoneme +{ + struct elstr *elstr; + int num; + + force_elstr(&elstr, el, 0, SCM_ARG1, FUNC_NAME); + SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); + num = scm_to_int(n); + if (num < 0) + num += elstr->len; + if (num >= elstr->len) + scm_misc_error(FUNC_NAME, + "cannot get character #~S: not enough characters: ~S", + scm_list_2(el, n)); + return scm_from_uint(elchr_phoneme(elstr->str[num])); +} +#undef FUNC_NAME + SCM_DEFINE_PUBLIC(scm_utf8_toupper, "utf8-toupper", 1, 0, 0, (SCM string), "Convert STRING to uppercase\n") @@ -818,7 +864,8 @@ _elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name) SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); elstr = (struct elstr*) SCM_CDR(el); } else - scm = force_elstr(&elstr, el, 1, SCM_ARG1, func_name); + scm = force_elstr(&elstr, el, 0, SCM_ARG1, func_name); + invalidate_maps(elstr); SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name); SCM_ASSERT(scm_is_integer(l), l, SCM_ARG3, func_name); num = scm_to_int(n); @@ -842,7 +889,7 @@ _elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name) memmove(elstr->str, elstr->str + num, sizeof(elstr->str[0]) * len); elstr->len = len; - _elstr_syllabize(elstr); + _elstr_syllabize(elstr, func_name); return scm; } @@ -869,7 +916,7 @@ SCM_DEFINE_PUBLIC(scm_elstr_slice_x, "elstr-slice!", SCM_DEFINE_PUBLIC(scm_elstr_index, "elstr-index", 2, 0, 0, (SCM word, SCM needle), -"") +"Returns position of NEEDLE in the WORD") #define FUNC_NAME s_scm_elstr_index { struct elstr *elstr, *ep; @@ -920,10 +967,10 @@ SCM_DEFINE_PUBLIC(scm_elstr_suffix_p, 
"elstr-suffix?", SCM_DEFINE_PUBLIC(scm_elstr_append, "elstr-append", 0, 0, 1, (SCM rest), -"") +"Concatenates arguments.\n") #define FUNC_NAME s_scm_elstr_append { - SCM ret = _elstr_alloc("", 0); + SCM ret = _elstr_alloc("", 0, FUNC_NAME); struct elstr *elstr = (struct elstr*) SCM_CDR(ret); for (; !scm_is_null(rest); rest = SCM_CDR(rest)) { @@ -933,11 +980,48 @@ SCM_DEFINE_PUBLIC(scm_elstr_append, "elstr-append", force_elstr(&elt, val, 0, SCM_ARGn, FUNC_NAME); _elstr_concat(elstr, elt, FUNC_NAME); } - _elstr_syllabize(elstr); + _elstr_syllabize(elstr, FUNC_NAME); return ret; } #undef FUNC_NAME + +static SCM +elmorph_scm_from_phoneme(struct phoneme *phoneme) +{ + return scm_list_4(scm_from_int(phoneme->code), + scm_from_uint(phoneme->start), + scm_from_uint(phoneme->count), + scm_from_bool(phoneme->flags)); +} +SCM_DEFINE_PUBLIC(scm_elstr__phonetic_map, "elstr->phonetic-map", + 1, 0, 0, + (SCM word), +"Converts WORD to a phonetic map.\n") +#define FUNC_NAME s_scm_elstr__phonetic_map +{ + struct elstr *elstr; + struct phoneme *phmap; + size_t phlen, i; + SCM head = SCM_EOL, tail = SCM_EOL; + + force_elstr(&elstr, word, 1, SCM_ARG1, FUNC_NAME); + phmap = elstr->phoneme; + phlen = elstr->phoneme_count; + for (i = 0; i < phlen; i++) { + SCM elt = scm_cons(elmorph_scm_from_phoneme(phmap + i), + SCM_EOL); + if (scm_is_null(head)) + head = tail = elt; + else { + SCM_SETCDR(tail, elt); + tail = elt; + } + } + free(phmap); + return head; +} +#undef FUNC_NAME void scm_init_ellinika_elmorph_module() -- cgit v1.2.1