aboutsummaryrefslogtreecommitdiff
path: root/src/ellinika/elmorph.c
diff options
context:
space:
mode:
author Sergey Poznyakoff <gray@gnu.org.ua> 2011-06-10 23:04:53 +0300
committer Sergey Poznyakoff <gray@gnu.org.ua> 2011-06-11 00:34:20 +0300
commit a1a5b7ddd6c3c0532c37551b24fd573a554ac181 (patch)
tree f86f3572c77dc986bb2dfb65619ac4bc35c83847 /src/ellinika/elmorph.c
parent 2bae7da012e2125762855ce014e63345ecbbbb18 (diff)
download ellinika-a1a5b7ddd6c3c0532c37551b24fd573a554ac181.tar.gz
ellinika-a1a5b7ddd6c3c0532c37551b24fd573a554ac181.tar.bz2
Fix syllabification.
* configure.ac: Add AC_PROG_YACC * src/ellinika/phoneme.y: New file. * src/ellinika/yyrename: New file. * src/ellinika/syllabificator.c: New file. * src/ellinika/.gitignore: Update. * src/ellinika/elchr.c (char_info_st): Move to header. (el_basic_ctype): (elchr_info): Remove static qualifier. Return a pointer to const. (elchr_letter,elchr_phoneme): New functions. (elchr_diphthong): Remove. * src/ellinika/elmorph.c (elstr)<phoneme,phoneme_count>: New members. (_elstr_syllabize): Rewrite. (invalidate_maps): New static function. (_elstr_alloc): Initialize new fields, take function name as argument, for diagnostic purposes. (_elstr_print): Rewrite. (deftab): Update. (elstr-syllable-prop,elstr-syllable) (_elstr_set_accent,_elstr_set_accent_on_char): Rewrite. (elstr-char-phoneme,elstr->phonetic-map): New functions. * src/ellinika/elmorph.h (CHF_DIPH1,CHF_DIPH2): Remove. (CHF_DIPHTHONG): New flag. (PHON_.*): New constants. (phoneme,syllable): New structures. (char_info_st)<letter,phoneme>: New members. (elchr_info,elchr_letter) (elchr_phoneme,phoneme_map) (syllable_map): New protos. (elchr_diphthong): Remove protos. * src/ellinika/elmorph.scm4: Move public definitions to elmorph-public.scm; include it here. * src/ellinika/xlat.scm (ellinika:sounds-like): Rewrite as a wrapper over elstr->soundslike. Describe Milesian numbers. * style.css (img.ellinika-img): New class. * xml/lingua.conf.in (IMAGE): New tag. * xml/pl/alfabhta.xml: Describe Milesian numbers. Various fixes. * data/dbverb.struct: fix a typo in flection. Use 'sub' theme for pas/sub/aor. * data/irregular-verbs.xml: Add more verbs. * scm/conjugator.scm: Various fixes. * scm/verbop.scm: Accept empty mood and voice declarations.
Diffstat (limited to 'src/ellinika/elmorph.c')
-rw-r--r-- src/ellinika/elmorph.c 308
1 files changed, 196 insertions, 112 deletions
diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c
index 1831610..5a8acdf 100644
--- a/src/ellinika/elmorph.c
+++ b/src/ellinika/elmorph.c
@@ -26,55 +26,63 @@
struct elstr {
unsigned *str; /* UTF-8 string */
size_t len; /* Its length */
+
+ struct phoneme *phoneme; /* Phonetical map*/
+ unsigned phoneme_count; /* Number of phonemes */
+
+ struct syllable *sylmap; /* Syllable map (nsyl elements) */
unsigned nsyl; /* Number of syllables. */
- unsigned *sylmap; /* Syllable map (nsyl elements) */
unsigned acc_syl; /* Number of the accented syllable
(1-based, from the last syllable) */
unsigned acc_pos; /* Number of the accented character
(0-based, from str[0]) */
+
};
scm_t_bits _elstr_tag;
static void
-_elstr_syllabize(struct elstr *elstr)
-{
- unsigned *sylmap;
- unsigned i, nsyl = 0, accchr = 0;
- int accsyl = -1;
- int dstate = 0;
- int acc = 0;
-
- if (!elstr->sylmap) {
- elstr->sylmap = calloc(elstr->len, sizeof(sylmap[0]));
- if (!elstr->sylmap)
- scm_memory_error("_elstr_syllabize");
- }
- sylmap = elstr->sylmap;
+_elstr_syllabize(struct elstr *elstr, const char *func_name)
+{
+ unsigned i;
+
+ free(elstr->phoneme);
+ free(elstr->sylmap);
- for (i = 0; i < elstr->len; i++) {
- int nstate;
-
- if (elchr_getaccent(elstr->str[i])) {
- accsyl = nsyl;
- accchr = i;
- }
- nstate = elchr_diphthong(elstr->str[i], dstate);
- if (nstate)
- /* skip */;
- else if (dstate)
- sylmap[nsyl++] = i - 1;
- else if (elchr_isvowel(elstr->str[i]))
- sylmap[nsyl++] = i;
- dstate = nstate;
+ if (phoneme_map(&elstr->phoneme, &elstr->phoneme_count,
+ elstr->str, elstr->len))
+ scm_misc_error(func_name,
+ "cannot create phonetic map: ~S",
+ scm_from_int(errno));
+
+ if (syllable_map(&elstr->sylmap, &elstr->nsyl,
+ elstr->phoneme, elstr->phoneme_count))
+ scm_misc_error(func_name,
+ "cannot create syllable map: ~S",
+ scm_from_int(errno));
+
+ for (i = elstr->nsyl; i > 0; i--) {
+ if (elstr->sylmap[elstr->nsyl - i].flags & CHF_ACCENT_MASK)
+ break;
}
- if (dstate)
- sylmap[nsyl++] = i - 1;
- else if (nsyl)
- sylmap[nsyl-1] = i - 1;
- elstr->nsyl = nsyl;
- elstr->acc_pos = accchr;
- elstr->acc_syl = (accsyl >= 0) ? nsyl - accsyl : 0;
+ elstr->acc_syl = i;
+ for (i = 0; i < elstr->len; i++)
+ if (elchr_getaccent(elstr->str[i]))
+ break;
+ elstr->acc_pos = i;
+}
+
+static void
+invalidate_maps(struct elstr *elstr)
+{
+ free(elstr->sylmap);
+ elstr->sylmap = NULL;
+ elstr->nsyl = 0;
+ free(elstr->phoneme);
+ elstr->phoneme = NULL;
+ elstr->phoneme_count = 0;
+ elstr->acc_pos = 0;
+ elstr->acc_syl = 0;
}
static SCM
@@ -89,7 +97,7 @@ _elstr_alloc_empty(struct elstr **pelstr)
}
static SCM
-_elstr_alloc(const char *instr, int syl)
+_elstr_alloc(const char *instr, int syl, const char *func_name)
{
struct elstr *elstr;
unsigned *wptr;
@@ -105,8 +113,10 @@ _elstr_alloc(const char *instr, int syl)
elstr->nsyl = 0;
elstr->acc_syl = 0;
elstr->acc_pos = 0;
+ elstr->phoneme = 0;
+ elstr->phoneme_count = 0;
if (syl)
- _elstr_syllabize(elstr);
+ _elstr_syllabize(elstr, func_name);
SCM_RETURN_NEWSMOB(_elstr_tag, elstr);
}
@@ -120,19 +130,34 @@ _elstr_dup(struct elstr *elstr)
elnew->str = calloc(elstr->len, sizeof(elnew->str[0]));
if (!elnew->str)
scm_memory_error("_elstr_dup");
+ memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len);
+ elnew->len = elstr->len;
+
+ if (elstr->phoneme) {
+ elnew->phoneme = calloc(elstr->phoneme_count,
+ sizeof(elnew->phoneme[0]));
+ if (!elnew->phoneme) {
+ free(elnew->str);
+ scm_memory_error("_elstr_dup");
+ }
+ memcpy(elnew->phoneme, elstr->phoneme,
+ sizeof(elstr->phoneme[0]) * elstr->phoneme_count);
+ } else
+ elnew->phoneme = NULL;
+ elnew->phoneme_count = elstr->phoneme_count;
+
if (elstr->sylmap) {
elnew->sylmap = calloc(elstr->nsyl, sizeof(elnew->sylmap[0]));
if (!elnew->sylmap) {
free(elnew->str);
scm_memory_error("_elstr_dup");
}
+ memcpy(elnew->sylmap, elstr->sylmap,
+ sizeof(elstr->sylmap[0]) * elstr->nsyl);
} else
elnew->sylmap = NULL;
- memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len);
- elnew->len = elstr->len;
elnew->nsyl = elstr->nsyl;
- memcpy(elnew->sylmap, elstr->sylmap,
- sizeof(elstr->sylmap[0]) * elstr->nsyl);
+
elnew->acc_syl = elstr->acc_syl;
elnew->acc_pos = elstr->acc_pos;
SCM_RETURN_NEWSMOB(_elstr_tag, elnew);
@@ -162,6 +187,7 @@ _elstr_free(SCM smob)
struct elstr *elstr = (struct elstr *) SCM_CDR(smob);
free(elstr->str);
free(elstr->sylmap);
+ free(elstr->phoneme);
scm_gc_free(elstr, sizeof(struct elstr), "elstr");
return 0;
}
@@ -170,34 +196,31 @@ static int
_elstr_print(SCM smob, SCM port, scm_print_state *pstate)
{
struct elstr *elstr = (struct elstr *) SCM_CDR(smob);
- int i, j, an;
+ int i, j;
char *s;
scm_puts("#<elstr ", port);
if (elstr->sylmap) {
scm_puts("``", port);
- an = elstr->nsyl - elstr->acc_syl;
- if (an == 0)
- scm_puts("[", port);
- for (i = j = 0; i < elstr->len; i++) {
- char r[6];
- int n;
-
- if (i == elstr->sylmap[j] + 1) {
- if (j == an)
- scm_puts("]", port);
+ for (i = 0; i < elstr->nsyl; i++) {
+ size_t start = elstr->sylmap[i].char_start;
+ if (i)
scm_puts("-", port);
- if (++j == an)
- scm_puts("[", port);
+ if (elstr->sylmap[i].flags & CHF_ACCENT_MASK)
+ scm_puts("[", port);
+ for (j = 0; j < elstr->sylmap[i].char_count; j++) {
+ char r[6];
+ int n;
+
+ n = utf8_wctomb(r, elstr->str[start+j]);
+ if (n == -1)
+ continue;
+ r[n] = 0;
+ scm_puts(r, port);
}
- n = utf8_wctomb(r, elstr->str[i]);
- if (n == -1)
- continue;
- r[n] = 0;
- scm_puts(r, port);
+ if (elstr->sylmap[i].flags & CHF_ACCENT_MASK)
+ scm_puts("]", port);
}
- if (j == an)
- scm_puts("]", port);
} else {
scm_puts("[NS] ``", port);
for (i = j = 0; i < elstr->len; i++) {
@@ -238,7 +261,7 @@ force_elstr(struct elstr **ep, SCM scm, int sylopt,
SCM_ASSERT(scm_is_string(scm), scm, arg, func_name);
str = scm_to_locale_string(scm);
- newscm = _elstr_alloc(str, sylopt);
+ newscm = _elstr_alloc(str, sylopt, func_name);
free(str);
if (newscm == SCM_EOL)
scm_misc_error(func_name,
@@ -336,13 +359,10 @@ SCM_DEFINE_PUBLIC(scm_elstr_syllable_prop, "elstr-syllable-prop",
"cannot get syllable #~S: not enough syllables: ~S",
scm_list_2(el, n));
num = elstr->nsyl - num;
- if (num == 0)
- start = 0;
- else
- start = elstr->sylmap[num - 1] + 1;
- return scm_cons(scm_from_uint(start),
- scm_from_uint(elstr->sylmap[num]));
+ return scm_list_3(scm_from_uint(elstr->sylmap[num].char_start),
+ scm_from_uint(elstr->sylmap[num].char_count),
+ scm_from_int(elstr->sylmap[num].flags));
}
#undef FUNC_NAME
@@ -388,12 +408,8 @@ SCM_DEFINE_PUBLIC(scm_elstr_syllable, "elstr-syllable",
"cannot get syllable #~S: not enough syllables: ~S",
scm_list_2(el, n));
num = elstr->nsyl - num;
- if (num == 0)
- start = 0;
- else
- start = elstr->sylmap[num - 1] + 1;
- if (utf8_wc_to_mbstr(elstr->str + start,
- elstr->sylmap[num] - start + 1,
+ if (utf8_wc_to_mbstr(elstr->str + elstr->sylmap[num].char_start,
+ elstr->sylmap[num].char_count,
&s))
scm_misc_error(FUNC_NAME,
"cannot convert elstr to Scheme",
@@ -514,8 +530,7 @@ _elstr_deaccent(SCM el, int destructive, const char *func_name)
}
for (i = 0; i < elstr->len; i++)
elstr->str[i] = elchr_deaccent(elstr->str[i]);
- elstr->acc_pos = 0;
- elstr->acc_syl = 0;
+ invalidate_maps(elstr);
return scm;
}
@@ -544,9 +559,10 @@ _elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name)
{
struct elstr *elstr;
unsigned i;
- unsigned acc_num, num, len, start;
+ unsigned acc_num, num, start;
SCM scm;
- int dstate;
+ unsigned pos;
+ struct phoneme *phoneme = NULL;
if (destructive) {
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
@@ -556,15 +572,11 @@ _elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name)
SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name);
num = scm_to_uint(n);
- if (num > elstr->nsyl)
+ if (num == 0 | num > elstr->nsyl)
scm_misc_error(func_name,
"cannot set accent on syllable #~S: not enough syllables: ~S",
scm_list_2(n, el));
acc_num = elstr->nsyl - num;
- if (acc_num == 0)
- start = 0;
- else
- start = elstr->sylmap[acc_num - 1] + 1;
if (destructive)
scm = SCM_UNSPECIFIED;
@@ -576,25 +588,38 @@ _elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name)
/* Clear all accents */
for (i = 0; i < elstr->len; i++)
elstr->str[i] = elchr_deaccent(elstr->str[i]);
- len = elstr->sylmap[acc_num] - start + 1;
- dstate = 0;
- for (i = start; i <= start + len; i++) {
- int nstate;
-
- if (!elchr_isvowel(elstr->str[i])) {
- if (dstate) {
- --i;
- break;
- }
- continue;
- }
- nstate = elchr_diphthong(elstr->str[i], dstate);
- if (!nstate)
+ for (i = 0; i < elstr->nsyl; i++)
+ elstr->sylmap[i].flags &= ~CHF_ACCENT_MASK;
+ for (i = 0; i < elstr->phoneme_count; i++)
+ elstr->phoneme[i].flags &= ~CHF_ACCENT_MASK;
+
+ start = elstr->sylmap[acc_num].phoneme_start;
+ pos = 0;
+ for (i = 0; i < elstr->sylmap[acc_num].phoneme_count; i++) {
+ struct phoneme *ph = elstr->phoneme + start + i;
+ if (ph->flags & CHF_CONSONANT)
+ /* skip */ ;
+ else if (ph->flags & CHF_DIPHTHONG) {
+ phoneme = ph;
+ pos = ph->start + 1;
+ break;
+ } else if (ph->flags & CHF_VOWEL) {
+ phoneme = ph;
+ pos = ph->start;
break;
- dstate = nstate;
+ }
}
- elstr->str[i] = elchr_accent(elstr->str[i], CHF_OXEIA);
+ if (!phoneme)
+ scm_misc_error(func_name,
+ "cannot set accent on syllable #~S of ~S: "
+ "INTERNAL ERROR",
+ scm_list_2(n, el));
+ phoneme->flags |= CHF_OXEIA;
+ elstr->sylmap[acc_num].flags |= CHF_OXEIA;
+ elstr->str[pos] = elchr_accent(elstr->str[pos], CHF_OXEIA);
+
elstr->acc_syl = num;
+ elstr->acc_pos = pos;
return scm;
}
@@ -652,7 +677,8 @@ _elstr_set_accent_on_char(SCM el, SCM n, int destructive, const char *func_name)
elstr->str[i] = elchr_deaccent(elstr->str[i]);
elstr->str[num] = elchr_accent(elstr->str[num], CHF_OXEIA);
- _elstr_syllabize(elstr);
+ invalidate_maps(elstr);
+ _elstr_syllabize(elstr, func_name);
return scm;
}
@@ -716,11 +742,31 @@ static struct deftab {
{ CHF_LOWER, "elmorph:lower" },
{ CHF_UPPER, "elmorph:upper" },
{ CHF_NUMERIC, "elmorph:numeric" },
-
- { CHF_DIPH1, "elmorph:diph1" },
- { CHF_DIPH2, "elmorph:diph2" }
+ { CHF_DIPHTHONG, "elmorph:diphthong" },
};
-
+
+SCM_DEFINE_PUBLIC(scm_elstr_char_phoneme, "elstr-char-phoneme",
+ 2, 0, 0,
+ (SCM el, SCM n),
+"Returns a phoneme code of the Nth char in EL\n")
+#define FUNC_NAME s_scm_elstr_char_phoneme
+{
+ struct elstr *elstr;
+ int num;
+
+ force_elstr(&elstr, el, 0, SCM_ARG1, FUNC_NAME);
+ SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
+ num = scm_to_int(n);
+ if (num < 0)
+ num += elstr->len;
+ if (num >= elstr->len)
+ scm_misc_error(FUNC_NAME,
+ "cannot get character #~S: not enough characters: ~S",
+ scm_list_2(el, n));
+ return scm_from_uint(elchr_phoneme(elstr->str[num]));
+}
+#undef FUNC_NAME
+
SCM_DEFINE_PUBLIC(scm_utf8_toupper, "utf8-toupper", 1, 0, 0,
(SCM string),
"Convert STRING to uppercase\n")
@@ -818,7 +864,8 @@ _elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name)
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
elstr = (struct elstr*) SCM_CDR(el);
} else
- scm = force_elstr(&elstr, el, 1, SCM_ARG1, func_name);
+ scm = force_elstr(&elstr, el, 0, SCM_ARG1, func_name);
+ invalidate_maps(elstr);
SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name);
SCM_ASSERT(scm_is_integer(l), l, SCM_ARG3, func_name);
num = scm_to_int(n);
@@ -842,7 +889,7 @@ _elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name)
memmove(elstr->str, elstr->str + num,
sizeof(elstr->str[0]) * len);
elstr->len = len;
- _elstr_syllabize(elstr);
+ _elstr_syllabize(elstr, func_name);
return scm;
}
@@ -869,7 +916,7 @@ SCM_DEFINE_PUBLIC(scm_elstr_slice_x, "elstr-slice!",
SCM_DEFINE_PUBLIC(scm_elstr_index, "elstr-index",
2, 0, 0,
(SCM word, SCM needle),
-"")
+"Returns position of NEEDLE in the WORD")
#define FUNC_NAME s_scm_elstr_index
{
struct elstr *elstr, *ep;
@@ -920,10 +967,10 @@ SCM_DEFINE_PUBLIC(scm_elstr_suffix_p, "elstr-suffix?",
SCM_DEFINE_PUBLIC(scm_elstr_append, "elstr-append",
0, 0, 1,
(SCM rest),
-"")
+"Concatenates arguments.\n")
#define FUNC_NAME s_scm_elstr_append
{
- SCM ret = _elstr_alloc("", 0);
+ SCM ret = _elstr_alloc("", 0, FUNC_NAME);
struct elstr *elstr = (struct elstr*) SCM_CDR(ret);
for (; !scm_is_null(rest); rest = SCM_CDR(rest)) {
@@ -933,11 +980,48 @@ SCM_DEFINE_PUBLIC(scm_elstr_append, "elstr-append",
force_elstr(&elt, val, 0, SCM_ARGn, FUNC_NAME);
_elstr_concat(elstr, elt, FUNC_NAME);
}
- _elstr_syllabize(elstr);
+ _elstr_syllabize(elstr, FUNC_NAME);
return ret;
}
#undef FUNC_NAME
+
+static SCM
+elmorph_scm_from_phoneme(struct phoneme *phoneme)
+{
+ return scm_list_4(scm_from_int(phoneme->code),
+ scm_from_uint(phoneme->start),
+ scm_from_uint(phoneme->count),
+ scm_from_bool(phoneme->flags));
+}
+SCM_DEFINE_PUBLIC(scm_elstr__phonetic_map, "elstr->phonetic-map",
+ 1, 0, 0,
+ (SCM word),
+"Converts WORD to a phonetic map.\n")
+#define FUNC_NAME s_scm_elstr__phonetic_map
+{
+ struct elstr *elstr;
+ struct phoneme *phmap;
+ size_t phlen, i;
+ SCM head = SCM_EOL, tail = SCM_EOL;
+
+ force_elstr(&elstr, word, 1, SCM_ARG1, FUNC_NAME);
+ phmap = elstr->phoneme;
+ phlen = elstr->phoneme_count;
+ for (i = 0; i < phlen; i++) {
+ SCM elt = scm_cons(elmorph_scm_from_phoneme(phmap + i),
+ SCM_EOL);
+ if (scm_is_null(head))
+ head = tail = elt;
+ else {
+ SCM_SETCDR(tail, elt);
+ tail = elt;
+ }
+ }
+ free(phmap);
+ return head;
+}
+#undef FUNC_NAME
void
scm_init_ellinika_elmorph_module()

Return to:

Send suggestions and report system problems to the System administrator.