/* This file is part of Ellinika project. Copyright (C) 2011 Sergey Poznyakoff Ellinika is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. Ellinika is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #ifdef HAVE_CONFIG_H # include #endif #include #include #include #include "utf8.h" #include "elmorph.h" struct elstr { unsigned *str; /* UTF-8 string */ size_t len; /* Its length */ struct phoneme *phoneme; /* Phonetical map*/ unsigned phoneme_count; /* Number of phonemes */ struct syllable *sylmap; /* Syllable map (nsyl elements) */ unsigned nsyl; /* Number of syllables. */ unsigned acc_syl; /* Number of the accented syllable (1-based, from the last syllable) */ unsigned acc_pos; /* Number of the accented character (0-based, from str[0]) */ }; scm_t_bits _elstr_tag; static void _elstr_syllabize(struct elstr *elstr, const char *func_name) { unsigned i; free(elstr->phoneme); free(elstr->sylmap); if (phoneme_map(&elstr->phoneme, &elstr->phoneme_count, elstr->str, elstr->len)) scm_misc_error(func_name, "cannot create phonetic map: ~S", scm_from_int(errno)); if (syllable_map(&elstr->sylmap, &elstr->nsyl, elstr->phoneme, elstr->phoneme_count)) scm_misc_error(func_name, "cannot create syllable map: ~S", scm_from_int(errno)); for (i = elstr->nsyl; i > 0; i--) { if (elstr->sylmap[elstr->nsyl - i].flags & CHF_ACCENT_MASK) break; } elstr->acc_syl = i; for (i = 0; i < elstr->len; i++) if (elchr_getaccent(elstr->str[i])) break; elstr->acc_pos = i; } static void invalidate_maps(struct elstr *elstr) { free(elstr->sylmap); elstr->sylmap = NULL; elstr->nsyl = 0; free(elstr->phoneme); elstr->phoneme = NULL; elstr->phoneme_count = 0; elstr->acc_pos = 0; elstr->acc_syl = 0; } static SCM _elstr_alloc_empty(struct elstr **pelstr) { struct elstr *elstr; elstr = scm_gc_malloc(sizeof(*elstr), "Elstr"); memset(elstr, 0, sizeof(*elstr)); *pelstr = elstr; SCM_RETURN_NEWSMOB(_elstr_tag, elstr); } static SCM _elstr_alloc(const char *instr, int syl, const char *func_name) { struct elstr *elstr; unsigned *wptr; size_t wlen; if (utf8_mbstr_to_wc(instr, &wptr, &wlen)) return SCM_EOL; elstr = scm_gc_malloc(sizeof(*elstr), "Elstr"); elstr->str = wptr; elstr->len = wlen; elstr->sylmap = NULL; elstr->nsyl = 0; elstr->acc_syl = 0; elstr->acc_pos = 0; elstr->phoneme = 0; elstr->phoneme_count = 0; if (syl) _elstr_syllabize(elstr, func_name); SCM_RETURN_NEWSMOB(_elstr_tag, elstr); } static SCM _elstr_dup(struct elstr *elstr) { struct elstr *elnew; elnew = scm_gc_malloc(sizeof(*elstr), "Elstr"); elnew->str = calloc(elstr->len, sizeof(elnew->str[0])); if (!elnew->str) scm_memory_error("_elstr_dup"); memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len); elnew->len = elstr->len; if (elstr->phoneme) { elnew->phoneme = calloc(elstr->phoneme_count, sizeof(elnew->phoneme[0])); if (!elnew->phoneme) { free(elnew->str); scm_memory_error("_elstr_dup"); } memcpy(elnew->phoneme, elstr->phoneme, sizeof(elstr->phoneme[0]) * elstr->phoneme_count); } else elnew->phoneme = NULL; elnew->phoneme_count = elstr->phoneme_count; if (elstr->sylmap) { elnew->sylmap = calloc(elstr->nsyl, sizeof(elnew->sylmap[0])); if (!elnew->sylmap) { free(elnew->str); scm_memory_error("_elstr_dup"); } memcpy(elnew->sylmap, elstr->sylmap, sizeof(elstr->sylmap[0]) * elstr->nsyl); } else elnew->sylmap = NULL; elnew->nsyl = elstr->nsyl; elnew->acc_syl = elstr->acc_syl; elnew->acc_pos = elstr->acc_pos; SCM_RETURN_NEWSMOB(_elstr_tag, elnew); } static void _elstr_concat(struct elstr *dest, struct elstr *src, const char *func_name) { unsigned *wp; if (src->len == 0) return; wp = realloc(dest->str, sizeof(dest->str[0]) * (dest->len + src->len)); if (!wp) scm_memory_error(func_name); dest->str = wp; memcpy(dest->str + dest->len, src->str, sizeof(src->str[0]) * src->len); dest->len += src->len; } static scm_sizet _elstr_free(SCM smob) { struct elstr *elstr = (struct elstr *) SCM_CDR(smob); free(elstr->str); free(elstr->sylmap); free(elstr->phoneme); scm_gc_free(elstr, sizeof(struct elstr), "elstr"); return 0; } static int _elstr_print(SCM smob, SCM port, scm_print_state *pstate) { struct elstr *elstr = (struct elstr *) SCM_CDR(smob); int i, j; char *s; scm_puts("#sylmap) { scm_puts("``", port); for (i = 0; i < elstr->nsyl; i++) { size_t start = elstr->sylmap[i].char_start; if (i) scm_puts("-", port); if (elstr->sylmap[i].flags & CHF_ACCENT_MASK) scm_puts("[", port); for (j = 0; j < elstr->sylmap[i].char_count; j++) { char r[6]; int n; n = utf8_wctomb(r, elstr->str[start+j]); if (n == -1) continue; r[n] = 0; scm_puts(r, port); } if (elstr->sylmap[i].flags & CHF_ACCENT_MASK) scm_puts("]", port); } } else { scm_puts("[NS] ``", port); for (i = j = 0; i < elstr->len; i++) { char r[6]; int n; n = utf8_wctomb(r, elstr->str[i]); if (n == -1) continue; r[n] = 0; scm_puts(r, port); } } scm_puts("''>", port); return 1; } static void _elstr_init() { _elstr_tag = scm_make_smob_type("Elstr", sizeof(struct elstr)); scm_set_smob_free(_elstr_tag, _elstr_free); scm_set_smob_print(_elstr_tag, _elstr_print); } #define scm_is_elstr(s) (!SCM_IMP(s) && SCM_CELL_TYPE(s) == _elstr_tag) static SCM force_elstr(struct elstr **ep, SCM scm, int sylopt, int arg, const char *func_name) { struct elstr *elstr; if (scm_is_elstr(scm)) { elstr = (struct elstr*) SCM_CDR(scm); } else { SCM newscm; char *str; SCM_ASSERT(scm_is_string(scm), scm, arg, func_name); str = scm_to_locale_string(scm); newscm = _elstr_alloc(str, sylopt, func_name); free(str); if (newscm == SCM_EOL) scm_misc_error(func_name, "Invalid input string: ~S", scm_list_1(scm)); scm = newscm; elstr = (struct elstr*) SCM_CDR(newscm); } if (ep) *ep = elstr; return scm; } SCM_DEFINE_PUBLIC(scm_elstr_p, "elstr?", 1, 0, 0, (SCM string), "Return true if STRING is an elstr\n") #define FUNC_NAME s_scm_elstr_p { return scm_is_elstr(string) ? SCM_BOOL_T : SCM_BOOL_F; } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_string__elstr, "string->elstr", 1, 0, 0, (SCM string), "Create new ELSTR from STRING\n") #define FUNC_NAME s_scm_string__elstr { char *str; SCM scm; SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); return force_elstr(NULL, string, 1, SCM_ARG1, FUNC_NAME); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr__string, "elstr->string", 1, 0, 0, (SCM el), "Convert EL to a STRING\n") #define FUNC_NAME s_scm_elstr__string { struct elstr *elstr; char *s; SCM scm; SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); elstr = (struct elstr*) SCM_CDR(el); if (utf8_wc_to_mbstr(elstr->str, elstr->len, &s)) scm_misc_error(FUNC_NAME, "cannot convert elstr to Scheme", SCM_EOL); scm = scm_from_locale_string(s); free(s); return scm; } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_length, "elstr-length", 1, 0, 0, (SCM el), "Returns the number of characters in EL\n") #define FUNC_NAME s_scm_elstr_length { struct elstr *elstr; force_elstr(&elstr, el, 0, SCM_ARG1, FUNC_NAME); return scm_from_uint(elstr->len); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_number_of_syllables, "elstr-number-of-syllables", 1, 0, 0, (SCM el), "Returns the number of characters in EL\n") #define FUNC_NAME s_scm_elstr_number_of_syllables { struct elstr *elstr; force_elstr(&elstr, el, 1, SCM_ARG1, FUNC_NAME); return scm_from_uint(elstr->nsyl); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_syllable_prop, "elstr-syllable-prop", 2, 0, 0, (SCM el, SCM n), "Returns properties of the syllable N in EL\n") #define FUNC_NAME s_scm_elstr_syllable_prop { struct elstr *elstr; unsigned num, start; force_elstr(&elstr, el, 1, SCM_ARG1, FUNC_NAME); SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); num = scm_to_uint(n); if (num > elstr->nsyl) scm_misc_error(FUNC_NAME, "cannot get syllable #~S: not enough syllables: ~S", scm_list_2(el, n)); num = elstr->nsyl - num; return scm_list_3(scm_from_uint(elstr->sylmap[num].char_start), scm_from_uint(elstr->sylmap[num].char_count), scm_from_int(elstr->sylmap[num].flags)); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_accent_position, "elstr-accent-position", 1, 0, 0, (SCM el), "Return position of the accented character in EL\n") #define FUNC_NAME s_scm_elstr_accent_position { struct elstr *elstr; force_elstr(&elstr, el, 1, SCM_ARG1, FUNC_NAME); return scm_from_uint(elstr->acc_pos); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_accented_syllable, "elstr-accented-syllable", 1, 0, 0, (SCM el), "Return position of the accented syllable in EL\n") #define FUNC_NAME s_scm_elstr_accented_syllable { struct elstr *elstr; force_elstr(&elstr, el, 1, SCM_ARG1, FUNC_NAME); return scm_from_uint(elstr->acc_syl); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_syllable, "elstr-syllable", 2, 0, 0, (SCM el, SCM n), "Return Nth syllable in EL\n") #define FUNC_NAME s_scm_elstr_accented_syllable { struct elstr *elstr; char *s; SCM scm; unsigned num, start; force_elstr(&elstr, el, 1, SCM_ARG1, FUNC_NAME); SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); num = scm_to_uint(n); if (num > elstr->nsyl) scm_misc_error(FUNC_NAME, "cannot get syllable #~S: not enough syllables: ~S", scm_list_2(el, n)); num = elstr->nsyl - num; if (utf8_wc_to_mbstr(elstr->str + elstr->sylmap[num].char_start, elstr->sylmap[num].char_count, &s)) scm_misc_error(FUNC_NAME, "cannot convert elstr to Scheme", SCM_EOL); scm = scm_from_locale_string(s); free(s); return scm; } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_character, "elstr-character", 2, 0, 0, (SCM el, SCM n), "Return Nth character in EL\n") #define FUNC_NAME s_scm_elstr_character { struct elstr *elstr; unsigned num; char r[6]; int len; force_elstr(&elstr, el, 0, SCM_ARG1, FUNC_NAME); SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); num = scm_to_uint(n); if (num >= elstr->len) scm_misc_error(FUNC_NAME, "cannot get character #~S: not enough characters: ~S", scm_list_2(el, n)); len = utf8_wctomb(r, elstr->str[num]); if (len <= 0) scm_misc_error(FUNC_NAME, "cannot convert elchr to Scheme", SCM_EOL); r[len] = 0; return scm_from_locale_string(r); } #undef FUNC_NAME static SCM _elstr_chgcase(SCM el, void (*chgfun)(unsigned *, size_t), int destructive, const char *func_name) { struct elstr *elstr; SCM scm; if (destructive) { SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); elstr = (struct elstr*) SCM_CDR(el); scm = SCM_UNSPECIFIED; } else { scm = force_elstr(&elstr, el, 1, SCM_ARG1, func_name); if (scm == el) { scm = _elstr_dup(elstr); elstr = (struct elstr*) SCM_CDR(scm); } } chgfun(elstr->str, elstr->len); return scm; } SCM_DEFINE_PUBLIC(scm_elstr_toupper, "elstr-toupper", 1, 0, 0, (SCM el), "Convert EL to upper case\n") #define FUNC_NAME s_scm_elstr_toupper { return _elstr_chgcase(el, utf8_wc_strnupper, 0, FUNC_NAME); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_tolower, "elstr-tolower", 1, 0, 0, (SCM el), "Convert EL to lower case\n") #define FUNC_NAME s_scm_elstr_tolower { return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_toupper_x, "elstr-toupper!", 1, 0, 0, (SCM el), "Convert EL to upper case (destructive)\n") #define FUNC_NAME s_scm_elstr_toupper_x { return _elstr_chgcase(el, utf8_wc_strnupper, 1, FUNC_NAME); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_tolower_x, "elstr-tolower!", 1, 0, 0, (SCM el), "Convert EL to lower case (destructive)\n") #define FUNC_NAME s_scm_elstr_tolower_x { return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME); } #undef FUNC_NAME static SCM _elstr_deaccent(SCM el, int destructive, const char *func_name) { struct elstr *elstr; unsigned i; SCM scm; if (destructive) { SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); scm = SCM_UNSPECIFIED; elstr = (struct elstr*) SCM_CDR(el); } else { scm = force_elstr(&elstr, el, 1, SCM_ARG1, func_name); if (scm == el) { scm = _elstr_dup(elstr); elstr = (struct elstr*) SCM_CDR(scm); } } for (i = 0; i < elstr->len; i++) elstr->str[i] = elchr_deaccent(elstr->str[i]); invalidate_maps(elstr); return scm; } SCM_DEFINE_PUBLIC(scm_elstr_deaccent, "elstr-deaccent", 1, 0, 0, (SCM el), "Remove all accents from EL\n") #define FUNC_NAME s_scm_elstr_deaccent { return _elstr_deaccent(el, 0, FUNC_NAME); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_deaccent_x, "elstr-deaccent!", 1, 0, 0, (SCM el), "Remove all accents from EL (desctructive)\n") #define FUNC_NAME s_scm_elstr_deaccent_x { return _elstr_deaccent(el, 1, FUNC_NAME); } #undef FUNC_NAME static SCM _elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name) { struct elstr *elstr; unsigned i; unsigned acc_num, num, start; SCM scm; unsigned pos; struct phoneme *phoneme = NULL; if (destructive) { SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); elstr = (struct elstr*) SCM_CDR(el); } else scm = force_elstr(&elstr, el, 1, SCM_ARG1, func_name); SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name); num = scm_to_uint(n); if (num == 0 | num > elstr->nsyl) scm_misc_error(func_name, "cannot set accent on syllable #~S: not enough syllables: ~S", scm_list_2(n, el)); acc_num = elstr->nsyl - num; if (destructive) scm = SCM_UNSPECIFIED; else if (scm == el) { scm = _elstr_dup(elstr); elstr = (struct elstr*) SCM_CDR(scm); } /* Clear all accents */ for (i = 0; i < elstr->len; i++) elstr->str[i] = elchr_deaccent(elstr->str[i]); for (i = 0; i < elstr->nsyl; i++) elstr->sylmap[i].flags &= ~CHF_ACCENT_MASK; for (i = 0; i < elstr->phoneme_count; i++) elstr->phoneme[i].flags &= ~CHF_ACCENT_MASK; start = elstr->sylmap[acc_num].phoneme_start; pos = 0; for (i = elstr->sylmap[acc_num].phoneme_count; i > 0; i--) { struct phoneme *ph = elstr->phoneme + start + i - 1; if (ph->flags & CHF_CONSONANT) /* skip */ ; else if (ph->flags & CHF_VOWEL) { phoneme = ph; break; } } if (!phoneme) scm_misc_error(func_name, "cannot set accent on syllable #~S of ~S: " "INTERNAL ERROR", scm_list_2(n, el)); else if (phoneme->flags & CHF_DIPHTHONG) pos = phoneme->start + phoneme->count - 1; else pos = phoneme->start; phoneme->flags |= CHF_OXEIA; elstr->sylmap[acc_num].flags |= CHF_OXEIA; elstr->str[pos] = elchr_accent(elstr->str[pos], CHF_OXEIA); elstr->acc_syl = num; elstr->acc_pos = pos; return scm; } SCM_DEFINE_PUBLIC(scm_elstr_set_accent, "elstr-set-accent", 2, 0, 0, (SCM el, SCM n), "Set accent on Nth syllable of EL\n") { return _elstr_set_accent(el, n, 0, s_scm_elstr_set_accent); } SCM_DEFINE_PUBLIC(scm_elstr_set_accent_x, "elstr-set-accent!", 2, 0, 0, (SCM el, SCM n), "Set accent on Nth syllable of EL (destructive)\n") { return _elstr_set_accent(el, n, 1, s_scm_elstr_set_accent_x); } static SCM _elstr_set_accent_on_char(SCM el, SCM n, int destructive, const char *func_name) { struct elstr *elstr; unsigned i; unsigned acc_num, num, len, start; SCM scm; int dstate; if (destructive) { SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); elstr = (struct elstr*) SCM_CDR(el); } else scm = force_elstr(&elstr, el, 0, SCM_ARG1, func_name); SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name); num = scm_to_uint(n); if (num > elstr->len) scm_misc_error(func_name, "cannot set accent on character #~S: not enough characters: ~S", scm_list_2(el, n)); if (!elchr_isvowel(elstr->str[num])) scm_misc_error(func_name, "cannot set accent on character #~S: not a vowel: ~S", scm_list_2(el, n)); if (destructive) scm = SCM_UNSPECIFIED; else if (scm == el) { scm = _elstr_dup(elstr); elstr = (struct elstr*) SCM_CDR(scm); } /* Clear all accents */ for (i = 0; i < elstr->len; i++) elstr->str[i] = elchr_deaccent(elstr->str[i]); elstr->str[num] = elchr_accent(elstr->str[num], CHF_OXEIA); invalidate_maps(elstr); _elstr_syllabize(elstr, func_name); return scm; } SCM_DEFINE_PUBLIC(scm_elstr_set_accent_character, "elstr-set-accent-character", 2, 0, 0, (SCM el, SCM n), "Set accent on Nth character of EL\n") { return _elstr_set_accent_on_char(el, n, 0, s_scm_elstr_set_accent_character); } SCM_DEFINE_PUBLIC(scm_elstr_set_accent_character_x, "elstr-set-accent-character!", 2, 0, 0, (SCM el, SCM n), "Set accent on Nth character of EL (destructive)\n") { return _elstr_set_accent_on_char(el, n, 1, s_scm_elstr_set_accent_character_x); } SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask", 2, 0, 0, (SCM el, SCM n), "Returns properties of the Nth char in EL, as a bitmask\n") #define FUNC_NAME s_scm_elstr_char_prop_bitmask { struct elstr *elstr; int num; force_elstr(&elstr, el, 0, SCM_ARG1, FUNC_NAME); SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); num = scm_to_int(n); if (num < 0) num += elstr->len; if (num >= elstr->len) scm_misc_error(FUNC_NAME, "cannot get character #~S: not enough characters: ~S", scm_list_2(el, n)); return scm_from_uint(elchr_flags(elstr->str[num])); } #undef FUNC_NAME static struct deftab { unsigned val; char *sym; } deftab[] = { { CHF_OXEIA, "elmorph:oxeia" }, { CHF_PERISPWMENH, "elmorph:perispwmenh" }, { CHF_BAREIA, "elmorph:bareia" }, { CHF_ACCENT_MASK, "elmorph:accent-mask" }, { CHF_TREMA, "elmorph:trema" }, { CHF_VOWEL, "elmorph:vowel" }, { CHF_CONSONANT, "elmorph:consonant" }, { CHF_SEMIVOWEL, "elmorph:semivowel" }, { CHF_PUNCT, "elmorph:punct" }, { CHF_SYMBOL, "elmorph:symbol" }, { CHF_MODIFIER, "elmorph:modifier" }, { CHF_ARCHAIC, "elmorph:archaic" }, { CHF_LOWER, "elmorph:lower" }, { CHF_UPPER, "elmorph:upper" }, { CHF_NUMERIC, "elmorph:numeric" }, { CHF_DIPHTHONG, "elmorph:diphthong" }, { CHF_IOTA, "elmorph:iota" }, }; SCM_DEFINE_PUBLIC(scm_elstr_char_phoneme, "elstr-char-phoneme", 2, 0, 0, (SCM el, SCM n), "Returns a phoneme code of the Nth char in EL\n") #define FUNC_NAME s_scm_elstr_char_phoneme { struct elstr *elstr; int num; force_elstr(&elstr, el, 0, SCM_ARG1, FUNC_NAME); SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); num = scm_to_int(n); if (num < 0) num += elstr->len; if (num >= elstr->len) scm_misc_error(FUNC_NAME, "cannot get character #~S: not enough characters: ~S", scm_list_2(el, n)); return scm_from_uint(elchr_phoneme(elstr->str[num])); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_utf8_toupper, "utf8-toupper", 1, 0, 0, (SCM string), "Convert STRING to uppercase\n") #define FUNC_NAME s_scm_utf8_toupper { char *str; SCM scm; SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); str = scm_to_locale_string(string); if (utf8_toupper(str, strlen(str))) scm_misc_error(FUNC_NAME, "cannot convert to upper case: ~S", scm_list_1(string)); scm = scm_from_locale_string(str); free(str); return scm; } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_utf8_tolower, "utf8-tolower", 1, 0, 0, (SCM string), "Convert STRING to lowercase\n") #define FUNC_NAME s_scm_utf8_tolower { char *str; SCM scm; SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); str = scm_to_locale_string(string); if (utf8_tolower(str, strlen(str))) scm_misc_error(FUNC_NAME, "cannot convert to lower case: ~S", scm_list_1(string)); scm = scm_from_locale_string(str); free(str); return scm; } #undef FUNC_NAME static SCM _elstr_thema_aoristoy(SCM el, int destructive, const char *func_name) { struct elstr *elstr; SCM scm; unsigned *wc; size_t wclen; if (destructive) { SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); elstr = (struct elstr*) SCM_CDR(el); scm = SCM_UNSPECIFIED; } else { scm = force_elstr(&elstr, el, 1, SCM_ARG1, func_name); if (scm == el) { scm = _elstr_dup(elstr); elstr = (struct elstr*) SCM_CDR(scm); } } if (elmorph_thema_aoristoy(elstr->str, elstr->len, &wc, &wclen)) scm_memory_error(func_name); free(elstr->str); elstr->str = wc; elstr->len = wclen; return scm; } SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy, "elstr-thema-aoristoy", 1, 0, 0, (SCM thema), "Convert THEMA, which must be a root of present. to an aorist root\n") #define FUNC_NAME s_scm_elstr_thema_aoristoy { return _elstr_thema_aoristoy(thema, 0, FUNC_NAME); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0, (SCM thema), "Convert THEMA, which must be a root of present. to an aorist root (destructive)\n") #define FUNC_NAME s_scm_elstr_thema_aoristoy_x { return _elstr_thema_aoristoy(thema, 1, FUNC_NAME); } #undef FUNC_NAME static SCM _elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name) { struct elstr *elstr; int num; unsigned len; SCM scm; if (destructive) { SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); elstr = (struct elstr*) SCM_CDR(el); } else scm = force_elstr(&elstr, el, 0, SCM_ARG1, func_name); invalidate_maps(elstr); SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name); SCM_ASSERT(scm_is_integer(l), l, SCM_ARG3, func_name); num = scm_to_int(n); len = scm_to_uint(l); if (num < 0) num += elstr->len; if (num < 0 || num >= elstr->len || num + len > elstr->len) scm_misc_error(func_name, "invalid offset or length", SCM_EOL); if (destructive) scm = SCM_UNSPECIFIED; else if (scm == el) { scm = _elstr_dup(elstr); elstr = (struct elstr*) SCM_CDR(scm); } if (num) memmove(elstr->str, elstr->str + num, sizeof(elstr->str[0]) * len); elstr->len = len; _elstr_syllabize(elstr, func_name); return scm; } SCM_DEFINE_PUBLIC(scm_elstr_slice, "elstr-slice", 3, 0, 0, (SCM word, SCM off, SCM len), "Extract LEN characters from WORD starting from position OFF\n") #define FUNC_NAME s_scm_elstr_slice { return _elstr_slice(word, off, len, 0, FUNC_NAME); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_slice_x, "elstr-slice!", 3, 0, 0, (SCM word, SCM off, SCM len), "Extract LEN characters from WORD starting from position OFF (destructive)\n") #define FUNC_NAME s_scm_elstr_slice_x { return _elstr_slice(word, off, len, 1, FUNC_NAME); } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_index, "elstr-index", 2, 0, 0, (SCM word, SCM needle), "Returns position of NEEDLE in the WORD") #define FUNC_NAME s_scm_elstr_index { struct elstr *elstr, *ep; unsigned *p; force_elstr(&elstr, word, 0, SCM_ARG1, FUNC_NAME); force_elstr(&ep, needle, 0, SCM_ARG2, FUNC_NAME); p = (unsigned*)utf8_wc_strnstr(elstr->str, elstr->len, ep->str, ep->len); if (p) return scm_from_int(p - elstr->str); return SCM_BOOL_F; } #undef FUNC_NAME static int _suffix_matches(struct elstr *elstr, struct elstr *ep) { return (ep->len < elstr->len && memcmp(elstr->str + elstr->len - ep->len, ep->str, ep->len * sizeof(elstr->str[0])) == 0); } SCM_DEFINE_PUBLIC(scm_elstr_suffix_p, "elstr-suffix?", 2, 0, 1, (SCM word, SCM suffix, SCM rest), "Return #t if WORDS ends with SUFFIX") #define FUNC_NAME s_scm_elstr_suffix_p { struct elstr *elstr, *ep; force_elstr(&elstr, word, 0, SCM_ARG1, FUNC_NAME); force_elstr(&ep, suffix, 0, SCM_ARG2, FUNC_NAME); if (_suffix_matches(elstr, ep)) return suffix; for (; !scm_is_null(rest); rest = SCM_CDR(rest)) { SCM val = SCM_CAR(rest); force_elstr(&ep, val, 0, SCM_ARGn, FUNC_NAME); if (_suffix_matches(elstr, ep)) return val; } return SCM_BOOL_F; } #undef FUNC_NAME SCM_DEFINE_PUBLIC(scm_elstr_append, "elstr-append", 0, 0, 1, (SCM rest), "Concatenates arguments.\n") #define FUNC_NAME s_scm_elstr_append { SCM ret = _elstr_alloc("", 0, FUNC_NAME); struct elstr *elstr = (struct elstr*) SCM_CDR(ret); for (; !scm_is_null(rest); rest = SCM_CDR(rest)) { struct elstr *elt; SCM val = SCM_CAR(rest); force_elstr(&elt, val, 0, SCM_ARGn, FUNC_NAME); _elstr_concat(elstr, elt, FUNC_NAME); } _elstr_syllabize(elstr, FUNC_NAME); return ret; } #undef FUNC_NAME static SCM elmorph_scm_from_phoneme(struct phoneme *phoneme) { return scm_list_4(scm_from_int(phoneme->code), scm_from_uint(phoneme->start), scm_from_uint(phoneme->count), scm_from_bool(phoneme->flags)); } SCM_DEFINE_PUBLIC(scm_elstr__phonetic_map, "elstr->phonetic-map", 1, 0, 0, (SCM word), "Converts WORD to a phonetic map.\n") #define FUNC_NAME s_scm_elstr__phonetic_map { struct elstr *elstr; struct phoneme *phmap; size_t phlen, i; SCM head = SCM_EOL, tail = SCM_EOL; force_elstr(&elstr, word, 1, SCM_ARG1, FUNC_NAME); phmap = elstr->phoneme; phlen = elstr->phoneme_count; for (i = 0; i < phlen; i++) { SCM elt = scm_cons(elmorph_scm_from_phoneme(phmap + i), SCM_EOL); if (scm_is_null(head)) head = tail = elt; else { SCM_SETCDR(tail, elt); tail = elt; } } free(phmap); return head; } #undef FUNC_NAME void scm_init_ellinika_elmorph_module() { int i; _elstr_init(); for (i = 0; i < sizeof(deftab)/sizeof(deftab[0]); i++) { scm_c_define(deftab[i].sym, scm_from_uint(deftab[i].val)); scm_c_export(deftab[i].sym, NULL); } #include "elmorph.x" }