From a1a5b7ddd6c3c0532c37551b24fd573a554ac181 Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Fri, 10 Jun 2011 23:04:53 +0300 Subject: Fix syllabification. * configure.ac: Add AC_PROG_YACC * src/ellinika/phoneme.y: New file. * src/ellinika/yyrename: New file. * src/ellinika/syllabificator.c: New file. * src/ellinika/.gitignore: Update. * src/ellinika/elchr.c (char_info_st): Move to header. (el_basic_ctype): (elchr_info): Remove static qualifier. Return a pointer to const. (elchr_letter,elchr_phoneme): New functions. (elchr_diphthong): Remove. * src/ellinika/elmorph.c (elstr): New members. (_elstr_syllabize): Rewrite. (invalidate_maps): New static function. (_elstr_alloc): Initialize new fields, take function name as argument, for diagnostic purposes. (_elstr_print): Rewrite. (deftab): Update. (elstr-syllable-prop,elstr-syllable) (_elstr_set_accent,_elstr_set_accent_on_char): Rewrite. (elstr-char-phoneme,elstr->phonetic-map): New functions. * src/ellinika/elmorph.h (CHF_DIPH1,CHF_DIPH2): Remove. (CHF_DIPHTHONG): New flag. (PHON_.*): New constants. (phoneme,syllable): New structures. (char_info_st): New members. (elchr_info,elchr_letter) (elchr_phoneme,phoneme_map) (syllable_map): New protos. (elchr_diphthong): Remove protos. * src/ellinika/elmorph.scm4: Move public definitions to elmorph-public.scm; include it here. * src/ellinika/xlat.scm (ellinika:sounds-like): Rewrite as a wrapper over elstr->soundslike. Describe Milesian numbers. * style.css (img.ellinika-img): New class. * xml/lingua.conf.in (IMAGE): New tag. * xml/pl/alfabhta.xml: Describe Milesian numbers. Various fixes. * data/dbverb.struct: fix a typo in flection. Use 'sub' theme for pas/sub/aor. * data/irregular-verbs.xml: Add more verbs. * scm/conjugator.scm: Various fixes. * scm/verbop.scm: Accept empty mood and voice declarations. 
--- src/ellinika/phoneme.y | 353 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100644 src/ellinika/phoneme.y (limited to 'src/ellinika/phoneme.y') diff --git a/src/ellinika/phoneme.y b/src/ellinika/phoneme.y new file mode 100644 index 0000000..353d175 --- /dev/null +++ b/src/ellinika/phoneme.y @@ -0,0 +1,353 @@ +/* This file is part of Ellinika project. + Copyright (C) 2011 Sergey Poznyakoff + + Ellinika is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + Ellinika is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. 
+*/
+%{
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include "utf8.h"
+#include "elmorph.h"
+
+/* Both functions are defined in the epilogue but are called from the
+   generated parser, which precedes them in the output file. */
+int yylex(void);
+int yyerror(const char *s);
+
+/* Growable array that collects the phonemes recognized by one parse. */
+static struct phoneme *phoneme_base;
+/* Number of elements allocated in phoneme_base. */
+static size_t phoneme_max;
+/* Number of elements of phoneme_base currently in use. */
+static size_t phoneme_count;
+/* errno-style code stored by phoneme_append when it fails; 0 otherwise. */
+static int error_state;
+
+#define PHONEME_MAP_INITIAL_ALLOC 16
+
+/* Append a copy of *PHONEME to the phoneme array.
+
+   The array starts at PHONEME_MAP_INITIAL_ALLOC elements and doubles
+   each time it fills up.  On failure error_state is set to ENOMEM and
+   the array is left untouched; once error_state is set every further
+   call is a no-op, so the parse can run to completion and the caller
+   can inspect error_state afterwards. */
+static void
+phoneme_append(struct phoneme *phoneme)
+{
+        if (error_state)
+                return;
+
+        if (phoneme_max == phoneme_count) {
+                struct phoneme *np;
+                size_t nsize;
+
+                if (!phoneme_max)
+                        nsize = PHONEME_MAP_INITIAL_ALLOC;
+                else {
+                        /* Refuse to grow when either the doubled element
+                           count or the resulting byte count would not
+                           fit in size_t. */
+                        if (phoneme_max >
+                            ((size_t) -1) / 2 / sizeof(phoneme_base[0])) {
+                                error_state = ENOMEM;
+                                return;
+                        }
+                        nsize = 2 * phoneme_max;
+                }
+                np = realloc(phoneme_base, nsize * sizeof(phoneme_base[0]));
+                if (!np) {
+                        error_state = ENOMEM;
+                        return;
+                }
+                phoneme_max = nsize;
+                phoneme_base = np;
+        }
+        phoneme_base[phoneme_count++] = *phoneme;
+}
+
+/* Turn phoneme A into a two-character phoneme with code PC.
+
+   A.count becomes 2 and A.flags becomes FL | CHF_DIPHTHONG plus the
+   CHF_ACCENT_MASK bits that were set on either A or B.  A.start is not
+   modified, so the result keeps the position of its first letter. */
+#define DIPHTHONG(a,b,pc,fl) do {                                       \
+                (a).count = 2;                                          \
+                (a).code = (pc);                                        \
+                (a).flags = (fl) | CHF_DIPHTHONG |                      \
+                        (((a).flags | (b).flags) & CHF_ACCENT_MASK);    \
+        } while (0)
+
+%}
+%union {
+        struct phoneme phoneme;
+};
+
+/* Every token carries a struct phoneme filled in by yylex.  The explicit
+   token numbers are the values yylex returns from char_info_st.letter. */
+%token <phoneme> LETTER_A            1
+%token <phoneme> LETTER_A_ACC        2
+%token <phoneme> LETTER_B            3
+%token <phoneme> LETTER_G            4
+%token <phoneme> LETTER_D            5
+%token <phoneme> LETTER_E            6
+%token <phoneme> LETTER_E_ACC        7
+%token <phoneme> LETTER_Z            8
+%token <phoneme> LETTER_H            9
+%token <phoneme> LETTER_H_ACC        10
+%token <phoneme> LETTER_TH           11
+%token <phoneme> LETTER_I            12
+%token <phoneme> LETTER_I_ACC        13
+%token <phoneme> LETTER_I_TREMA      14
+%token <phoneme> LETTER_I_TREMA_ACC  15
+%token <phoneme> LETTER_K            16
+%token <phoneme> LETTER_L            17
+%token <phoneme> LETTER_M            18
+%token <phoneme> LETTER_N            19
+%token <phoneme> LETTER_KS           20
+%token <phoneme> LETTER_OMICRON      21
+%token <phoneme> LETTER_OMICRON_ACC  22
+%token <phoneme> LETTER_P            23
+%token <phoneme> LETTER_R            24
+%token <phoneme> LETTER_S            25
+%token <phoneme> LETTER_T            26
+%token <phoneme> LETTER_Y            27
+%token <phoneme> LETTER_Y_ACC        28
+%token <phoneme> LETTER_Y_TREMA      29
+%token <phoneme> LETTER_Y_TREMA_ACC  30
+%token <phoneme> LETTER_F            31
+%token <phoneme> LETTER_X            32
+%token <phoneme> LETTER_PS           33
+%token <phoneme> LETTER_OMEGA        34
+%token <phoneme> LETTER_OMEGA_ACC    35
+
+%type <phoneme> monophthong diphthong phoneme
+
+%%
+/* The input is a non-empty sequence of phonemes; each one is appended
+   to the phoneme array as soon as it is reduced. */
+input     : phoneme
+            {
+                    phoneme_append(&$1);
+            }
+          | input phoneme
+            {
+                    phoneme_append(&$2);
+            }
+          ;
+
+phoneme   : monophthong
+          | diphthong
+          ;
+
+/* A single letter.  There is no action, so the default $$ = $1 passes
+   the phoneme built by yylex through unchanged. */
+monophthong:
+            LETTER_A
+          | LETTER_A_ACC
+          | LETTER_B
+          | LETTER_G
+          | LETTER_D
+          | LETTER_E
+          | LETTER_E_ACC
+          | LETTER_Z
+          | LETTER_H
+          | LETTER_H_ACC
+          | LETTER_TH
+          | LETTER_I
+          | LETTER_I_ACC
+          | LETTER_I_TREMA
+          | LETTER_I_TREMA_ACC
+          | LETTER_K
+          | LETTER_L
+          | LETTER_M
+          | LETTER_N
+          | LETTER_KS
+          | LETTER_OMICRON
+          | LETTER_OMICRON_ACC
+          | LETTER_P
+          | LETTER_R
+          | LETTER_S
+          | LETTER_T
+          | LETTER_Y
+          | LETTER_Y_ACC
+          | LETTER_Y_TREMA
+          | LETTER_Y_TREMA_ACC
+          | LETTER_F
+          | LETTER_X
+          | LETTER_PS
+          | LETTER_OMEGA
+          | LETTER_OMEGA_ACC
+          ;
+
+/* Two-letter combinations that are merged into a single phoneme.  Each
+   action rewrites $1 in place with DIPHTHONG and returns it. */
+diphthong:
+            /* Combinations flagged as vowels. */
+            LETTER_A LETTER_I
+            {
+                    DIPHTHONG($1, $2, PHON_E, CHF_VOWEL);
+                    $$ = $1;
+            }
+          | LETTER_A LETTER_I_ACC
+            {
+                    DIPHTHONG($1, $2, PHON_E, CHF_VOWEL);
+                    $$ = $1;
+            }
+          | LETTER_E LETTER_I
+            {
+                    DIPHTHONG($1, $2, PHON_I, CHF_VOWEL);
+                    $$ = $1;
+            }
+          | LETTER_E LETTER_I_ACC
+            {
+                    DIPHTHONG($1, $2, PHON_I, CHF_VOWEL);
+                    $$ = $1;
+            }
+          | LETTER_OMICRON LETTER_I
+            {
+                    DIPHTHONG($1, $2, PHON_I, CHF_VOWEL);
+                    $$ = $1;
+            }
+          | LETTER_OMICRON LETTER_I_ACC
+            {
+                    DIPHTHONG($1, $2, PHON_I, CHF_VOWEL);
+                    $$ = $1;
+            }
+          | LETTER_Y LETTER_I
+            {
+                    DIPHTHONG($1, $2, PHON_I, CHF_VOWEL);
+                    $$ = $1;
+            }
+          | LETTER_Y LETTER_I_ACC
+            {
+                    DIPHTHONG($1, $2, PHON_I, CHF_VOWEL);
+                    $$ = $1;
+            }
+          | LETTER_OMICRON LETTER_Y
+            {
+                    DIPHTHONG($1, $2, PHON_U, CHF_VOWEL);
+                    $$ = $1;
+            }
+          | LETTER_OMICRON LETTER_Y_ACC
+            {
+                    DIPHTHONG($1, $2, PHON_U, CHF_VOWEL);
+                    $$ = $1;
+            }
+            /* Combinations flagged as consonants. */
+          | LETTER_M LETTER_P
+            {
+                    DIPHTHONG($1, $2, PHON_B, CHF_CONSONANT);
+                    $$ = $1;
+            }
+          | LETTER_N LETTER_T
+            {
+                    DIPHTHONG($1, $2, PHON_D, CHF_CONSONANT);
+                    $$ = $1;
+            }
+          | LETTER_G LETTER_G
+            {
+                    DIPHTHONG($1, $2, PHON_G, CHF_CONSONANT);
+                    $$ = $1;
+            }
+          | LETTER_G LETTER_K
+            {
+                    DIPHTHONG($1, $2, PHON_G, CHF_CONSONANT);
+                    $$ = $1;
+            }
+          | LETTER_G LETTER_X
+            {
+                    DIPHTHONG($1, $2, PHON_G, CHF_CONSONANT);
+                    $$ = $1;
+            }
+          | LETTER_S LETTER_M
+            {
+                    DIPHTHONG($1, $2, PHON_ZM, CHF_CONSONANT);
+                    $$ = $1;
+            }
+          | LETTER_T LETTER_S
+            {
+                    DIPHTHONG($1, $2, PHON_TS, CHF_CONSONANT);
+                    $$ = $1;
+            }
+          | LETTER_T LETTER_Z
+            {
+                    DIPHTHONG($1, $2, PHON_DZ, CHF_CONSONANT);
+                    $$ = $1;
+            }
+          | LETTER_G LETTER_KS
+            {
+                    DIPHTHONG($1, $2, PHON_NGZ, CHF_CONSONANT);
+                    $$ = $1;
+            }
+            /* Combinations passed with FL = 0: neither CHF_VOWEL nor
+               CHF_CONSONANT is set on the result. */
+          | LETTER_A LETTER_Y
+            {
+                    DIPHTHONG($1, $2, PHON_AV, 0);
+                    $$ = $1;
+            }
+          | LETTER_A LETTER_Y_ACC
+            {
+                    DIPHTHONG($1, $2, PHON_AV, 0);
+                    $$ = $1;
+            }
+          | LETTER_E LETTER_Y
+            {
+                    DIPHTHONG($1, $2, PHON_EV, 0);
+                    $$ = $1;
+            }
+          | LETTER_E LETTER_Y_ACC
+            {
+                    DIPHTHONG($1, $2, PHON_EV, 0);
+                    $$ = $1;
+            }
+          ;
+
+%%
+
+/* Scanner state: the word being analyzed, its length in elements, and
+   the index of the next element to read.  Set up by phoneme_map. */
+static unsigned *input_base;
+static size_t input_len;
+static size_t input_pos;
+
+/* True if CI is a non-null character descriptor with a non-zero letter
+   code. */
+#define ISALPHA(ci) ((ci) && (ci)->letter)
+
+/* The only character flags that are copied into a phoneme. */
+#define PHONEME_FLAG_MASK \
+        (CHF_ACCENT_MASK|CHF_VOWEL|CHF_CONSONANT)
+
+/* Return the token for the next letter of the input word.
+
+   Elements for which elchr_info yields no descriptor, or a descriptor
+   whose letter code is 0, are skipped.  For the letter found, yylval
+   is filled with its phoneme code, its index in the input, a count of
+   1 and its flags restricted to PHONEME_FLAG_MASK; the letter code is
+   returned as the token number.  Returns 0 (end of input) when the
+   word is exhausted. */
+int
+yylex(void)
+{
+        unsigned c;
+        struct char_info_st const *ci;
+
+        do {
+                if (input_pos == input_len)
+                        return 0;
+                c = input_base[input_pos++];
+                ci = elchr_info(c);
+        } while (!ISALPHA(ci));
+
+        yylval.phoneme.code = ci->phoneme;
+        /* input_pos was already advanced past this letter */
+        yylval.phoneme.start = input_pos - 1;
+        yylval.phoneme.count = 1;
+        yylval.phoneme.flags = ci->flags & PHONEME_FLAG_MASK;
+        return ci->letter;
+}
+
+/* Parser error hook: report S on stderr and abort the process.
+
+   The file name and line number printed are those of this function,
+   not a position in the input.  The function never returns. */
+int
+yyerror(const char *s)
+{
+        fprintf(stderr, "\n%s:%d: INTERNAL ERROR: %s\n",
+                __FILE__, __LINE__, s);
+        abort();
+}
+
+/* Build the phoneme map of WORD, an array of LEN character codes.
+
+   On success returns 0, stores a malloc-family allocated array of
+   phonemes in *PPH and its element count in *PLEN.
+   NOTE(review): nothing here frees that array, so the caller presumably
+   owns it and must free() it -- confirm against the callers.
+
+   On failure returns a non-zero errno value, also stored in errno:
+   EINVAL if yyparse reports failure, or the code recorded by
+   phoneme_append (ENOMEM).  *PPH and *PLEN are not modified then.
+
+   The function keeps its state in file-scope variables. */
+int
+phoneme_map(struct phoneme **pph, size_t *plen, unsigned *word, size_t len)
+{
+        int rc;
+
+        input_base = word;
+        input_len = len;
+        input_pos = 0;
+        phoneme_base = NULL;
+        phoneme_max = 0;
+        phoneme_count = 0;
+        error_state = 0;
+        rc = yyparse();
+        if (rc) {
+                free(phoneme_base);
+                phoneme_base = NULL;
+                errno = EINVAL;
+                return errno;
+        }
+        if (error_state) {
+                free(phoneme_base);
+                phoneme_base = NULL;
+                errno = error_state;
+                return errno;
+        }
+        if (phoneme_count > 0 && phoneme_count < phoneme_max) {
+                /* Trim the unused capacity.  If the shrink fails the
+                   larger buffer is still valid, so keep it rather than
+                   replacing the pointer with NULL. */
+                struct phoneme *np =
+                        realloc(phoneme_base,
+                                phoneme_count * sizeof(phoneme_base[0]));
+                if (np)
+                        phoneme_base = np;
+        }
+        *pph = phoneme_base;
+        *plen = phoneme_count;
+        return 0;
+}
--
cgit v1.2.1