/* This file is part of Ellinika project.
Copyright (C) 2011 Sergey Poznyakoff
Ellinika is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
(at your option) any later version.
Ellinika is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
#ifdef HAVE_CONFIG_H
# include
#endif
#include
#include
#include
#include "utf8.h"
#include "elmorph.h"
struct elstr {
unsigned *str; /* UTF-8 string */
size_t len; /* Its length */
unsigned nsyl; /* Number of syllables. */
unsigned *sylmap; /* Syllable map (nsyl elements) */
unsigned acc_syl; /* Number of the accented syllable
(1-based, from the last syllable) */
unsigned acc_pos; /* Number of the accented character
(0-based, from str[0]) */
};
scm_t_bits _elstr_tag;
static void
_elstr_syllabize(struct elstr *elstr)
{
unsigned *sylmap;
unsigned i, nsyl = 0, accsyl = 0, accchr = 0;
int dstate = 0;
int acc = 0;
sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, "syllable map");
for (i = 0; i < elstr->len; i++) {
int nstate;
if (elchr_getaccent(elstr->str[i])) {
accsyl = nsyl;
accchr = i;
}
nstate = elchr_diphthong(elstr->str[i], dstate);
if (nstate)
/* skip */;
else if (dstate)
sylmap[nsyl++] = i - 1;
else if (elchr_isvowel(elstr->str[i]))
sylmap[nsyl++] = i;
dstate = nstate;
}
if (dstate)
sylmap[nsyl++] = i - 1;
else
sylmap[nsyl-1] = i - 1;
elstr->sylmap = sylmap;
elstr->nsyl = nsyl;
elstr->acc_pos = accchr;
elstr->acc_syl = nsyl - accsyl;
}
static SCM
_elstr_alloc(const char *instr)
{
struct elstr *elstr;
unsigned *wptr;
size_t wlen;
if (utf8_mbstr_to_wc(instr, &wptr, &wlen))
return SCM_EOL;
elstr = scm_gc_malloc(sizeof(*elstr), "Elstr");
elstr->str = wptr;
elstr->len = wlen;
_elstr_syllabize(elstr);
SCM_RETURN_NEWSMOB(_elstr_tag, elstr);
}
static SCM
_elstr_dup(struct elstr *elstr)
{
struct elstr *elnew;
elnew = scm_gc_malloc(sizeof(*elstr), "Elstr");
elnew->str = calloc(elstr->len, sizeof(elnew->str[0]));
if (!elnew->str)
scm_memory_error("_elstr_dup");
elnew->sylmap = calloc(elstr->nsyl, sizeof(elnew->sylmap[0]));
if (!elnew->sylmap) {
free(elnew->str);
scm_memory_error("_elstr_dup");
}
memcpy(elnew->str, elstr->str, sizeof(elstr->str[0]) * elstr->len);
elnew->len = elstr->len;
elnew->nsyl = elstr->nsyl;
memcpy(elnew->sylmap, elstr->sylmap,
sizeof(elstr->sylmap[0]) * elstr->nsyl);
elnew->acc_syl = elstr->acc_syl;
elnew->acc_pos = elstr->acc_pos;
SCM_RETURN_NEWSMOB(_elstr_tag, elnew);
}
static scm_sizet
_elstr_free(SCM smob)
{
struct elstr *elstr = (struct elstr *) SCM_CDR(smob);
free(elstr->str);
free(elstr->sylmap);
free(elstr);
return 0;
}
static int
_elstr_print(SCM smob, SCM port, scm_print_state *pstate)
{
struct elstr *elstr = (struct elstr *) SCM_CDR(smob);
int i, j, an;
char *s;
scm_puts("#nsyl - elstr->acc_syl;
if (an == 0)
scm_puts("[", port);
for (i = j = 0; i < elstr->len; i++) {
char r[6];
int n;
if (i == elstr->sylmap[j] + 1) {
if (j == an)
scm_puts("]", port);
scm_puts("-", port);
if (++j == an)
scm_puts("[", port);
}
n = utf8_wctomb(r, elstr->str[i]);
if (n == -1)
continue;
r[n] = 0;
scm_puts(r, port);
}
if (j == an)
scm_puts("]", port);
scm_puts("''>", port);
return 1;
}
static void
_elstr_init()
{
_elstr_tag = scm_make_smob_type("Elstr", sizeof(struct elstr));
scm_set_smob_free(_elstr_tag, _elstr_free);
scm_set_smob_print(_elstr_tag, _elstr_print);
}
SCM_DEFINE_PUBLIC(scm_string__elstr, "string->elstr", 1, 0, 0,
(SCM string),
"Create new ELSTR from STRING\n")
#define FUNC_NAME s_scm_string__elstr
{
char *str;
SCM scm;
SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME);
str = scm_to_locale_string(string);
scm = _elstr_alloc(str);
free(str);
if (scm == SCM_EOL)
scm_misc_error(FUNC_NAME,
"Invalid input string: ~S",
scm_list_1(string));
return scm;
}
#undef FUNC_NAME
#define scm_is_elstr(s) (!SCM_IMP(s) && SCM_CELL_TYPE(s) == _elstr_tag)
SCM_DEFINE_PUBLIC(scm_elstr__string, "elstr->string", 1, 0, 0,
(SCM el),
"Convert EL to a STRING\n")
#define FUNC_NAME s_scm_elstr__string
{
struct elstr *elstr;
char *s;
SCM scm;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
elstr = (struct elstr*) SCM_CDR(el);
if (utf8_wc_to_mbstr(elstr->str, elstr->len, &s))
scm_misc_error(FUNC_NAME,
"cannot convert elstr to Scheme",
SCM_EOL);
scm = scm_from_locale_string(s);
free(s);
return scm;
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_length, "elstr-length", 1, 0, 0,
(SCM el),
"Returns the number of characters in EL\n")
#define FUNC_NAME s_scm_elstr_length
{
struct elstr *elstr;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
elstr = (struct elstr*) SCM_CDR(el);
return scm_from_uint(elstr->len);
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_number_of_syllables, "elstr-number-of-syllables",
1, 0, 0,
(SCM el),
"Returns the number of characters in EL\n")
#define FUNC_NAME s_scm_elstr_number_of_syllables
{
struct elstr *elstr;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
elstr = (struct elstr*) SCM_CDR(el);
return scm_from_uint(elstr->nsyl);
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_syllable_prop, "elstr-syllable-prop",
2, 0, 0,
(SCM el, SCM n),
"Returns properties of the syllable N in EL\n")
#define FUNC_NAME s_scm_elstr_syllable_prop
{
struct elstr *elstr;
unsigned num, start;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
elstr = (struct elstr*) SCM_CDR(el);
SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
num = scm_to_uint(n);
if (num > elstr->nsyl)
scm_misc_error(FUNC_NAME,
"cannot get syllable #~S: not enough syllables: ~S",
scm_list_2(el, n));
num = elstr->nsyl - num;
if (num == 0)
start = 0;
else
start = elstr->sylmap[num - 1] + 1;
return scm_cons(scm_from_uint(start),
scm_from_uint(elstr->sylmap[num]));
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_accent_position, "elstr-accent-position", 1, 0, 0,
(SCM el),
"Return position of the accented character in EL\n")
#define FUNC_NAME s_scm_elstr_accent_position
{
struct elstr *elstr;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
elstr = (struct elstr*) SCM_CDR(el);
return scm_from_uint(elstr->acc_pos);
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_accented_syllable, "elstr-accented-syllable",
1, 0, 0,
(SCM el),
"Return position of the accented syllable in EL\n")
#define FUNC_NAME s_scm_elstr_accented_syllable
{
struct elstr *elstr;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
elstr = (struct elstr*) SCM_CDR(el);
return scm_from_uint(elstr->acc_syl);
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_syllable, "elstr-syllable",
2, 0, 0,
(SCM el, SCM n),
"Return Nth syllable in EL\n")
#define FUNC_NAME s_scm_elstr_accented_syllable
{
struct elstr *elstr;
char *s;
SCM scm;
unsigned num, start;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
elstr = (struct elstr*) SCM_CDR(el);
SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
num = scm_to_uint(n);
if (num > elstr->nsyl)
scm_misc_error(FUNC_NAME,
"cannot get syllable #~S: not enough syllables: ~S",
scm_list_2(el, n));
num = elstr->nsyl - num;
if (num == 0)
start = 0;
else
start = elstr->sylmap[num - 1] + 1;
if (utf8_wc_to_mbstr(elstr->str + start,
elstr->sylmap[num] - start + 1,
&s))
scm_misc_error(FUNC_NAME,
"cannot convert elstr to Scheme",
SCM_EOL);
scm = scm_from_locale_string(s);
free(s);
return scm;
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_character, "elstr-character",
2, 0, 0,
(SCM el, SCM n),
"Return Nth character in EL\n")
#define FUNC_NAME s_scm_elstr_character
{
struct elstr *elstr;
unsigned num;
char r[6];
int len;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
elstr = (struct elstr*) SCM_CDR(el);
SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
num = scm_to_uint(n);
if (num >= elstr->len)
scm_misc_error(FUNC_NAME,
"cannot get character #~S: not enough characters: ~S",
scm_list_2(el, n));
len = utf8_wctomb(r, elstr->str[num]);
if (len <= 0)
scm_misc_error(FUNC_NAME,
"cannot convert elchr to Scheme",
SCM_EOL);
r[len] = 0;
return scm_from_locale_string(r);
}
#undef FUNC_NAME
static SCM
_elstr_chgcase(SCM el, void (*chgfun)(unsigned *, size_t),
int destructive, const char *func_name)
{
struct elstr *elstr;
SCM scm;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
elstr = (struct elstr*) SCM_CDR(el);
if (destructive)
scm = SCM_UNSPECIFIED;
else {
scm = _elstr_dup(elstr);
elstr = (struct elstr*) SCM_CDR(scm);
}
chgfun(elstr->str, elstr->len);
return scm;
}
SCM_DEFINE_PUBLIC(scm_elstr_toupper, "elstr-toupper",
1, 0, 0,
(SCM el),
"Convert EL to upper case\n")
#define FUNC_NAME s_scm_elstr_toupper
{
return _elstr_chgcase(el, utf8_wc_strnupper, 0, FUNC_NAME);
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_tolower, "elstr-tolower",
1, 0, 0,
(SCM el),
"Convert EL to lower case\n")
#define FUNC_NAME s_scm_elstr_tolower
{
return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME);
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_toupper_x, "elstr-toupper!",
1, 0, 0,
(SCM el),
"Convert EL to upper case (destructive)\n")
#define FUNC_NAME s_scm_elstr_toupper_x
{
return _elstr_chgcase(el, utf8_wc_strnupper, 1, FUNC_NAME);
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_tolower_x, "elstr-tolower!",
1, 0, 0,
(SCM el),
"Convert EL to lower case (destructive)\n")
#define FUNC_NAME s_scm_elstr_tolower_x
{
return _elstr_chgcase(el, utf8_wc_strnlower, 0, FUNC_NAME);
}
#undef FUNC_NAME
static SCM
_elstr_deaccent(SCM el, int destructive, const char *func_name)
{
struct elstr *elstr;
unsigned i;
SCM scm;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
elstr = (struct elstr*) SCM_CDR(el);
if (destructive)
scm = SCM_UNSPECIFIED;
else {
scm = _elstr_dup(elstr);
elstr = (struct elstr*) SCM_CDR(scm);
}
for (i = 0; i < elstr->len; i++)
elstr->str[i] = elchr_deaccent(elstr->str[i]);
elstr->acc_pos = 0;
elstr->acc_syl = 0;
return scm;
}
SCM_DEFINE_PUBLIC(scm_elstr_deaccent, "elstr-deaccent",
1, 0, 0,
(SCM el),
"Remove all accents from EL\n")
#define FUNC_NAME s_scm_elstr_deaccent
{
return _elstr_deaccent(el, 0, FUNC_NAME);
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_deaccent_x, "elstr-deaccent!",
1, 0, 0,
(SCM el),
"Remove all accents from EL (desctructive)\n")
#define FUNC_NAME s_scm_elstr_deaccent_x
{
return _elstr_deaccent(el, 1, FUNC_NAME);
}
#undef FUNC_NAME
static SCM
_elstr_set_accent(SCM el, SCM n, int destructive, const char *func_name)
{
struct elstr *elstr;
unsigned i;
unsigned acc_num, num, len, start;
SCM scm;
int dstate;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name);
elstr = (struct elstr*) SCM_CDR(el);
num = scm_to_uint(n);
if (num > elstr->nsyl)
scm_misc_error(func_name,
"cannot get syllable #~S: not enough syllables: ~S",
scm_list_2(el, n));
acc_num = elstr->nsyl - num;
if (acc_num == 0)
start = 0;
else
start = elstr->sylmap[acc_num - 1] + 1;
if (destructive)
scm = SCM_UNSPECIFIED;
else {
scm = _elstr_dup(elstr);
elstr = (struct elstr*) SCM_CDR(scm);
}
/* Clear all accents */
for (i = 0; i < elstr->len; i++)
elstr->str[i] = elchr_deaccent(elstr->str[i]);
len = elstr->sylmap[acc_num] - start + 1;
dstate = 0;
for (i = start; i <= start + len; i++) {
int nstate;
if (!elchr_isvowel(elstr->str[i])) {
if (dstate) {
--i;
break;
}
continue;
}
nstate = elchr_diphthong(elstr->str[i], dstate);
if (!nstate)
break;
dstate = nstate;
}
elstr->str[i] = elchr_accent(elstr->str[i], CHF_OXEIA);
elstr->acc_syl = num;
return scm;
}
SCM_DEFINE_PUBLIC(scm_elstr_set_accent, "elstr-set-accent",
2, 0, 0,
(SCM el, SCM n),
"Set accent on Nth syllable of EL\n")
{
return _elstr_set_accent(el, n, 0, s_scm_elstr_set_accent);
}
SCM_DEFINE_PUBLIC(scm_elstr_set_accent_x, "elstr-set-accent!",
2, 0, 0,
(SCM el, SCM n),
"Set accent on Nth syllable of EL (destructive)\n")
{
return _elstr_set_accent(el, n, 1, s_scm_elstr_set_accent_x);
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask",
2, 0, 0,
(SCM el, SCM n),
"Returns properties of the Nth char in EL, as a bitmask\n")
#define FUNC_NAME s_scm_elstr_char_prop_bitmask
{
struct elstr *elstr;
unsigned num;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
elstr = (struct elstr*) SCM_CDR(el);
num = scm_to_uint(n);
if (num >= elstr->len)
scm_misc_error(FUNC_NAME,
"cannot get character #~S: not enough characters: ~S",
scm_list_2(el, n));
return scm_from_uint(elchr_flags(elstr->str[num]));
}
#undef FUNC_NAME
static struct deftab {
unsigned val;
char *sym;
} deftab[] = {
{ CHF_OXEIA, "elmorph:oxeia" },
{ CHF_PERISPWMENH, "elmorph:perispwmenh" },
{ CHF_BAREIA, "elmorph:bareia" },
{ CHF_ACCENT_MASK, "elmorph:accent-mask" },
{ CHF_TREMA, "elmorph:trema" },
{ CHF_VOWEL, "elmorph:vowel" },
{ CHF_CONSONANT, "elmorph:consonant" },
{ CHF_SEMIVOWEL, "elmorph:semivowel" },
{ CHF_PUNCT, "elmorph:punct" },
{ CHF_SYMBOL, "elmorph:symbol" },
{ CHF_MODIFIER, "elmorph:modifier" },
{ CHF_ARCHAIC, "elmorph:archaic" },
{ CHF_LOWER, "elmorph:lower" },
{ CHF_UPPER, "elmorph:upper" },
{ CHF_NUMERIC, "elmorph:numeric" },
{ CHF_DIPH1, "elmorph:diph1" },
{ CHF_DIPH2, "elmorph:diph2" }
};
SCM_DEFINE_PUBLIC(scm_utf8_toupper, "utf8-toupper", 1, 0, 0,
(SCM string),
"Convert STRING to uppercase\n")
#define FUNC_NAME s_scm_utf8_toupper
{
char *str;
SCM scm;
SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME);
str = scm_to_locale_string(string);
if (utf8_toupper(str, strlen(str)))
scm_misc_error(FUNC_NAME,
"cannot convert to upper case: ~S",
scm_list_1(string));
scm = scm_from_locale_string(str);
free(str);
return scm;
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_utf8_tolower, "utf8-tolower", 1, 0, 0,
(SCM string),
"Convert STRING to lowercase\n")
#define FUNC_NAME s_scm_utf8_tolower
{
char *str;
SCM scm;
SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME);
str = scm_to_locale_string(string);
if (utf8_tolower(str, strlen(str)))
scm_misc_error(FUNC_NAME,
"cannot convert to lower case: ~S",
scm_list_1(string));
scm = scm_from_locale_string(str);
free(str);
return scm;
}
#undef FUNC_NAME
static SCM
_elstr_thema_aoristoy(SCM el, int destructive, const char *func_name)
{
struct elstr *elstr;
SCM scm;
unsigned *wc;
size_t wclen;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
elstr = (struct elstr*) SCM_CDR(el);
if (destructive)
scm = SCM_UNSPECIFIED;
else {
scm = _elstr_dup(elstr);
elstr = (struct elstr*) SCM_CDR(scm);
}
if (elmorph_thema_aoristoy(elstr->str, elstr->len, &wc, &wclen))
scm_memory_error(func_name);
free(elstr->str);
elstr->str = wc;
elstr->len = wclen;
return scm;
}
SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy, "elstr-thema-aoristoy", 1, 0, 0,
(SCM thema),
"Convert THEMA, which must be a root of present. to an aorist root\n")
#define FUNC_NAME s_scm_elstr_thema_aoristoy
{
return _elstr_thema_aoristoy(thema, 0, FUNC_NAME);
}
#undef FUNC_NAME
SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0,
(SCM thema),
"Convert THEMA, which must be a root of present. to an aorist root (destructive)\n")
#define FUNC_NAME s_scm_elstr_thema_aoristoy_x
{
return _elstr_thema_aoristoy(thema, 1, FUNC_NAME);
}
#undef FUNC_NAME
void
scm_init_ellinika_elmorph_module()
{
int i;
_elstr_init();
for (i = 0; i < sizeof(deftab)/sizeof(deftab[0]); i++) {
scm_c_define(deftab[i].sym, scm_from_uint(deftab[i].val));
scm_c_export(deftab[i].sym, NULL);
}
#include "elmorph.x"
}