diff options
Diffstat (limited to 'src/ellinika')
-rw-r--r-- | src/ellinika/elmorph.c | 107 | ||||
-rw-r--r-- | src/ellinika/elmorph.scm4 | 19 | ||||
-rw-r--r-- | src/ellinika/utf8.c | 40 | ||||
-rw-r--r-- | src/ellinika/utf8.h | 4 |
4 files changed, 166 insertions, 4 deletions
diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c
index 88520a7..6ff5f01 100644
--- a/src/ellinika/elmorph.c
+++ b/src/ellinika/elmorph.c
@@ -45,4 +45,7 @@ _elstr_syllabize(struct elstr *elstr)
 	int acc = 0;
-
-	sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, "syllable map");
+
+	if (!elstr->sylmap)
+		elstr->sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len,
+					      "syllable map");
+	sylmap = elstr->sylmap;
 
@@ -68,3 +71,2 @@ _elstr_syllabize(struct elstr *elstr)
 	sylmap[nsyl-1] = i - 1;
-	elstr->sylmap = sylmap;
 	elstr->nsyl = nsyl;
@@ -87,3 +89,3 @@ _elstr_alloc(const char *instr)
 	elstr->len = wlen;
-
+	elstr->sylmap = NULL;
 	_elstr_syllabize(elstr);
@@ -542,2 +544,3 @@ SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask",
 	SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+	SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
 	elstr = (struct elstr*) SCM_CDR(el);
@@ -657,2 +660,98 @@ SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0,
 #undef FUNC_NAME
+
+static SCM
+_elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name)
+{
+	struct elstr *elstr;
+	int num;
+	unsigned len;
+	SCM scm;
+
+	SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
+	SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name);
+	SCM_ASSERT(scm_is_integer(l), l, SCM_ARG3, func_name);
+	elstr = (struct elstr*) SCM_CDR(el);
+	num = scm_to_int(n);
+	len = scm_to_uint(l);
+
+	if (num < 0)	/* a negative offset counts from the end */
+		num += elstr->len;
+	/* Subtract rather than add: num + len could wrap around. */
+	if (num < 0 || num >= elstr->len || len > elstr->len - num)
+		scm_misc_error(func_name, "invalid offset or length",
+			       SCM_EOL);
+
+	if (destructive)
+		scm = SCM_UNSPECIFIED;
+	else {
+		scm = _elstr_dup(elstr);
+		elstr = (struct elstr*) SCM_CDR(scm);
+	}
+
+	if (num)
+		memmove(elstr->str, elstr->str + num,
+			sizeof(elstr->str[0]) * len);
+	elstr->len = len;
+	_elstr_syllabize(elstr);
+	return scm;
+}
+
+SCM_DEFINE_PUBLIC(scm_elstr_slice, "elstr-slice",
+		  3, 0, 0,
+		  (SCM word, SCM off, SCM len),
+"Extract LEN characters from WORD starting from position OFF\n")
+#define FUNC_NAME s_scm_elstr_slice
+{
+	return _elstr_slice(word, off, len, 0, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_slice_x, "elstr-slice!",
+		  3, 0, 0,
+		  (SCM word, SCM off, SCM len),
+"Extract LEN characters from WORD starting from position OFF (destructive)\n")
+#define FUNC_NAME s_scm_elstr_slice_x
+{
+	return _elstr_slice(word, off, len, 1, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_index, "elstr-index",
+		  2, 0, 0,
+		  (SCM word, SCM needle),
+"Return the offset of the first occurrence of NEEDLE in WORD, or #f if absent\n")
+#define FUNC_NAME s_scm_elstr_index
+{
+	struct elstr *elstr;
+	unsigned *wc, *wtmp = NULL, *p;
+	unsigned wlen;
+
+	SCM_ASSERT(scm_is_elstr(word), word, SCM_ARG1, FUNC_NAME);
+	elstr = (struct elstr*) SCM_CDR(word);
+	if (scm_is_elstr(needle)) {
+		struct elstr *ep = (struct elstr*) SCM_CDR(needle);
+		wc = ep->str;
+		wlen = ep->len;
+	} else {
+		/* Any other needle must be a Scheme string. */
+		char *str;
+
+		SCM_ASSERT(scm_is_string(needle), needle, SCM_ARG2, FUNC_NAME);
+		str = scm_to_locale_string(needle);
+		if (utf8_mbstr_to_wc(str, &wtmp, &wlen)) {
+			free(str);
+			scm_misc_error(FUNC_NAME,
+				       "Invalid needle string: ~S",
+				       scm_list_1(needle));
+		}
+		free(str);
+		wc = wtmp;
+	}
+	p = (unsigned*)utf8_wc_strnstr(elstr->str, elstr->len, wc, wlen);
+	free(wtmp);	/* still NULL when the needle was an elstr */
+	if (p)
+		return scm_from_int(p - elstr->str);
+	return SCM_BOOL_F;
+}
+#undef FUNC_NAME
 
diff --git a/src/ellinika/elmorph.scm4 b/src/ellinika/elmorph.scm4
index 546bcb5..e3ed4b5 100644
--- a/src/ellinika/elmorph.scm4
+++ b/src/ellinika/elmorph.scm4
@@ -21 +21,20 @@
 	     "scm_init_ellinika_elmorph_module")
+
+(define-public (elstr-trim word n)
+  (cond
+   ((> n 0)
+    (elstr-slice word n (- (elstr-length word) n)))
+   ((< n 0)
+    (elstr-slice word 0 (+ (elstr-length word) n)))
+   (else
+    word)))
+
+(define-public (elstr-trim! word n)
+  (cond
+   ((> n 0)
+    (elstr-slice! word n (- (elstr-length word) n)))
+   ((< n 0)
+    (elstr-slice! word 0 (+ (elstr-length word) n)))))
+
+
+
diff --git a/src/ellinika/utf8.c b/src/ellinika/utf8.c
index 952af07..b946a3b 100644
--- a/src/ellinika/utf8.c
+++ b/src/ellinika/utf8.c
@@ -1935,2 +1935,11 @@ utf8_wc_strcasecmp(const unsigned *a, const unsigned *b)
 const unsigned *
+utf8_wc_strnchr(const unsigned *str, unsigned chr, size_t len)
+{
+	for (; len; str++, len--)
+		if (*str == chr)
+			return str;
+	return NULL;
+}
+
+const unsigned *
 utf8_wc_strchr(const unsigned *str, unsigned chr)
@@ -1982,2 +1991,33 @@ utf8_wc_strstr(const unsigned *haystack, const unsigned *needle)
 
+const unsigned *
+utf8_wc_strnstr(const unsigned *haystack, size_t hlen,
+		const unsigned *needle, size_t nlen)
+{
+	unsigned first;
+
+	/* An empty needle matches at the start of the haystack. */
+	if (nlen == 0)
+		return haystack;
+	first = needle[0];
+	/* A one-unit needle reduces to a bounded character search. */
+	if (nlen == 1)
+		return utf8_wc_strnchr(haystack, first, hlen);
+	for (; hlen >= nlen; haystack++, hlen--)
+		if (*haystack == first) {
+			/* hlen >= nlen keeps hptr within the haystack. */
+			const unsigned *hptr = haystack + 1;
+			size_t len = 1;
+			for (;;) {
+				if (*hptr != needle[len])
+					break;
+				hptr++;
+				len++;
+				if (len == nlen)
+					return haystack;
+			}
+		}
+
+	return NULL;
+}
+
 unsigned *
diff --git a/src/ellinika/utf8.h b/src/ellinika/utf8.h
index ce26f09..c4b5e44 100644
--- a/src/ellinika/utf8.h
+++ b/src/ellinika/utf8.h
@@ -60,2 +60,3 @@ unsigned *utf8_wc_quote (const unsigned *s);
 const unsigned *utf8_wc_strchr(const unsigned *str, unsigned chr);
+const unsigned *utf8_wc_strnchr(const unsigned *str, unsigned chr, size_t len);
 const unsigned *utf8_wc_strchr_ci(const unsigned *str, unsigned chr);
@@ -63,2 +64,5 @@ const unsigned *utf8_wc_strstr(const unsigned *haystack,
 			       const unsigned *needle);
+const unsigned *utf8_wc_strnstr(const unsigned *haystack, size_t hlen,
+				const unsigned *needle, size_t nlen);
+
 