From 8d4d5b758ef1f09a4ed39a25a07a09f9c26d0aec Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff Date: Sat, 4 Jun 2011 13:12:10 +0000 Subject: Add new functions for operations over elstrs. * src/ellinika/utf8.c (utf8_wc_strnchr) (utf8_wc_strnstr): New functions. * src/ellinika/utf8.h (utf8_wc_strnchr) (utf8_wc_strnstr): New protos. * src/ellinika/elmorph.c (_elstr_alloc): Reuse existing sylmap. (_elstr_slice): New function. (elstr-slice,elstr-slice!,elstr-index): New functions. * src/ellinika/elmorph.scm4 (elstr-trim,elstr-trim!): New functions. git-svn-id: file:///home/puszcza/svnroot/ellinika/trunk@562 941c8c0f-9102-463b-b60b-cd22ce0e6858 --- src/ellinika/elmorph.c | 107 ++++++++++++++++++++++++++++++++++++++++++++-- src/ellinika/elmorph.scm4 | 19 ++++++++ src/ellinika/utf8.c | 40 +++++++++++++++++ src/ellinika/utf8.h | 4 ++ 4 files changed, 166 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c index 88520a7..6ff5f01 100644 --- a/src/ellinika/elmorph.c +++ b/src/ellinika/elmorph.c @@ -43,8 +43,11 @@ _elstr_syllabize(struct elstr *elstr) unsigned i, nsyl = 0, accsyl = 0, accchr = 0; int dstate = 0; int acc = 0; - - sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, "syllable map"); + + if (!elstr->sylmap) + elstr->sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, + "syllable map"); + sylmap = elstr->sylmap; for (i = 0; i < elstr->len; i++) { int nstate; @@ -66,7 +69,6 @@ _elstr_syllabize(struct elstr *elstr) sylmap[nsyl++] = i - 1; else sylmap[nsyl-1] = i - 1; - elstr->sylmap = sylmap; elstr->nsyl = nsyl; elstr->acc_pos = accchr; elstr->acc_syl = nsyl - accsyl; @@ -85,7 +87,7 @@ _elstr_alloc(const char *instr) elstr = scm_gc_malloc(sizeof(*elstr), "Elstr"); elstr->str = wptr; elstr->len = wlen; - + elstr->sylmap = NULL; _elstr_syllabize(elstr); SCM_RETURN_NEWSMOB(_elstr_tag, elstr); @@ -540,6 +542,7 @@ SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask", unsigned num; 
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+	SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
 	elstr = (struct elstr*) SCM_CDR(el);
 	num = scm_to_uint(n);
 	if (num >= elstr->len)
@@ -655,6 +658,117 @@ SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0,
 	return _elstr_thema_aoristoy(thema, 1, FUNC_NAME);
 }
 #undef FUNC_NAME
+
+/* Common implementation of elstr-slice and elstr-slice!.
+
+   EL is the elstr to slice, N is the starting offset (a negative N is
+   counted from the end of the string) and L is the number of characters
+   to keep.  If DESTRUCTIVE is non-zero, EL itself is modified and
+   SCM_UNSPECIFIED is returned; otherwise a duplicate of EL is sliced
+   and returned.  FUNC_NAME is the procedure name used in error reports.
+
+   Signals a misc error if the requested range does not fit in EL.  */
+static SCM
+_elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name)
+{
+	struct elstr *elstr;
+	int num;
+	unsigned len;
+	SCM scm;
+
+	SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
+	SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name);
+	SCM_ASSERT(scm_is_integer(l), l, SCM_ARG3, func_name);
+	elstr = (struct elstr*) SCM_CDR(el);
+	num = scm_to_int(n);
+	len = scm_to_uint(l);
+
+	/* A negative offset is relative to the end of the string. */
+	if (num < 0)
+		num += elstr->len;
+	/* Test LEN against the room left after NUM rather than NUM + LEN:
+	   the sum wraps around for a very large LEN and would pass.  */
+	if (num < 0 || num >= elstr->len || len > elstr->len - num)
+		scm_misc_error(func_name,
+			       "invalid offset or length",
+			       SCM_EOL);
+
+	if (destructive)
+		scm = SCM_UNSPECIFIED;
+	else {
+		scm = _elstr_dup(elstr);
+		elstr = (struct elstr*) SCM_CDR(scm);
+	}
+
+	/* Move the retained characters to the start of the buffer, then
+	   recompute the syllable data for the new length.  */
+	if (num)
+		memmove(elstr->str, elstr->str + num,
+			sizeof(elstr->str[0]) * len);
+	elstr->len = len;
+	_elstr_syllabize(elstr);
+	return scm;
+}
+
+SCM_DEFINE_PUBLIC(scm_elstr_slice, "elstr-slice",
+		  3, 0, 0,
+		  (SCM word, SCM off, SCM len),
+"Extract LEN characters from WORD starting from position OFF\n")
+#define FUNC_NAME s_scm_elstr_slice
+{
+	return _elstr_slice(word, off, len, 0, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_slice_x, "elstr-slice!",
+		  3, 0, 0,
+		  (SCM word, SCM off, SCM len),
+"Extract LEN characters from WORD starting from position OFF (destructive)\n")
+#define FUNC_NAME s_scm_elstr_slice_x
+{
+	return _elstr_slice(word, off, len, 1, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_index, "elstr-index",
+		  2, 0, 0,
+		  (SCM word, SCM needle),
+"Return the zero-based position of the first occurrence of NEEDLE in WORD,\n"
+"or #f if NEEDLE does not occur in WORD.  NEEDLE is an elstr or a string.\n")
+#define FUNC_NAME s_scm_elstr_index
+{
+	struct elstr *elstr;
+	unsigned *wc, *wtmp = NULL, *p;
+	unsigned wlen;
+
+	SCM_ASSERT(scm_is_elstr(word), word, SCM_ARG1, FUNC_NAME);
+	elstr = (struct elstr*) SCM_CDR(word);
+	if (scm_is_elstr(needle)) {
+		struct elstr *ep = (struct elstr*) SCM_CDR(needle);
+		wc = ep->str;
+		wlen = ep->len;
+	} else {
+		char *str;
+
+		SCM_ASSERT(scm_is_string(needle), needle, SCM_ARG2, FUNC_NAME);
+		str = scm_to_locale_string(needle);
+		if (utf8_mbstr_to_wc(str, &wtmp, &wlen)) {
+			free(str);
+			scm_misc_error(FUNC_NAME,
+				       "Invalid needle string: ~S",
+				       scm_list_1(needle));
+		}
+		free(str);
+		wc = wtmp;
+	}
+	p = (unsigned*)utf8_wc_strnstr(elstr->str, elstr->len, wc, wlen);
+	/* WTMP is still NULL if NEEDLE was an elstr; free(NULL) is a no-op. */
+	free(wtmp);
+	if (p)
+		return scm_from_int(p - elstr->str);
+	return SCM_BOOL_F;
+}
+#undef FUNC_NAME
 
 
 void
diff --git a/src/ellinika/elmorph.scm4 b/src/ellinika/elmorph.scm4
index 546bcb5..e3ed4b5 100644
--- a/src/ellinika/elmorph.scm4
+++ b/src/ellinika/elmorph.scm4
@@ -19,3 +19,27 @@
 
 (load-extension "LIBDIR/libguile-elmorph-v-VERSION"
 		"scm_init_ellinika_elmorph_module")
+
+;; Return a copy of WORD with N characters removed from its beginning (N > 0)
+;; or -N characters removed from its end (N < 0).  If N is 0, WORD itself is
+;; returned.
+(define-public (elstr-trim word n)
+  (cond
+   ((> n 0)
+    (elstr-slice word n (- (elstr-length word) n)))
+   ((< n 0)
+    (elstr-slice word 0 (+ (elstr-length word) n)))
+   (else
+    word)))
+
+;; Destructive version of elstr-trim: WORD itself is sliced.  When N is 0,
+;; WORD is left untouched.
+(define-public (elstr-trim! word n)
+  (cond
+   ((> n 0)
+    (elstr-slice! word n (- (elstr-length word) n)))
+   ((< n 0)
+    (elstr-slice!
word 0 (+ (elstr-length word) n)))))
+
+
+
diff --git a/src/ellinika/utf8.c b/src/ellinika/utf8.c
index 952af07..b946a3b 100644
--- a/src/ellinika/utf8.c
+++ b/src/ellinika/utf8.c
@@ -1932,6 +1932,17 @@ utf8_wc_strcasecmp(const unsigned *a, const unsigned *b)
 	return 0;
 }
 
+/* Return a pointer to the first occurrence of CHR within the first LEN
+   elements of STR, or NULL if there is none.  */
+const unsigned *
+utf8_wc_strnchr(const unsigned *str, unsigned chr, size_t len)
+{
+	for (; len; str++, len--)
+		if (*str == chr)
+			return str;
+	return NULL;
+}
+
 const unsigned *
 utf8_wc_strchr(const unsigned *str, unsigned chr)
 {
@@ -1980,6 +1991,39 @@ utf8_wc_strstr(const unsigned *haystack, const unsigned *needle)
 	return NULL;
 }
 
+/* Find the first occurrence of NEEDLE (NLEN elements) within HAYSTACK
+   (HLEN elements).  Neither array needs to be zero-terminated.
+   Return a pointer to the start of the match, HAYSTACK itself if NEEDLE
+   is empty, or NULL if there is no match.  */
+const unsigned *
+utf8_wc_strnstr(const unsigned *haystack, size_t hlen,
+		const unsigned *needle, size_t nlen)
+{
+	unsigned first;
+
+	/* Is needle empty? */
+	if (nlen == 0)
+		return haystack;
+	first = needle[0];
+	/* Is needle nearly empty? */
+	if (nlen == 1)
+		return utf8_wc_strnchr(haystack, first, hlen);
+	/* Stop as soon as the rest of the haystack is shorter than the
+	   needle, so the comparison below never reads past its end.  */
+	for (; hlen >= nlen; haystack++, hlen--)
+		if (*haystack == first) {
+			/* Compare with needle's remaining units. */
+			size_t len = 1;
+
+			while (len < nlen && haystack[len] == needle[len])
+				len++;
+			if (len == nlen)
+				return haystack;
+		}
+
+	return NULL;
+}
+
 unsigned *
 utf8_wc_quote(const unsigned *s)
 {
diff --git a/src/ellinika/utf8.h b/src/ellinika/utf8.h
index ce26f09..c4b5e44 100644
--- a/src/ellinika/utf8.h
+++ b/src/ellinika/utf8.h
@@ -58,9 +58,13 @@ int utf8_quote (const char *str, char **sptr);
 unsigned *utf8_wc_quote (const unsigned *s);
 
 const unsigned *utf8_wc_strchr(const unsigned *str, unsigned chr);
+const unsigned *utf8_wc_strnchr(const unsigned *str, unsigned chr, size_t len);
 const unsigned *utf8_wc_strchr_ci(const unsigned *str, unsigned chr);
 const unsigned *utf8_wc_strstr(const unsigned *haystack,
 			       const unsigned *needle);
+const unsigned *utf8_wc_strnstr(const unsigned *haystack, size_t hlen,
+				const unsigned *needle, size_t nlen);
+
 
 void utf8_wc_strupper(unsigned *str);
 void utf8_wc_strlower(unsigned *str);
-- 
cgit v1.2.1