summary | refs | log | tree | commit | diff | about
path: root/src/ellinika
author Sergey Poznyakoff <gray@gnu.org.ua> 2011-06-04 13:12:10 (GMT)
committer Sergey Poznyakoff <gray@gnu.org.ua> 2011-06-04 13:12:10 (GMT)
commit 8d4d5b758ef1f09a4ed39a25a07a09f9c26d0aec (patch) (side-by-side diff)
tree acd3235e1e2cdbff16bfb0d9d8ee09065171968e /src/ellinika
parent aeb69dcd0e430b5e539c5ca33c73703dad59253d (diff)
download ellinika-8d4d5b758ef1f09a4ed39a25a07a09f9c26d0aec.tar.gz
ellinika-8d4d5b758ef1f09a4ed39a25a07a09f9c26d0aec.tar.bz2
Add new functions for operations over elstrs.
* src/ellinika/utf8.c (utf8_wc_strnchr)
(utf8_wc_strnstr): New functions.
* src/ellinika/utf8.h (utf8_wc_strnchr)
(utf8_wc_strnstr): New protos.
* src/ellinika/elmorph.c (_elstr_alloc): Reuse existing sylmap.
(_elstr_slice): New function.
(elstr-slice,elstr-slice!,elstr-index): New functions.
* src/ellinika/elmorph.scm4 (elstr-trim,elstr-trim!): New functions.

git-svn-id: file:///home/puszcza/svnroot/ellinika/trunk@562 941c8c0f-9102-463b-b60b-cd22ce0e6858
Diffstat (limited to 'src/ellinika') (more/less context) (ignore whitespace changes)
-rw-r--r-- src/ellinika/elmorph.c 107
-rw-r--r-- src/ellinika/elmorph.scm4 19
-rw-r--r-- src/ellinika/utf8.c 40
-rw-r--r-- src/ellinika/utf8.h 4
4 files changed, 166 insertions, 4 deletions
diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c
index 88520a7..6ff5f01 100644
--- a/src/ellinika/elmorph.c
+++ b/src/ellinika/elmorph.c
@@ -43,8 +43,11 @@ _elstr_syllabize(struct elstr *elstr)
unsigned i, nsyl = 0, accsyl = 0, accchr = 0;
int dstate = 0;
int acc = 0;
-
- sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, "syllable map");
+
+ if (!elstr->sylmap)
+ elstr->sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len,
+ "syllable map");
+ sylmap = elstr->sylmap;
for (i = 0; i < elstr->len; i++) {
int nstate;
@@ -66,7 +69,6 @@ _elstr_syllabize(struct elstr *elstr)
sylmap[nsyl++] = i - 1;
else
sylmap[nsyl-1] = i - 1;
- elstr->sylmap = sylmap;
elstr->nsyl = nsyl;
elstr->acc_pos = accchr;
elstr->acc_syl = nsyl - accsyl;
@@ -85,7 +87,7 @@ _elstr_alloc(const char *instr)
elstr = scm_gc_malloc(sizeof(*elstr), "Elstr");
elstr->str = wptr;
elstr->len = wlen;
-
+ elstr->sylmap = NULL;
_elstr_syllabize(elstr);
SCM_RETURN_NEWSMOB(_elstr_tag, elstr);
@@ -540,6 +542,7 @@ SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask",
unsigned num;
SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
+ SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
elstr = (struct elstr*) SCM_CDR(el);
num = scm_to_uint(n);
if (num >= elstr->len)
@@ -655,6 +658,102 @@ SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0,
return _elstr_thema_aoristoy(thema, 1, FUNC_NAME);
}
#undef FUNC_NAME
+
+static SCM
+_elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name)
+{
+ struct elstr *elstr;
+ int num;
+ unsigned len;
+ SCM scm;
+
+ SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
+ SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name);
+ SCM_ASSERT(scm_is_integer(l), l, SCM_ARG3, func_name);
+ elstr = (struct elstr*) SCM_CDR(el);
+ num = scm_to_int(n);
+ len = scm_to_uint(l);
+
+ if (num < 0)
+ num += elstr->len;
+ if (num < 0 || num >= elstr->len || num + len > elstr->len)
+ scm_misc_error(func_name,
+ "invalid offset or length",
+ SCM_EOL);
+
+ if (destructive)
+ scm = SCM_UNSPECIFIED;
+ else {
+ scm = _elstr_dup(elstr);
+ elstr = (struct elstr*) SCM_CDR(scm);
+ }
+
+ if (num)
+ memmove(elstr->str, elstr->str + num,
+ sizeof(elstr->str[0]) * len);
+ elstr->len = len;
+ _elstr_syllabize(elstr);
+ return scm;
+}
+
+SCM_DEFINE_PUBLIC(scm_elstr_slice, "elstr-slice",
+ 3, 0, 0,
+ (SCM word, SCM off, SCM len),
+"Extract LEN characters from WORD starting from position OFF\n")
+#define FUNC_NAME s_scm_elstr_slice
+{
+ return _elstr_slice(word, off, len, 0, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_slice_x, "elstr-slice!",
+ 3, 0, 0,
+ (SCM word, SCM off, SCM len),
+"Extract LEN characters from WORD starting from position OFF (destructive)\n")
+#define FUNC_NAME s_scm_elstr_slice_x
+{
+ return _elstr_slice(word, off, len, 1, FUNC_NAME);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE_PUBLIC(scm_elstr_index, "elstr-index",
+ 2, 0, 0,
+ (SCM word, SCM needle),
+"")
+#define FUNC_NAME s_scm_elstr_index
+{
+ struct elstr *elstr;
+ unsigned *wc, *wtmp = NULL, *p;
+ unsigned wlen;
+
+ SCM_ASSERT(scm_is_elstr(word), word, SCM_ARG1, FUNC_NAME);
+ elstr = (struct elstr*) SCM_CDR(word);
+ if (scm_is_elstr(needle)) {
+ struct elstr *ep = (struct elstr*) SCM_CDR(needle);
+ wc = ep->str;
+ wlen = ep->len;
+ } else {
+ SCM scm;
+ char *str;
+
+ SCM_ASSERT(scm_is_string(needle), needle, SCM_ARG2, FUNC_NAME);
+ str = scm_to_locale_string(needle);
+ if (utf8_mbstr_to_wc(str, &wtmp, &wlen)) {
+ free(str);
+ scm_misc_error(FUNC_NAME,
+ "Invalid needle string: ~S",
+ scm_list_1(needle));
+ }
+ free(str);
+ wc = wtmp;
+ }
+ p = (unsigned*)utf8_wc_strnstr(elstr->str, elstr->len, wc, wlen);
+ free(wtmp);
+ if (p)
+ return scm_from_int(p - elstr->str);
+ return SCM_BOOL_F;
+}
+#undef FUNC_NAME
void
diff --git a/src/ellinika/elmorph.scm4 b/src/ellinika/elmorph.scm4
index 546bcb5..e3ed4b5 100644
--- a/src/ellinika/elmorph.scm4
+++ b/src/ellinika/elmorph.scm4
@@ -19,3 +19,22 @@
(load-extension
"LIBDIR/libguile-elmorph-v-VERSION"
"scm_init_ellinika_elmorph_module")
+
+(define-public (elstr-trim word n)
+ (cond
+ ((> n 0)
+ (elstr-slice word n (- (elstr-length word) n)))
+ ((< n 0)
+ (elstr-slice word 0 (+ (elstr-length word) n)))
+ (else
+ word)))
+
+(define-public (elstr-trim! word n)
+ (cond
+ ((> n 0)
+ (elstr-slice! word n (- (elstr-length word) n)))
+ ((< n 0)
+ (elstr-slice! word 0 (+ (elstr-length word) n)))))
+
+
+
diff --git a/src/ellinika/utf8.c b/src/ellinika/utf8.c
index 952af07..b946a3b 100644
--- a/src/ellinika/utf8.c
+++ b/src/ellinika/utf8.c
@@ -1933,6 +1933,15 @@ utf8_wc_strcasecmp(const unsigned *a, const unsigned *b)
}
const unsigned *
+utf8_wc_strnchr(const unsigned *str, unsigned chr, size_t len)
+{
+ for (; len; str++, len--)
+ if (*str == chr)
+ return str;
+ return NULL;
+}
+
+const unsigned *
utf8_wc_strchr(const unsigned *str, unsigned chr)
{
for (; *str; str++)
@@ -1980,6 +1989,37 @@ utf8_wc_strstr(const unsigned *haystack, const unsigned *needle)
return NULL;
}
+const unsigned *
+utf8_wc_strnstr(const unsigned *haystack, size_t hlen,
+ const unsigned *needle, size_t nlen)
+{
+ unsigned first;
+
+ /* Is needle empty? */
+ if (hlen == 0)
+ return haystack;
+ first = needle[0];
+ /* Is needle nearly empty? */
+ if (nlen == 1)
+ return utf8_wc_strnchr(haystack, first, hlen);
+ for (; hlen; haystack++, hlen--)
+ if (*haystack == first) {
+ /* Compare with needle's remaining units. */
+ const unsigned *hptr = haystack + 1;
+ size_t len = 1;
+ for (;;) {
+ if (*hptr != needle[len])
+ break;
+ hptr++;
+ len++;
+ if (len == nlen)
+ return haystack;
+ }
+ }
+
+ return NULL;
+}
+
unsigned *
utf8_wc_quote(const unsigned *s)
{
diff --git a/src/ellinika/utf8.h b/src/ellinika/utf8.h
index ce26f09..c4b5e44 100644
--- a/src/ellinika/utf8.h
+++ b/src/ellinika/utf8.h
@@ -58,9 +58,13 @@ int utf8_quote (const char *str, char **sptr);
unsigned *utf8_wc_quote (const unsigned *s);
const unsigned *utf8_wc_strchr(const unsigned *str, unsigned chr);
+const unsigned *utf8_wc_strnchr(const unsigned *str, unsigned chr, size_t len);
const unsigned *utf8_wc_strchr_ci(const unsigned *str, unsigned chr);
const unsigned *utf8_wc_strstr(const unsigned *haystack,
const unsigned *needle);
+const unsigned *utf8_wc_strnstr(const unsigned *haystack, size_t hlen,
+ const unsigned *needle, size_t nlen);
+
void utf8_wc_strupper(unsigned *str);
void utf8_wc_strlower(unsigned *str);

Return to:

Send suggestions and report system problems to the System administrator.