diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2011-06-04 13:12:10 +0000 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2011-06-04 13:12:10 +0000 |
commit | 8d4d5b758ef1f09a4ed39a25a07a09f9c26d0aec (patch) | |
tree | acd3235e1e2cdbff16bfb0d9d8ee09065171968e /src/ellinika | |
parent | aeb69dcd0e430b5e539c5ca33c73703dad59253d (diff) | |
download | ellinika-8d4d5b758ef1f09a4ed39a25a07a09f9c26d0aec.tar.gz ellinika-8d4d5b758ef1f09a4ed39a25a07a09f9c26d0aec.tar.bz2 |
Add new functions for operations over elstrs.
* src/ellinika/utf8.c (utf8_wc_strnchr)
(utf8_wc_strnstr): New functions.
* src/ellinika/utf8.h (utf8_wc_strnchr)
(utf8_wc_strnstr): New protos.
* src/ellinika/elmorph.c (_elstr_alloc): Reuse existing sylmap.
(_elstr_slice): New function.
(elstr-slice,elstr-slice!,elstr-index): New functions.
* src/ellinika/elmorph.scm4 (elstr-trim,elstr-trim!): New functions.
git-svn-id: file:///home/puszcza/svnroot/ellinika/trunk@562 941c8c0f-9102-463b-b60b-cd22ce0e6858
Diffstat (limited to 'src/ellinika')
-rw-r--r-- | src/ellinika/elmorph.c | 107 | ||||
-rw-r--r-- | src/ellinika/elmorph.scm4 | 19 | ||||
-rw-r--r-- | src/ellinika/utf8.c | 40 | ||||
-rw-r--r-- | src/ellinika/utf8.h | 4 |
4 files changed, 166 insertions, 4 deletions
diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c index 88520a7..6ff5f01 100644 --- a/src/ellinika/elmorph.c +++ b/src/ellinika/elmorph.c | |||
@@ -45,4 +45,7 @@ _elstr_syllabize(struct elstr *elstr) | |||
45 | int acc = 0; | 45 | int acc = 0; |
46 | 46 | ||
47 | sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, "syllable map"); | 47 | if (!elstr->sylmap) |
48 | elstr->sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, | ||
49 | "syllable map"); | ||
50 | sylmap = elstr->sylmap; | ||
48 | 51 | ||
@@ -68,3 +71,2 @@ _elstr_syllabize(struct elstr *elstr) | |||
68 | sylmap[nsyl-1] = i - 1; | 71 | sylmap[nsyl-1] = i - 1; |
69 | elstr->sylmap = sylmap; | ||
70 | elstr->nsyl = nsyl; | 72 | elstr->nsyl = nsyl; |
@@ -87,3 +89,3 @@ _elstr_alloc(const char *instr) | |||
87 | elstr->len = wlen; | 89 | elstr->len = wlen; |
88 | 90 | elstr->sylmap = NULL; | |
89 | _elstr_syllabize(elstr); | 91 | _elstr_syllabize(elstr); |
@@ -542,2 +544,3 @@ SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask", | |||
542 | SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); | 544 | SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); |
545 | SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME); | ||
543 | elstr = (struct elstr*) SCM_CDR(el); | 546 | elstr = (struct elstr*) SCM_CDR(el); |
@@ -657,2 +660,98 @@ SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0, | |||
657 | #undef FUNC_NAME | 660 | #undef FUNC_NAME |
661 | |||
662 | static SCM | ||
663 | _elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name) | ||
664 | { | ||
665 | struct elstr *elstr; | ||
666 | int num; | ||
667 | unsigned len; | ||
668 | SCM scm; | ||
669 | |||
670 | SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name); | ||
671 | SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name); | ||
672 | SCM_ASSERT(scm_is_integer(l), l, SCM_ARG3, func_name); | ||
673 | elstr = (struct elstr*) SCM_CDR(el); | ||
674 | num = scm_to_int(n); | ||
675 | len = scm_to_uint(l); | ||
676 | |||
677 | if (num < 0) | ||
678 | num += elstr->len; | ||
679 | if (num < 0 || num >= elstr->len || num + len > elstr->len) | ||
680 | scm_misc_error(func_name, | ||
681 | "invalid offset or length", | ||
682 | SCM_EOL); | ||
683 | |||
684 | if (destructive) | ||
685 | scm = SCM_UNSPECIFIED; | ||
686 | else { | ||
687 | scm = _elstr_dup(elstr); | ||
688 | elstr = (struct elstr*) SCM_CDR(scm); | ||
689 | } | ||
690 | |||
691 | if (num) | ||
692 | memmove(elstr->str, elstr->str + num, | ||
693 | sizeof(elstr->str[0]) * len); | ||
694 | elstr->len = len; | ||
695 | _elstr_syllabize(elstr); | ||
696 | return scm; | ||
697 | } | ||
698 | |||
699 | SCM_DEFINE_PUBLIC(scm_elstr_slice, "elstr-slice", | ||
700 | 3, 0, 0, | ||
701 | (SCM word, SCM off, SCM len), | ||
702 | "Extract LEN characters from WORD starting from position OFF\n") | ||
703 | #define FUNC_NAME s_scm_elstr_slice | ||
704 | { | ||
705 | return _elstr_slice(word, off, len, 0, FUNC_NAME); | ||
706 | } | ||
707 | #undef FUNC_NAME | ||
708 | |||
709 | SCM_DEFINE_PUBLIC(scm_elstr_slice_x, "elstr-slice!", | ||
710 | 3, 0, 0, | ||
711 | (SCM word, SCM off, SCM len), | ||
712 | "Extract LEN characters from WORD starting from position OFF (destructive)\n") | ||
713 | #define FUNC_NAME s_scm_elstr_slice_x | ||
714 | { | ||
715 | return _elstr_slice(word, off, len, 1, FUNC_NAME); | ||
716 | } | ||
717 | #undef FUNC_NAME | ||
718 | |||
719 | SCM_DEFINE_PUBLIC(scm_elstr_index, "elstr-index", | ||
720 | 2, 0, 0, | ||
721 | (SCM word, SCM needle), | ||
722 | "") | ||
723 | #define FUNC_NAME s_scm_elstr_index | ||
724 | { | ||
725 | struct elstr *elstr; | ||
726 | unsigned *wc, *wtmp = NULL, *p; | ||
727 | unsigned wlen; | ||
728 | |||
729 | SCM_ASSERT(scm_is_elstr(word), word, SCM_ARG1, FUNC_NAME); | ||
730 | elstr = (struct elstr*) SCM_CDR(word); | ||
731 | if (scm_is_elstr(needle)) { | ||
732 | struct elstr *ep = (struct elstr*) SCM_CDR(needle); | ||
733 | wc = ep->str; | ||
734 | wlen = ep->len; | ||
735 | } else { | ||
736 | SCM scm; | ||
737 | char *str; | ||
738 | |||
739 | SCM_ASSERT(scm_is_string(needle), needle, SCM_ARG2, FUNC_NAME); | ||
740 | str = scm_to_locale_string(needle); | ||
741 | if (utf8_mbstr_to_wc(str, &wtmp, &wlen)) { | ||
742 | free(str); | ||
743 | scm_misc_error(FUNC_NAME, | ||
744 | "Invalid needle string: ~S", | ||
745 | scm_list_1(needle)); | ||
746 | } | ||
747 | free(str); | ||
748 | wc = wtmp; | ||
749 | } | ||
750 | p = (unsigned*)utf8_wc_strnstr(elstr->str, elstr->len, wc, wlen); | ||
751 | free(wtmp); | ||
752 | if (p) | ||
753 | return scm_from_int(p - elstr->str); | ||
754 | return SCM_BOOL_F; | ||
755 | } | ||
756 | #undef FUNC_NAME | ||
658 | 757 | ||
diff --git a/src/ellinika/elmorph.scm4 b/src/ellinika/elmorph.scm4 index 546bcb5..e3ed4b5 100644 --- a/src/ellinika/elmorph.scm4 +++ b/src/ellinika/elmorph.scm4 | |||
@@ -21 +21,20 @@ | |||
21 | "scm_init_ellinika_elmorph_module") | 21 | "scm_init_ellinika_elmorph_module") |
22 | |||
23 | (define-public (elstr-trim word n) | ||
24 | (cond | ||
25 | ((> n 0) | ||
26 | (elstr-slice word n (- (elstr-length word) n))) | ||
27 | ((< n 0) | ||
28 | (elstr-slice word 0 (+ (elstr-length word) n))) | ||
29 | (else | ||
30 | word))) | ||
31 | |||
32 | (define-public (elstr-trim! word n) | ||
33 | (cond | ||
34 | ((> n 0) | ||
35 | (elstr-slice! word n (- (elstr-length word) n))) | ||
36 | ((< n 0) | ||
37 | (elstr-slice! word 0 (+ (elstr-length word) n))))) | ||
38 | |||
39 | |||
40 | |||
diff --git a/src/ellinika/utf8.c b/src/ellinika/utf8.c index 952af07..b946a3b 100644 --- a/src/ellinika/utf8.c +++ b/src/ellinika/utf8.c | |||
@@ -1935,2 +1935,11 @@ utf8_wc_strcasecmp(const unsigned *a, const unsigned *b) | |||
1935 | const unsigned * | 1935 | const unsigned * |
1936 | utf8_wc_strnchr(const unsigned *str, unsigned chr, size_t len) | ||
1937 | { | ||
1938 | for (; len; str++, len--) | ||
1939 | if (*str == chr) | ||
1940 | return str; | ||
1941 | return NULL; | ||
1942 | } | ||
1943 | |||
1944 | const unsigned * | ||
1936 | utf8_wc_strchr(const unsigned *str, unsigned chr) | 1945 | utf8_wc_strchr(const unsigned *str, unsigned chr) |
@@ -1982,2 +1991,33 @@ utf8_wc_strstr(const unsigned *haystack, const unsigned *needle) | |||
1982 | 1991 | ||
1992 | const unsigned * | ||
1993 | utf8_wc_strnstr(const unsigned *haystack, size_t hlen, | ||
1994 | const unsigned *needle, size_t nlen) | ||
1995 | { | ||
1996 | unsigned first; | ||
1997 | |||
1998 | /* Is needle empty? */ | ||
1999 | if (hlen == 0) | ||
2000 | return haystack; | ||
2001 | first = needle[0]; | ||
2002 | /* Is needle nearly empty? */ | ||
2003 | if (nlen == 1) | ||
2004 | return utf8_wc_strnchr(haystack, first, hlen); | ||
2005 | for (; hlen; haystack++, hlen--) | ||
2006 | if (*haystack == first) { | ||
2007 | /* Compare with needle's remaining units. */ | ||
2008 | const unsigned *hptr = haystack + 1; | ||
2009 | size_t len = 1; | ||
2010 | for (;;) { | ||
2011 | if (*hptr != needle[len]) | ||
2012 | break; | ||
2013 | hptr++; | ||
2014 | len++; | ||
2015 | if (len == nlen) | ||
2016 | return haystack; | ||
2017 | } | ||
2018 | } | ||
2019 | |||
2020 | return NULL; | ||
2021 | } | ||
2022 | |||
1983 | unsigned * | 2023 | unsigned * |
diff --git a/src/ellinika/utf8.h b/src/ellinika/utf8.h index ce26f09..c4b5e44 100644 --- a/src/ellinika/utf8.h +++ b/src/ellinika/utf8.h | |||
@@ -60,2 +60,3 @@ unsigned *utf8_wc_quote (const unsigned *s); | |||
60 | const unsigned *utf8_wc_strchr(const unsigned *str, unsigned chr); | 60 | const unsigned *utf8_wc_strchr(const unsigned *str, unsigned chr); |
61 | const unsigned *utf8_wc_strnchr(const unsigned *str, unsigned chr, size_t len); | ||
61 | const unsigned *utf8_wc_strchr_ci(const unsigned *str, unsigned chr); | 62 | const unsigned *utf8_wc_strchr_ci(const unsigned *str, unsigned chr); |
@@ -63,2 +64,5 @@ const unsigned *utf8_wc_strstr(const unsigned *haystack, | |||
63 | const unsigned *needle); | 64 | const unsigned *needle); |
65 | const unsigned *utf8_wc_strnstr(const unsigned *haystack, size_t hlen, | ||
66 | const unsigned *needle, size_t nlen); | ||
67 | |||
64 | 68 | ||