aboutsummaryrefslogtreecommitdiff
path: root/src/ellinika
diff options
context:
space:
mode:
authorSergey Poznyakoff <gray@gnu.org.ua>2011-06-04 13:12:10 +0000
committerSergey Poznyakoff <gray@gnu.org.ua>2011-06-04 13:12:10 +0000
commit8d4d5b758ef1f09a4ed39a25a07a09f9c26d0aec (patch)
treeacd3235e1e2cdbff16bfb0d9d8ee09065171968e /src/ellinika
parentaeb69dcd0e430b5e539c5ca33c73703dad59253d (diff)
downloadellinika-8d4d5b758ef1f09a4ed39a25a07a09f9c26d0aec.tar.gz
ellinika-8d4d5b758ef1f09a4ed39a25a07a09f9c26d0aec.tar.bz2
Add new functions for operations over elstrs.
* src/ellinika/utf8.c (utf8_wc_strnchr) (utf8_wc_strnstr): New functions. * src/ellinika/utf8.h (utf8_wc_strnchr) (utf8_wc_strnstr): New protos. * src/ellinika/elmorph.c (_elstr_alloc): Reuse existing sylmap. (_elstr_slice): New function. (elstr-slice,elstr-slice!,elstr-index): New functions. * src/ellinika/elmorph.scm4 (elstr-trim,elstr-trim!): New functions. git-svn-id: file:///home/puszcza/svnroot/ellinika/trunk@562 941c8c0f-9102-463b-b60b-cd22ce0e6858
Diffstat (limited to 'src/ellinika')
-rw-r--r--src/ellinika/elmorph.c107
-rw-r--r--src/ellinika/elmorph.scm419
-rw-r--r--src/ellinika/utf8.c40
-rw-r--r--src/ellinika/utf8.h4
4 files changed, 166 insertions, 4 deletions
diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c
index 88520a7..6ff5f01 100644
--- a/src/ellinika/elmorph.c
+++ b/src/ellinika/elmorph.c
@@ -45,4 +45,7 @@ _elstr_syllabize(struct elstr *elstr)
45 int acc = 0; 45 int acc = 0;
46 46
47 sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, "syllable map"); 47 if (!elstr->sylmap)
48 elstr->sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len,
49 "syllable map");
50 sylmap = elstr->sylmap;
48 51
@@ -68,3 +71,2 @@ _elstr_syllabize(struct elstr *elstr)
68 sylmap[nsyl-1] = i - 1; 71 sylmap[nsyl-1] = i - 1;
69 elstr->sylmap = sylmap;
70 elstr->nsyl = nsyl; 72 elstr->nsyl = nsyl;
@@ -87,3 +89,3 @@ _elstr_alloc(const char *instr)
87 elstr->len = wlen; 89 elstr->len = wlen;
88 90 elstr->sylmap = NULL;
89 _elstr_syllabize(elstr); 91 _elstr_syllabize(elstr);
@@ -542,2 +544,3 @@ SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask",
542 SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); 544 SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
545 SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
543 elstr = (struct elstr*) SCM_CDR(el); 546 elstr = (struct elstr*) SCM_CDR(el);
@@ -657,2 +660,98 @@ SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0,
657#undef FUNC_NAME 660#undef FUNC_NAME
661
662static SCM
663_elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name)
664{
665 struct elstr *elstr;
666 int num;
667 unsigned len;
668 SCM scm;
669
670 SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
671 SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name);
672 SCM_ASSERT(scm_is_integer(l), l, SCM_ARG3, func_name);
673 elstr = (struct elstr*) SCM_CDR(el);
674 num = scm_to_int(n);
675 len = scm_to_uint(l);
676
677 if (num < 0)
678 num += elstr->len;
679 if (num < 0 || num >= elstr->len || num + len > elstr->len)
680 scm_misc_error(func_name,
681 "invalid offset or length",
682 SCM_EOL);
683
684 if (destructive)
685 scm = SCM_UNSPECIFIED;
686 else {
687 scm = _elstr_dup(elstr);
688 elstr = (struct elstr*) SCM_CDR(scm);
689 }
690
691 if (num)
692 memmove(elstr->str, elstr->str + num,
693 sizeof(elstr->str[0]) * len);
694 elstr->len = len;
695 _elstr_syllabize(elstr);
696 return scm;
697}
698
/* Scheme primitive "elstr-slice" taking three required arguments.
   It forwards them to _elstr_slice with the destructive flag set to 0
   and returns whatever that helper returns. */
699SCM_DEFINE_PUBLIC(scm_elstr_slice, "elstr-slice",
700 3, 0, 0,
701 (SCM word, SCM off, SCM len),
702"Extract LEN characters from WORD starting from position OFF\n")
703#define FUNC_NAME s_scm_elstr_slice
704{
705 return _elstr_slice(word, off, len, 0, FUNC_NAME);
706}
707#undef FUNC_NAME
708
/* Scheme primitive "elstr-slice!" taking three required arguments.
   It forwards them to _elstr_slice with the destructive flag set to 1,
   so the slicing is applied to WORD itself. */
709SCM_DEFINE_PUBLIC(scm_elstr_slice_x, "elstr-slice!",
710 3, 0, 0,
711 (SCM word, SCM off, SCM len),
712"Extract LEN characters from WORD starting from position OFF (destructive)\n")
713#define FUNC_NAME s_scm_elstr_slice_x
714{
715 return _elstr_slice(word, off, len, 1, FUNC_NAME);
716}
717#undef FUNC_NAME
718
719SCM_DEFINE_PUBLIC(scm_elstr_index, "elstr-index",
720 2, 0, 0,
721 (SCM word, SCM needle),
722"")
723#define FUNC_NAME s_scm_elstr_index
724{
725 struct elstr *elstr;
726 unsigned *wc, *wtmp = NULL, *p;
727 unsigned wlen;
728
729 SCM_ASSERT(scm_is_elstr(word), word, SCM_ARG1, FUNC_NAME);
730 elstr = (struct elstr*) SCM_CDR(word);
731 if (scm_is_elstr(needle)) {
732 struct elstr *ep = (struct elstr*) SCM_CDR(needle);
733 wc = ep->str;
734 wlen = ep->len;
735 } else {
736 SCM scm;
737 char *str;
738
739 SCM_ASSERT(scm_is_string(needle), needle, SCM_ARG2, FUNC_NAME);
740 str = scm_to_locale_string(needle);
741 if (utf8_mbstr_to_wc(str, &wtmp, &wlen)) {
742 free(str);
743 scm_misc_error(FUNC_NAME,
744 "Invalid needle string: ~S",
745 scm_list_1(needle));
746 }
747 free(str);
748 wc = wtmp;
749 }
750 p = (unsigned*)utf8_wc_strnstr(elstr->str, elstr->len, wc, wlen);
751 free(wtmp);
752 if (p)
753 return scm_from_int(p - elstr->str);
754 return SCM_BOOL_F;
755}
756#undef FUNC_NAME
658 757
diff --git a/src/ellinika/elmorph.scm4 b/src/ellinika/elmorph.scm4
index 546bcb5..e3ed4b5 100644
--- a/src/ellinika/elmorph.scm4
+++ b/src/ellinika/elmorph.scm4
@@ -21 +21,20 @@
21 "scm_init_ellinika_elmorph_module") 21 "scm_init_ellinika_elmorph_module")
22
;; Return WORD with N characters removed.
;; A positive N drops that many characters from the beginning,
;; a negative N drops that many from the end, and zero returns
;; WORD itself unchanged.  The slicing is done by elstr-slice.
(define-public (elstr-trim word n)
  (if (> n 0)
      (elstr-slice word n (- (elstr-length word) n))
      (if (< n 0)
	  (elstr-slice word 0 (+ (elstr-length word) n))
	  word)))
31
;; Destructive counterpart of elstr-trim, built on elstr-slice!.
;; A positive N drops that many characters from the beginning of WORD,
;; a negative N drops that many from the end.  Nothing is done when
;; N is zero.
(define-public (elstr-trim! word n)
  (if (> n 0)
      (elstr-slice! word n (- (elstr-length word) n))
      (if (< n 0)
	  (elstr-slice! word 0 (+ (elstr-length word) n)))))
38
39
40
diff --git a/src/ellinika/utf8.c b/src/ellinika/utf8.c
index 952af07..b946a3b 100644
--- a/src/ellinika/utf8.c
+++ b/src/ellinika/utf8.c
@@ -1935,2 +1935,11 @@ utf8_wc_strcasecmp(const unsigned *a, const unsigned *b)
/*
 * Search the first LEN elements of STR for the value CHR.
 *
 * Returns a pointer to the first matching element, or NULL if none of
 * the LEN elements equals CHR (in particular when LEN is 0).
 */
const unsigned *
utf8_wc_strnchr(const unsigned *str, unsigned chr, size_t len)
{
	size_t pos;

	for (pos = 0; pos < len; pos++) {
		if (str[pos] == chr)
			return &str[pos];
	}
	return NULL;
}
1943
1944const unsigned *
1936utf8_wc_strchr(const unsigned *str, unsigned chr) 1945utf8_wc_strchr(const unsigned *str, unsigned chr)
@@ -1982,2 +1991,33 @@ utf8_wc_strstr(const unsigned *haystack, const unsigned *needle)
1982 1991
/*
 * Find the first occurrence of the NLEN-element sequence NEEDLE within
 * the first HLEN elements of HAYSTACK.
 *
 * Returns a pointer to the start of the match inside HAYSTACK, or NULL
 * if there is none.  An empty needle (NLEN == 0) matches at the start
 * of the haystack.  Neither array needs to be zero-terminated; no
 * element beyond the given lengths is ever read.
 */
const unsigned *
utf8_wc_strnstr(const unsigned *haystack, size_t hlen,
		const unsigned *needle, size_t nlen)
{
	size_t start;

	/* Is needle empty? */
	if (nlen == 0)
		return haystack;
	/* A needle longer than the haystack cannot occur in it. */
	if (nlen > hlen)
		return NULL;

	/* Only try start positions that leave room for the whole needle,
	   so the comparison below stays inside the haystack. */
	for (start = 0; start <= hlen - nlen; start++) {
		size_t k = 0;

		while (k < nlen && haystack[start + k] == needle[k])
			k++;
		if (k == nlen)
			return haystack + start;
	}

	return NULL;
}
2022
1983unsigned * 2023unsigned *
diff --git a/src/ellinika/utf8.h b/src/ellinika/utf8.h
index ce26f09..c4b5e44 100644
--- a/src/ellinika/utf8.h
+++ b/src/ellinika/utf8.h
@@ -60,2 +60,3 @@ unsigned *utf8_wc_quote (const unsigned *s);
60const unsigned *utf8_wc_strchr(const unsigned *str, unsigned chr); 60const unsigned *utf8_wc_strchr(const unsigned *str, unsigned chr);
61const unsigned *utf8_wc_strnchr(const unsigned *str, unsigned chr, size_t len);
61const unsigned *utf8_wc_strchr_ci(const unsigned *str, unsigned chr); 62const unsigned *utf8_wc_strchr_ci(const unsigned *str, unsigned chr);
@@ -63,2 +64,5 @@ const unsigned *utf8_wc_strstr(const unsigned *haystack,
63 const unsigned *needle); 64 const unsigned *needle);
65const unsigned *utf8_wc_strnstr(const unsigned *haystack, size_t hlen,
66 const unsigned *needle, size_t nlen);
67
64 68

Return to:

Send suggestions and report system problems to the System administrator.