aboutsummaryrefslogtreecommitdiff
path: root/src/ellinika
diff options
context:
space:
mode:
Diffstat (limited to 'src/ellinika')
-rw-r--r--src/ellinika/elmorph.c107
-rw-r--r--src/ellinika/elmorph.scm419
-rw-r--r--src/ellinika/utf8.c40
-rw-r--r--src/ellinika/utf8.h4
4 files changed, 166 insertions, 4 deletions
diff --git a/src/ellinika/elmorph.c b/src/ellinika/elmorph.c
index 88520a7..6ff5f01 100644
--- a/src/ellinika/elmorph.c
+++ b/src/ellinika/elmorph.c
@@ -43,8 +43,11 @@ _elstr_syllabize(struct elstr *elstr)
43 unsigned i, nsyl = 0, accsyl = 0, accchr = 0; 43 unsigned i, nsyl = 0, accsyl = 0, accchr = 0;
44 int dstate = 0; 44 int dstate = 0;
45 int acc = 0; 45 int acc = 0;
46 46
47 sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len, "syllable map"); 47 if (!elstr->sylmap)
48 elstr->sylmap = scm_gc_malloc(sizeof(sylmap[0])*elstr->len,
49 "syllable map");
50 sylmap = elstr->sylmap;
48 51
49 for (i = 0; i < elstr->len; i++) { 52 for (i = 0; i < elstr->len; i++) {
50 int nstate; 53 int nstate;
@@ -66,7 +69,6 @@ _elstr_syllabize(struct elstr *elstr)
66 sylmap[nsyl++] = i - 1; 69 sylmap[nsyl++] = i - 1;
67 else 70 else
68 sylmap[nsyl-1] = i - 1; 71 sylmap[nsyl-1] = i - 1;
69 elstr->sylmap = sylmap;
70 elstr->nsyl = nsyl; 72 elstr->nsyl = nsyl;
71 elstr->acc_pos = accchr; 73 elstr->acc_pos = accchr;
72 elstr->acc_syl = nsyl - accsyl; 74 elstr->acc_syl = nsyl - accsyl;
@@ -85,7 +87,7 @@ _elstr_alloc(const char *instr)
85 elstr = scm_gc_malloc(sizeof(*elstr), "Elstr"); 87 elstr = scm_gc_malloc(sizeof(*elstr), "Elstr");
86 elstr->str = wptr; 88 elstr->str = wptr;
87 elstr->len = wlen; 89 elstr->len = wlen;
88 90 elstr->sylmap = NULL;
89 _elstr_syllabize(elstr); 91 _elstr_syllabize(elstr);
90 92
91 SCM_RETURN_NEWSMOB(_elstr_tag, elstr); 93 SCM_RETURN_NEWSMOB(_elstr_tag, elstr);
@@ -540,6 +542,7 @@ SCM_DEFINE_PUBLIC(scm_elstr_char_prop_bitmask, "elstr-char-prop-bitmask",
540 unsigned num; 542 unsigned num;
541 543
542 SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME); 544 SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, FUNC_NAME);
545 SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, FUNC_NAME);
543 elstr = (struct elstr*) SCM_CDR(el); 546 elstr = (struct elstr*) SCM_CDR(el);
544 num = scm_to_uint(n); 547 num = scm_to_uint(n);
545 if (num >= elstr->len) 548 if (num >= elstr->len)
@@ -655,6 +658,102 @@ SCM_DEFINE_PUBLIC(scm_elstr_thema_aoristoy_x, "elstr-thema-aoristoy!", 1, 0, 0,
655 return _elstr_thema_aoristoy(thema, 1, FUNC_NAME); 658 return _elstr_thema_aoristoy(thema, 1, FUNC_NAME);
656} 659}
657#undef FUNC_NAME 660#undef FUNC_NAME
661
662static SCM
663_elstr_slice(SCM el, SCM n, SCM l, int destructive, const char *func_name)
664{
665 struct elstr *elstr;
666 int num;
667 unsigned len;
668 SCM scm;
669
670 SCM_ASSERT(scm_is_elstr(el), el, SCM_ARG1, func_name);
671 SCM_ASSERT(scm_is_integer(n), n, SCM_ARG2, func_name);
672 SCM_ASSERT(scm_is_integer(l), l, SCM_ARG3, func_name);
673 elstr = (struct elstr*) SCM_CDR(el);
674 num = scm_to_int(n);
675 len = scm_to_uint(l);
676
677 if (num < 0)
678 num += elstr->len;
679 if (num < 0 || num >= elstr->len || num + len > elstr->len)
680 scm_misc_error(func_name,
681 "invalid offset or length",
682 SCM_EOL);
683
684 if (destructive)
685 scm = SCM_UNSPECIFIED;
686 else {
687 scm = _elstr_dup(elstr);
688 elstr = (struct elstr*) SCM_CDR(scm);
689 }
690
691 if (num)
692 memmove(elstr->str, elstr->str + num,
693 sizeof(elstr->str[0]) * len);
694 elstr->len = len;
695 _elstr_syllabize(elstr);
696 return scm;
697}
698
699SCM_DEFINE_PUBLIC(scm_elstr_slice, "elstr-slice",
700 3, 0, 0,
701 (SCM word, SCM off, SCM len),
702"Extract LEN characters from WORD starting from position OFF\n")
703#define FUNC_NAME s_scm_elstr_slice
704{
705 return _elstr_slice(word, off, len, 0, FUNC_NAME);
706}
707#undef FUNC_NAME
708
709SCM_DEFINE_PUBLIC(scm_elstr_slice_x, "elstr-slice!",
710 3, 0, 0,
711 (SCM word, SCM off, SCM len),
712"Extract LEN characters from WORD starting from position OFF (destructive)\n")
713#define FUNC_NAME s_scm_elstr_slice_x
714{
715 return _elstr_slice(word, off, len, 1, FUNC_NAME);
716}
717#undef FUNC_NAME
718
719SCM_DEFINE_PUBLIC(scm_elstr_index, "elstr-index",
720 2, 0, 0,
721 (SCM word, SCM needle),
722"")
723#define FUNC_NAME s_scm_elstr_index
724{
725 struct elstr *elstr;
726 unsigned *wc, *wtmp = NULL, *p;
727 unsigned wlen;
728
729 SCM_ASSERT(scm_is_elstr(word), word, SCM_ARG1, FUNC_NAME);
730 elstr = (struct elstr*) SCM_CDR(word);
731 if (scm_is_elstr(needle)) {
732 struct elstr *ep = (struct elstr*) SCM_CDR(needle);
733 wc = ep->str;
734 wlen = ep->len;
735 } else {
736 SCM scm;
737 char *str;
738
739 SCM_ASSERT(scm_is_string(needle), needle, SCM_ARG2, FUNC_NAME);
740 str = scm_to_locale_string(needle);
741 if (utf8_mbstr_to_wc(str, &wtmp, &wlen)) {
742 free(str);
743 scm_misc_error(FUNC_NAME,
744 "Invalid needle string: ~S",
745 scm_list_1(needle));
746 }
747 free(str);
748 wc = wtmp;
749 }
750 p = (unsigned*)utf8_wc_strnstr(elstr->str, elstr->len, wc, wlen);
751 free(wtmp);
752 if (p)
753 return scm_from_int(p - elstr->str);
754 return SCM_BOOL_F;
755}
756#undef FUNC_NAME
658 757
659 758
660void 759void
diff --git a/src/ellinika/elmorph.scm4 b/src/ellinika/elmorph.scm4
index 546bcb5..e3ed4b5 100644
--- a/src/ellinika/elmorph.scm4
+++ b/src/ellinika/elmorph.scm4
@@ -19,3 +19,22 @@
19(load-extension 19(load-extension
20 "LIBDIR/libguile-elmorph-v-VERSION" 20 "LIBDIR/libguile-elmorph-v-VERSION"
21 "scm_init_ellinika_elmorph_module") 21 "scm_init_ellinika_elmorph_module")
22
23(define-public (elstr-trim word n)
24 (cond
25 ((> n 0)
26 (elstr-slice word n (- (elstr-length word) n)))
27 ((< n 0)
28 (elstr-slice word 0 (+ (elstr-length word) n)))
29 (else
30 word)))
31
32(define-public (elstr-trim! word n)
33 (cond
34 ((> n 0)
35 (elstr-slice! word n (- (elstr-length word) n)))
36 ((< n 0)
37 (elstr-slice! word 0 (+ (elstr-length word) n)))))
38
39
40
diff --git a/src/ellinika/utf8.c b/src/ellinika/utf8.c
index 952af07..b946a3b 100644
--- a/src/ellinika/utf8.c
+++ b/src/ellinika/utf8.c
@@ -1933,6 +1933,15 @@ utf8_wc_strcasecmp(const unsigned *a, const unsigned *b)
1933} 1933}
1934 1934
1935const unsigned * 1935const unsigned *
1936utf8_wc_strnchr(const unsigned *str, unsigned chr, size_t len)
1937{
1938 for (; len; str++, len--)
1939 if (*str == chr)
1940 return str;
1941 return NULL;
1942}
1943
1944const unsigned *
1936utf8_wc_strchr(const unsigned *str, unsigned chr) 1945utf8_wc_strchr(const unsigned *str, unsigned chr)
1937{ 1946{
1938 for (; *str; str++) 1947 for (; *str; str++)
@@ -1980,6 +1989,37 @@ utf8_wc_strstr(const unsigned *haystack, const unsigned *needle)
1980 return NULL; 1989 return NULL;
1981} 1990}
1982 1991
1992const unsigned *
1993utf8_wc_strnstr(const unsigned *haystack, size_t hlen,
1994 const unsigned *needle, size_t nlen)
1995{
1996 unsigned first;
1997
1998 /* Is needle empty? */
1999 if (hlen == 0)
2000 return haystack;
2001 first = needle[0];
2002 /* Is needle nearly empty? */
2003 if (nlen == 1)
2004 return utf8_wc_strnchr(haystack, first, hlen);
2005 for (; hlen; haystack++, hlen--)
2006 if (*haystack == first) {
2007 /* Compare with needle's remaining units. */
2008 const unsigned *hptr = haystack + 1;
2009 size_t len = 1;
2010 for (;;) {
2011 if (*hptr != needle[len])
2012 break;
2013 hptr++;