aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergey Poznyakoff <gray@gnu.org.ua>2016-07-29 16:37:37 +0300
committerSergey Poznyakoff <gray@gnu.org.ua>2016-07-29 16:37:37 +0300
commitd5ae2f7493485606de05ad3c342412310f3c9352 (patch)
treeb5cedb957348624f24a5aabbc84598cf120711df
parent699fc16c7654c5af9260407eb7ecdbb20fc501de (diff)
downloaddico-d5ae2f7493485606de05ad3c342412310f3c9352.tar.gz
dico-d5ae2f7493485606de05ad3c342412310f3c9352.tar.bz2
Use Knuth-Morris-Pratt algorithm for substing look-ups
* include/dico/utf8.h (utf8_wc_strpat): New proto. * lib/utf8.c (utf8_wc_strpat): New function. (utf8_wc_strstr): Rewrite as a wrapper over utf8_wc_strpat. * lib/tests/utf8.c: Use %td specifier to print ptrdiff_t values. * doc/libdico.texi: Document string-matching functions. * lib/tests/wcstrstr.at: Add more tests. * modules/substr/substr.c (substr_sel): Use utf8_wc_strpat.
-rw-r--r--doc/libdico.texi56
-rw-r--r--include/dico/utf8.h14
-rw-r--r--lib/tests/utf8.c15
-rw-r--r--lib/tests/wcstrstr.at9
-rw-r--r--lib/utf8.c128
-rw-r--r--modules/substr/substr.c17
6 files changed, 203 insertions, 36 deletions
diff --git a/doc/libdico.texi b/doc/libdico.texi
index 0e6f3b5..3f1888f 100644
--- a/doc/libdico.texi
+++ b/doc/libdico.texi
@@ -701,13 +701,67 @@ struct utf8_iterator @{
size_t @var{n_buckets})
@end deftypefn
@deftypefn Function int utf8_wc_strcmp (const unsigned *@var{a}, @
const unsigned *@var{b})
@end deftypefn
-
+
+@deftp enum utf8_strpat_result
+Integer return type for string matching functions. Defined as:
+
+@smallexample
+enum utf8_strpat_result @{
+ strpat_found,
+ strpat_not_found,
+ strpat_error
+@};
+@end smallexample
+
+Its values:
+
+@table @code
+@item strpat_found:
+Pattern was found in text.
+
+@item strpat_not_found
+Pattern was not found in text.
+
+@item strpat_error
+An error occurred. The @code{errno} variable should be examined to
+obtain more information about the error.
+@end table
+@end deftp
+
+@deftypefn Function {enum utf8_strpat_result} utf8_wc_strpat (const unsigned *@var{text},@
+ const unsigned *@var{pattern}, size_t *@var{return_offset})
+Finds the first occurrence of @var{pattern} in @var{text}. Return value:
+
+@table @code
+@item strpat_found
+Pattern was found. Unless @var{return_offset} is @code{NULL}, the
+offset of the pattern occurrence in @var{text} is stored in the memory
+location pointed to by @var{return_offset}.
+
+@item strpat_not_found
+Pattern was not found.
+
+@item strpat_error
+An error occurred. The @code{errno }variable is set to @samp{ERANGE},
+if pattern is too long, or to @samp{ENOMEM} if unable to allocate memory.
+@end table
+@end deftypefn
+
+@deftypefn Function {const unsigned *} utf8_wc_strstr( const unsigned *var{text},@
+ const unsigned *@var{pattern})
+Alternative interface to @code{utf8_wc_strpat}. Finds the first
+occurrence of @var{pattern} in @var{text}. Returns a pointer to the
+beginning of pattern in @var{text}. Returns @code{NULL} if no occurrence
+was found or if an error occurred. In the latter case, the global
+variable @code{errno} is set appropriately.
+@end deftypefn
+
@deftypefn Function int utf8_wc_to_mbstr (const unsigned *@var{wordbuf}, @
size_t @var{wordlen}, char *@var{s}, size_t @var{size})
@end deftypefn
@deftypefn Function int utf8_mbstr_to_wc (const char *@var{str}, @
unsigned **@var{wptr})
diff --git a/include/dico/utf8.h b/include/dico/utf8.h
index a1847c9..ad0d43d 100644
--- a/include/dico/utf8.h
+++ b/include/dico/utf8.h
@@ -1,8 +1,8 @@
/* This file is part of GNU Dico.
- Copyright (C) 2008, 2010, 2012 Sergey Poznyakoff
+ Copyright (C) 2008, 2010, 2012, 2016 Sergey Poznyakoff
GNU Dico is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
@@ -63,12 +63,24 @@ int utf8_mbstr_to_norm_wc(const char *str, unsigned **nptr, size_t *plen);
int utf8_quote (const char *str, char **sptr);
unsigned *utf8_wc_quote (const unsigned *s);
const unsigned *utf8_wc_strchr(const unsigned *str, unsigned chr);
const unsigned *utf8_wc_strchr_ci(const unsigned *str, unsigned chr);
+
+enum utf8_strpat_result {
+ strpat_found,
+ strpat_not_found,
+ strpat_error
+};
+
+enum utf8_strpat_result utf8_wc_strpat(const unsigned *text,
+ const unsigned *pattern,
+ size_t *return_offset);
+
+
const unsigned *utf8_wc_strstr(const unsigned *haystack,
const unsigned *needle);
void utf8_wc_strupper(unsigned *str);
void utf8_wc_strlower(unsigned *str);
diff --git a/lib/tests/utf8.c b/lib/tests/utf8.c
index 6372b65..6233637 100644
--- a/lib/tests/utf8.c
+++ b/lib/tests/utf8.c
@@ -1,8 +1,8 @@
/* This file is part of GNU Dico
- Copyright (C) 2012 Sergey Poznyakoff
+ Copyright (C) 2012, 2016 Sergey Poznyakoff
GNU Dico is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
@@ -203,13 +203,13 @@ op_wc_strchr(int argc, char **argv)
wa = strtowc(argv[0]);
wb = strtowc(argv[1]);
p = utf8_wc_strchr(wa, wb[0]);
if (!p)
return 2;
- printf("%ld\n", p - wa);
+ printf("%td\n", p - wa);
return 0;
}
static int
op_wc_strchr_ci(int argc, char **argv)
{
@@ -224,13 +224,13 @@ op_wc_strchr_ci(int argc, char **argv)
wa = strtowc(argv[0]);
wb = strtowc(argv[1]);
p = utf8_wc_strchr_ci(wa, wb[0]);
if (!p)
return 2;
- printf("%ld\n", p - wa);
+ printf("%td\n", p - wa);
return 0;
}
static int
op_wc_strstr(int argc, char **argv)
{
@@ -243,15 +243,20 @@ op_wc_strstr(int argc, char **argv)
return 1;
}
wa = strtowc(argv[0]);
wb = strtowc(argv[1]);
p = utf8_wc_strstr(wa, wb);
- if (!p)
+ if (!p) {
+ if (errno) {
+ dico_log(L_ERR, errno, "can't match");
+ return 3;
+ }
return 2;
- printf("%ld\n", p - wa);
+ }
+ printf("%td\n", p - wa);
return 0;
}
struct optab optab[] = {
{ "help", help },
{ "strlen", op_strlen },
diff --git a/lib/tests/wcstrstr.at b/lib/tests/wcstrstr.at
index f7299a0..387a7ab 100644
--- a/lib/tests/wcstrstr.at
+++ b/lib/tests/wcstrstr.at
@@ -1,8 +1,8 @@
# This file is part of GNU Dico. -*- Autotest -*-
-# Copyright (C) 2012 Sergey Poznyakoff
+# Copyright (C) 2012, 2016 Sergey Poznyakoff
#
# GNU Dico is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.
#
@@ -14,12 +14,19 @@
# You should have received a copy of the GNU General Public License
# along with GNU Dico. If not, see <http://www.gnu.org/licenses/>.
AT_SETUP(wc_strstr)
AT_KEYWORDS(utf8 wc_strstr)
+AT_CHECK([utf8 wc_strstr gcatcgcagagagtatacagtacg gcagagag],
+0,
+[5
+])
+
+AT_CHECK([utf8 wc_strstr gcatcgcagagagtatacagtacg gcagagat], 2)
+
AT_CHECK([utf8 wc_strstr εκδρομή δρομ],
0,
[2
])
AT_CHECK([utf8 wc_strstr εκδρομή δρομο],
diff --git a/lib/utf8.c b/lib/utf8.c
index fcb40b2..183e47f 100644
--- a/lib/utf8.c
+++ b/lib/utf8.c
@@ -1,8 +1,8 @@
/* This file is part of GNU Dico
- Copyright (C) 2007-2008, 2010, 2012 Sergey Poznyakoff
+ Copyright (C) 2007-2008, 2010, 2012, 2016 Sergey Poznyakoff
GNU Dico is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
@@ -1997,39 +1997,117 @@ utf8_wc_strchr_ci(const unsigned *str, unsigned chr)
unsigned u = utf8_wc_toupper(chr);
for (; *str; str++)
if (utf8_wc_toupper(*str) == u)
return str;
return NULL;
}
+
+#define INIT ((size_t) -1)
+
+/* Compute the table of tagged border lengths for use in the
+ Knuth-Morris-Pratt algorithm.
+*/
+static size_t *
+kmp_next(const unsigned *text, size_t text_len,
+ const unsigned *pattern, size_t pattern_len)
+{
+ size_t *nextab;
+ size_t i, j;
+
+ if (pattern_len + 1 == 0) {
+ errno = ERANGE;
+ return NULL;
+ }
-const unsigned *
-utf8_wc_strstr(const unsigned *haystack, const unsigned *needle)
+ nextab = calloc(pattern_len + 1, sizeof nextab[0]);
+ if (!nextab)
+ return NULL;
+
+ i = 0;
+ j = INIT;
+ nextab[0] = j;
+ while (i < pattern_len) {
+ while (j != INIT && pattern[i] != text[j])
+ j = nextab[j];
+ i++;
+ j++;
+ if (pattern[i] == pattern[j])
+ nextab[i] = nextab[j];
+ else
+ nextab[i] = j;
+ }
+
+ return nextab;
+}
+
+/* Find the first occurrence of PATTERN in TEXT.
+ Return value:
+ strpat_found - Pattern was found. Unless RETURN_OFFSET is NULL, the
+ offset of the pattern is stored in the memory location
+ it points to.
+ strpat_not_found - Pattern was not found.
+ strpat_error - An error occurred. The errno variable is set to
+ ERANGE if pattern is too long, or to ENOMEM if unable
+ to allocate memory.
+
+ This implementation uses Knuth-Morris-Pratt algorithm.
+*/
+enum utf8_strpat_result
+utf8_wc_strpat(const unsigned *text, const unsigned *pattern,
+ size_t *return_offset)
{
- unsigned first = needle[0];
-
- /* Is needle empty? */
- if (first == 0)
- return haystack;
-
- /* Is needle nearly empty? */
- if (needle[1] == 0)
- return utf8_wc_strchr(haystack, first);
- for (; *haystack; haystack++)
- if (*haystack == first) {
- /* Compare with needle's remaining units. */
- const unsigned *hptr = haystack + 1;
- const unsigned *nptr = needle + 1;
- for (;;) {
- if (*hptr != *nptr)
- break;
- hptr++;
- nptr++;
- if (*nptr == 0)
- return haystack;
- }
+ size_t i, j;
+ enum utf8_strpat_result result;
+ size_t text_len = utf8_wc_strlen(text);
+ size_t pattern_len = utf8_wc_strlen(pattern);
+ size_t *nextab;
+
+ /* Handle corner cases */
+ if (pattern_len > text_len)
+ return strpat_not_found;
+ else if (pattern_len == text_len) {
+ if (utf8_wc_strcmp(text, pattern) == 0) {
+ if (return_offset)
+ *return_offset = 0;
+ return strpat_found;
+ } else
+ return strpat_not_found;
+ }
+
+ /* Uses Knuth-Morris-Pratt algorithm for the general case */
+ nextab = kmp_next(text, text_len, pattern, pattern_len);
+ if (!nextab)
+ return strpat_error;
+
+ i = j = 0;
+ result = strpat_not_found;
+ while (j < text_len) {
+ while (i != INIT && pattern[i] != text[j])
+ i = nextab[i];
+ i++;
+ j++;
+ if (i >= pattern_len) {
+ if (return_offset)
+ *return_offset = j - i;
+ result = strpat_found;
+ break;
}
+ }
+
+ free(nextab);
+
+ return result;
+}
+
+unsigned const *
+utf8_wc_strstr(const unsigned *haystack, const unsigned *needle)
+{
+ size_t n;
+ errno = 0;
+ if (utf8_wc_strpat(haystack, needle, &n) == strpat_found)
+ return haystack + n;
return NULL;
}
unsigned *
utf8_wc_quote(const unsigned *s)
{
diff --git a/modules/substr/substr.c b/modules/substr/substr.c
index d6f045b..49453da 100644
--- a/modules/substr/substr.c
+++ b/modules/substr/substr.c
@@ -27,13 +27,14 @@
static int
substr_sel(int cmd, struct dico_key *key, const char *dict_word)
{
unsigned *sample;
unsigned *tmp;
- int res;
+ enum utf8_strpat_result res;
+ int ec;
switch (cmd) {
case DICO_SELECT_BEGIN:
if (utf8_mbstr_to_wc(key->word, &sample, NULL))
return 1;
utf8_wc_strupper(sample);
@@ -42,15 +43,25 @@ substr_sel(int cmd, struct dico_key *key, const char *dict_word)
case DICO_SELECT_RUN:
sample = key->call_data;
if (utf8_mbstr_to_wc(dict_word, &tmp, NULL))
return 0;
utf8_wc_strupper(tmp);
- res = !!utf8_wc_strstr(tmp, sample);
+ res = utf8_wc_strpat(tmp, sample, NULL);
+ ec = errno;
free(tmp);
- return res;
+ switch (res) {
+ case strpat_found:
+ return 1;
+
+ case strpat_error:
+ dico_log(L_ERR, ec, "can't match");
+ /* fall throuog */
+ case strpat_not_found:
+ return 0;
+ }
case DICO_SELECT_END:
free(key->call_data);
break;
}
return 0;

Return to:

Send suggestions and report system problems to the System administrator.