diff options
author | Sergey Poznyakoff <gray@gnu.org.ua> | 2016-07-29 16:37:37 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org.ua> | 2016-07-29 16:37:37 +0300 |
commit | d5ae2f7493485606de05ad3c342412310f3c9352 (patch) | |
tree | b5cedb957348624f24a5aabbc84598cf120711df | |
parent | 699fc16c7654c5af9260407eb7ecdbb20fc501de (diff) | |
download | dico-d5ae2f7493485606de05ad3c342412310f3c9352.tar.gz dico-d5ae2f7493485606de05ad3c342412310f3c9352.tar.bz2 |
Use Knuth-Morris-Pratt algorithm for substing look-ups
* include/dico/utf8.h (utf8_wc_strpat): New proto.
* lib/utf8.c (utf8_wc_strpat): New function.
(utf8_wc_strstr): Rewrite as a wrapper over utf8_wc_strpat.
* lib/tests/utf8.c: Use %td specifier to print ptrdiff_t values.
* doc/libdico.texi: Document string-matching functions.
* lib/tests/wcstrstr.at: Add more tests.
* modules/substr/substr.c (substr_sel): Use utf8_wc_strpat.
-rw-r--r-- | doc/libdico.texi | 56 | ||||
-rw-r--r-- | include/dico/utf8.h | 14 | ||||
-rw-r--r-- | lib/tests/utf8.c | 15 | ||||
-rw-r--r-- | lib/tests/wcstrstr.at | 9 | ||||
-rw-r--r-- | lib/utf8.c | 128 | ||||
-rw-r--r-- | modules/substr/substr.c | 17 |
6 files changed, 203 insertions, 36 deletions
diff --git a/doc/libdico.texi b/doc/libdico.texi index 0e6f3b5..3f1888f 100644 --- a/doc/libdico.texi +++ b/doc/libdico.texi @@ -701,13 +701,67 @@ struct utf8_iterator @{ size_t @var{n_buckets}) @end deftypefn @deftypefn Function int utf8_wc_strcmp (const unsigned *@var{a}, @ const unsigned *@var{b}) @end deftypefn - + +@deftp enum utf8_strpat_result +Integer return type for string matching functions. Defined as: + +@smallexample +enum utf8_strpat_result @{ + strpat_found, + strpat_not_found, + strpat_error +@}; +@end smallexample + +Its values: + +@table @code +@item strpat_found: +Pattern was found in text. + +@item strpat_not_found +Pattern was not found in text. + +@item strpat_error +An error occurred. The @code{errno} variable should be examined to +obtain more information about the error. +@end table +@end deftp + +@deftypefn Function {enum utf8_strpat_result} utf8_wc_strpat (const unsigned *@var{text},@ + const unsigned *@var{pattern}, size_t *@var{return_offset}) +Finds the first occurrence of @var{pattern} in @var{text}. Return value: + +@table @code +@item strpat_found +Pattern was found. Unless @var{return_offset} is @code{NULL}, the +offset of the pattern occurrence in @var{text} is stored in the memory +location pointed to by @var{return_offset}. + +@item strpat_not_found +Pattern was not found. + +@item strpat_error +An error occurred. The @code{errno }variable is set to @samp{ERANGE}, +if pattern is too long, or to @samp{ENOMEM} if unable to allocate memory. +@end table +@end deftypefn + +@deftypefn Function {const unsigned *} utf8_wc_strstr( const unsigned *var{text},@ + const unsigned *@var{pattern}) +Alternative interface to @code{utf8_wc_strpat}. Finds the first +occurrence of @var{pattern} in @var{text}. Returns a pointer to the +beginning of pattern in @var{text}. Returns @code{NULL} if no occurrence +was found or if an error occurred. In the latter case, the global +variable @code{errno} is set appropriately. +@end deftypefn + @deftypefn Function int utf8_wc_to_mbstr (const unsigned *@var{wordbuf}, @ size_t @var{wordlen}, char *@var{s}, size_t @var{size}) @end deftypefn @deftypefn Function int utf8_mbstr_to_wc (const char *@var{str}, @ unsigned **@var{wptr}) diff --git a/include/dico/utf8.h b/include/dico/utf8.h index a1847c9..ad0d43d 100644 --- a/include/dico/utf8.h +++ b/include/dico/utf8.h @@ -1,8 +1,8 @@ /* This file is part of GNU Dico. - Copyright (C) 2008, 2010, 2012 Sergey Poznyakoff + Copyright (C) 2008, 2010, 2012, 2016 Sergey Poznyakoff GNU Dico is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. @@ -63,12 +63,24 @@ int utf8_mbstr_to_norm_wc(const char *str, unsigned **nptr, size_t *plen); int utf8_quote (const char *str, char **sptr); unsigned *utf8_wc_quote (const unsigned *s); const unsigned *utf8_wc_strchr(const unsigned *str, unsigned chr); const unsigned *utf8_wc_strchr_ci(const unsigned *str, unsigned chr); + +enum utf8_strpat_result { + strpat_found, + strpat_not_found, + strpat_error +}; + +enum utf8_strpat_result utf8_wc_strpat(const unsigned *text, + const unsigned *pattern, + size_t *return_offset); + + const unsigned *utf8_wc_strstr(const unsigned *haystack, const unsigned *needle); void utf8_wc_strupper(unsigned *str); void utf8_wc_strlower(unsigned *str); diff --git a/lib/tests/utf8.c b/lib/tests/utf8.c index 6372b65..6233637 100644 --- a/lib/tests/utf8.c +++ b/lib/tests/utf8.c @@ -1,8 +1,8 @@ /* This file is part of GNU Dico - Copyright (C) 2012 Sergey Poznyakoff + Copyright (C) 2012, 2016 Sergey Poznyakoff GNU Dico is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. @@ -203,13 +203,13 @@ op_wc_strchr(int argc, char **argv) wa = strtowc(argv[0]); wb = strtowc(argv[1]); p = utf8_wc_strchr(wa, wb[0]); if (!p) return 2; - printf("%ld\n", p - wa); + printf("%td\n", p - wa); return 0; } static int op_wc_strchr_ci(int argc, char **argv) { @@ -224,13 +224,13 @@ op_wc_strchr_ci(int argc, char **argv) wa = strtowc(argv[0]); wb = strtowc(argv[1]); p = utf8_wc_strchr_ci(wa, wb[0]); if (!p) return 2; - printf("%ld\n", p - wa); + printf("%td\n", p - wa); return 0; } static int op_wc_strstr(int argc, char **argv) { @@ -243,15 +243,20 @@ op_wc_strstr(int argc, char **argv) return 1; } wa = strtowc(argv[0]); wb = strtowc(argv[1]); p = utf8_wc_strstr(wa, wb); - if (!p) + if (!p) { + if (errno) { + dico_log(L_ERR, errno, "can't match"); + return 3; + } return 2; - printf("%ld\n", p - wa); + } + printf("%td\n", p - wa); return 0; } struct optab optab[] = { { "help", help }, { "strlen", op_strlen }, diff --git a/lib/tests/wcstrstr.at b/lib/tests/wcstrstr.at index f7299a0..387a7ab 100644 --- a/lib/tests/wcstrstr.at +++ b/lib/tests/wcstrstr.at @@ -1,8 +1,8 @@ # This file is part of GNU Dico. -*- Autotest -*- -# Copyright (C) 2012 Sergey Poznyakoff +# Copyright (C) 2012, 2016 Sergey Poznyakoff # # GNU Dico is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 3, or (at your option) # any later version. # @@ -14,12 +14,19 @@ # You should have received a copy of the GNU General Public License # along with GNU Dico. If not, see <http://www.gnu.org/licenses/>. AT_SETUP(wc_strstr) AT_KEYWORDS(utf8 wc_strstr) +AT_CHECK([utf8 wc_strstr gcatcgcagagagtatacagtacg gcagagag], +0, +[5 +]) + +AT_CHECK([utf8 wc_strstr gcatcgcagagagtatacagtacg gcagagat], 2) + AT_CHECK([utf8 wc_strstr εκδρομή δρομ], 0, [2 ]) AT_CHECK([utf8 wc_strstr εκδρομή δρομο], @@ -1,8 +1,8 @@ /* This file is part of GNU Dico - Copyright (C) 2007-2008, 2010, 2012 Sergey Poznyakoff + Copyright (C) 2007-2008, 2010, 2012, 2016 Sergey Poznyakoff GNU Dico is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. @@ -1997,39 +1997,117 @@ utf8_wc_strchr_ci(const unsigned *str, unsigned chr) unsigned u = utf8_wc_toupper(chr); for (; *str; str++) if (utf8_wc_toupper(*str) == u) return str; return NULL; } + +#define INIT ((size_t) -1) + +/* Compute the table of tagged border lengths for use in the + Knuth-Morris-Pratt algorithm. +*/ +static size_t * +kmp_next(const unsigned *text, size_t text_len, + const unsigned *pattern, size_t pattern_len) +{ + size_t *nextab; + size_t i, j; + + if (pattern_len + 1 == 0) { + errno = ERANGE; + return NULL; + } -const unsigned * -utf8_wc_strstr(const unsigned *haystack, const unsigned *needle) + nextab = calloc(pattern_len + 1, sizeof nextab[0]); + if (!nextab) + return NULL; + + i = 0; + j = INIT; + nextab[0] = j; + while (i < pattern_len) { + while (j != INIT && pattern[i] != text[j]) + j = nextab[j]; + i++; + j++; + if (pattern[i] == pattern[j]) + nextab[i] = nextab[j]; + else + nextab[i] = j; + } + + return nextab; +} + +/* Find the first occurrence of PATTERN in TEXT. + Return value: + strpat_found - Pattern was found. Unless RETURN_OFFSET is NULL, the + offset of the pattern is stored in the memory location + it points to. + strpat_not_found - Pattern was not found. + strpat_error - An error occurred. The errno variable is set to + ERANGE if pattern is too long, or to ENOMEM if unable + to allocate memory. + + This implementation uses Knuth-Morris-Pratt algorithm. +*/ +enum utf8_strpat_result +utf8_wc_strpat(const unsigned *text, const unsigned *pattern, + size_t *return_offset) { - unsigned first = needle[0]; - - /* Is needle empty? */ - if (first == 0) - return haystack; - - /* Is needle nearly empty? */ - if (needle[1] == 0) - return utf8_wc_strchr(haystack, first); - for (; *haystack; haystack++) - if (*haystack == first) { - /* Compare with needle's remaining units. */ - const unsigned *hptr = haystack + 1; - const unsigned *nptr = needle + 1; - for (;;) { - if (*hptr != *nptr) - break; - hptr++; - nptr++; - if (*nptr == 0) - return haystack; - } + size_t i, j; + enum utf8_strpat_result result; + size_t text_len = utf8_wc_strlen(text); + size_t pattern_len = utf8_wc_strlen(pattern); + size_t *nextab; + + /* Handle corner cases */ + if (pattern_len > text_len) + return strpat_not_found; + else if (pattern_len == text_len) { + if (utf8_wc_strcmp(text, pattern) == 0) { + if (return_offset) + *return_offset = 0; + return strpat_found; + } else + return strpat_not_found; + } + + /* Uses Knuth-Morris-Pratt algorithm for the general case */ + nextab = kmp_next(text, text_len, pattern, pattern_len); + if (!nextab) + return strpat_error; + + i = j = 0; + result = strpat_not_found; + while (j < text_len) { + while (i != INIT && pattern[i] != text[j]) + i = nextab[i]; + i++; + j++; + if (i >= pattern_len) { + if (return_offset) + *return_offset = j - i; + result = strpat_found; + break; } + } + + free(nextab); + + return result; +} + +unsigned const * +utf8_wc_strstr(const unsigned *haystack, const unsigned *needle) +{ + size_t n; + errno = 0; + if (utf8_wc_strpat(haystack, needle, &n) == strpat_found) + return haystack + n; return NULL; } unsigned * utf8_wc_quote(const unsigned *s) { diff --git a/modules/substr/substr.c b/modules/substr/substr.c index d6f045b..49453da 100644 --- a/modules/substr/substr.c +++ b/modules/substr/substr.c @@ -27,13 +27,14 @@ static int substr_sel(int cmd, struct dico_key *key, const char *dict_word) { unsigned *sample; unsigned *tmp; - int res; + enum utf8_strpat_result res; + int ec; switch (cmd) { case DICO_SELECT_BEGIN: if (utf8_mbstr_to_wc(key->word, &sample, NULL)) return 1; utf8_wc_strupper(sample); @@ -42,15 +43,25 @@ substr_sel(int cmd, struct dico_key *key, const char *dict_word) case DICO_SELECT_RUN: sample = key->call_data; if (utf8_mbstr_to_wc(dict_word, &tmp, NULL)) return 0; utf8_wc_strupper(tmp); - res = !!utf8_wc_strstr(tmp, sample); + res = utf8_wc_strpat(tmp, sample, NULL); + ec = errno; free(tmp); - return res; + switch (res) { + case strpat_found: + return 1; + + case strpat_error: + dico_log(L_ERR, ec, "can't match"); + /* fall throuog */ + case strpat_not_found: + return 0; + } case DICO_SELECT_END: free(key->call_data); break; } return 0; |