about summary refs log tree commit diff
diff options
context:
space:
mode:
author Sergey Poznyakoff <gray@gnu.org> 2018-09-16 10:24:48 +0300
committer Sergey Poznyakoff <gray@gnu.org> 2018-09-16 10:24:48 +0300
commit 1c5e14432740e8af3a42e5e83a396e82221499ad (patch)
tree 622ba85b52b1e6c09e7acd01d22419d2df4b5bfc
parent b785e98d2a06eb1f74f71337431e38f896f397a2 (diff)
download dico-1c5e14432740e8af3a42e5e83a396e82221499ad.tar.gz
dico-1c5e14432740e8af3a42e5e83a396e82221499ad.tar.bz2
utf8_compare: new general-purpose comparator function
* lib/utf8.c (utf8_strcmp_cc) (utf8_strcmp_alnumspace_cc, utf8_strcmp_alnumspace) (utf8_strcasecmp_alnumspace): Remove. (utf8_compare): New function. (utf8_strcmp, utf8_strcasecmp) (utf8_strncasecmp): Rewrite as wrappers over utf8_compare. * include/dico/utf8.h: Update protos. * modules/dict.org/dictorg.c: Use new UTF-8 comparators.
-rw-r--r-- include/dico/utf8.h 18
-rw-r--r-- lib/utf8.c 125
-rw-r--r-- modules/dict.org/dictorg.c 154
3 files changed, 99 insertions, 198 deletions
diff --git a/include/dico/utf8.h b/include/dico/utf8.h
index c100a15..953a4f1 100644
--- a/include/dico/utf8.h
+++ b/include/dico/utf8.h
@@ -42,15 +42,19 @@ int utf8_wctomb (char *r, unsigned int wc);
int utf8_symcmp(char *a, char *b);
int utf8_symcasecmp(char *a, char *b);
-int utf8_strcmp_cc(char *a, char *b, int ci);
+int utf8_strcmp_cc(char const *a, char const *b, int ci);
-int utf8_strcmp(char *a, char *b);
-int utf8_strcasecmp(char *a, char *b);
-int utf8_strncasecmp(char *a, char *b, size_t maxlen);
+enum {
+ case_sensitive,
+ case_insensitive
+};
+
+int utf8_compare(char const *a, char const *b, int ci, size_t maxlen,
+ int (*wcsel)(unsigned));
-int utf8_strcmp_alnumspace_cc(char *a, char *b, int ci);
-int utf8_strcmp_alnumspace(char *a, char *b);
-int utf8_strcasecmp_alnumspace(char *a, char *b);
+int utf8_strcmp(char const *a, char const *b);
+int utf8_strcasecmp(char const *a, char const *b);
+int utf8_strncasecmp(char const *a, char const *b, size_t maxlen);
unsigned utf8_wc_toupper (unsigned wc);
int utf8_toupper (char *s);
diff --git a/lib/utf8.c b/lib/utf8.c
index 837a594..3ba6048 100644
--- a/lib/utf8.c
+++ b/lib/utf8.c
@@ -1748,120 +1748,43 @@ urf8_symcasecmp(char *a, char *b)
return 1;
return 0;
}
-
-enum {
- case_sensitive,
- case_insensitive
-};
-
-int
-utf8_strcmp_cc(char *a, char *b, int ci)
-{
- int alen, blen;
-
- for (; *a; a += alen, b += blen) {
- unsigned wa, wb;
-
- if (*b == 0)
- return 1;
-
- alen = utf8_char_width(a);
- if (alen == 0)
- return -1;
- utf8_mbtowc(&wa, a, alen);
- blen = utf8_char_width(b);
- if (blen == 0)
- return 1;
- utf8_mbtowc(&wb, b, blen);
- if (ci == case_insensitive) {
- wa = utf8_wc_toupper(wa);
- wb = utf8_wc_toupper(wb);
- }
- if (wa < wb)
- return -1;
- if (wa > wb)
- return 1;
-
- }
- if (*b)
- return -1;
- return 0;
-}
-
-int
-utf8_strcmp(char *a, char *b)
-{
- return utf8_strcmp_cc(a, b, case_sensitive);
-}
-
-int
-utf8_strcasecmp(char *a, char *b)
-{
- return utf8_strcmp_cc(a, b, case_insensitive);
-}
-
-int
-utf8_strncasecmp(char *a, char *b, size_t maxlen)
-{
- int alen, blen;
- unsigned asz = 0, bsz = 0;
-
- while (asz < maxlen) {
- unsigned wa, wb;
-
- if (*a == 0)
- return (*b == 0) ? 0 : -1;
-
- if (*b == 0)
- return 1;
-
- alen = utf8_char_width(a);
- if (alen == 0)
- return -1;
- utf8_mbtowc(&wa, a, alen);
- blen = utf8_char_width(b);
- if (blen == 0)
- return 1;
- utf8_mbtowc(&wb, b, blen);
- wa = utf8_wc_toupper(wa);
- wb = utf8_wc_toupper(wb);
- if (wa < wb)
- return -1;
- if (wa > wb)
- return 1;
- a += alen;
- b += blen;
- asz ++;
- bsz ++;
- }
- return 0;
-}
-#define is_alnumspace(c) (utf8_wc_is_alnum(c) || utf8_wc_is_space(c))
int
-utf8_strcmp_alnumspace_cc(char *a, char *b, int ci)
+utf8_compare(char const *a, char const *b,
+ int ci, size_t maxlen, int (*wcsel)(unsigned))
{
int alen, blen;
+ size_t an = 0, bn = 0;
unsigned wa, wb;
- while (*a) {
+ while (1) {
+ if (maxlen != 0 && an == maxlen)
+ return 0;
+ if (*a == 0)
+ break;
+
alen = utf8_char_width(a);
if (alen == 0)
return -1;
+
utf8_mbtowc(&wa, a, alen);
a += alen;
+ an++;
- if (is_alnumspace(wa)) {
+ if (!wcsel || wcsel(wa)) {
if (*b == 0)
return 1;
while (*b) {
+ if (maxlen != 0 && bn == maxlen)
+ return 0;
blen = utf8_char_width(b);
if (blen == 0)
return 1;
utf8_mbtowc(&wb, b, blen);
b += blen;
+ bn++;
- if (is_alnumspace(wb)) {
+ if (!wcsel || wcsel(wb)) {
if (ci == case_insensitive) {
wa = utf8_wc_toupper(wa);
wb = utf8_wc_toupper(wb);
@@ -1882,23 +1805,29 @@ utf8_strcmp_alnumspace_cc(char *a, char *b, int ci)
return 1;
utf8_mbtowc(&wb, b, blen);
b += blen;
- if (is_alnumspace(wb))
+ if (!wcsel || wcsel(wb))
return -1;
}
return 0;
}
+
+int
+utf8_strcmp(char const *a, char const *b)
+{
+ return utf8_compare(a, b, case_sensitive, 0, NULL);
+}
int
-utf8_strcmp_alnumspace(char *a, char *b)
+utf8_strcasecmp(char const *a, char const *b)
{
- return utf8_strcmp_alnumspace_cc(a, b, case_sensitive);
+ return utf8_compare(a, b, case_insensitive, 0, NULL);
}
int
-utf8_strcasecmp_alnumspace(char *a, char *b)
+utf8_strncasecmp(char const *a, char const *b, size_t maxlen)
{
- return utf8_strcmp_alnumspace_cc(a, b, case_insensitive);
+ return utf8_compare(a, b, case_insensitive, maxlen, NULL);
}
unsigned
diff --git a/modules/dict.org/dictorg.c b/modules/dict.org/dictorg.c
index fbaec48..bec219a 100644
--- a/modules/dict.org/dictorg.c
+++ b/modules/dict.org/dictorg.c
@@ -22,7 +22,42 @@ static int sort_index;
static int trim_ws;
static int show_dictorg_entries;
-typedef int (*COMPARATOR) (const void *, const void *, void *closure);
+static int
+is_alnumspace(unsigned c)
+{
+ return utf8_wc_is_alnum(c) || utf8_wc_is_space(c);
+}
+
+static inline int
+headword_compare(char const *a, char const *b, struct dictdb *db)
+{
+ return utf8_compare(a, b,
+ db->flag_casesensitive
+ ? case_sensitive : case_insensitive,
+ 0,
+ db->flag_allchars ? NULL : is_alnumspace);
+}
+
+static inline int
+headword_compare_allchars(char const *a, char const *b, struct dictdb *db,
+ size_t len)
+{
+ return utf8_compare(a, b,
+ db->flag_casesensitive
+ ? case_sensitive : case_insensitive,
+ len,
+ NULL);
+}
+
+static int
+compare_index_entry(const void *a, const void *b, void *closure)
+{
+ const struct index_entry *epa = a;
+ const struct index_entry *epb = b;
+ compare_count++;
+ return headword_compare(epa->word, epb->word, (struct dictdb *)closure);
+}
+
static int get_db_flag(struct dictdb *db, const char *name);
static int register_strategies(void);
@@ -132,9 +167,6 @@ b64_decode(const char *val, size_t len, size_t *presult)
return 0;
}
-static COMPARATOR comparator(struct dictdb *db);
-static COMPARATOR case_comparator(struct dictdb *db);
-
static int
parse_index_entry(const char *filename, size_t line,
dico_list_t list, char *buf, int tws)
@@ -480,12 +512,11 @@ mod_init_db(const char *dbname, int argc, char **argv)
if (sort_option) {
/* Sort index entries */
dico_sort(db->index, db->numwords, sizeof(db->index[0]),
- comparator(db), NULL);
+ compare_index_entry, db);
}
return (dico_handle_t)db;
}
-
static void
revert_word(char *dst, const char *src, size_t len)
@@ -505,11 +536,11 @@ revert_word(char *dst, const char *src, size_t len)
}
static int
-compare_rev_entry(const void *a, const void *b)
+compare_rev_entry(const void *a, const void *b, void *closure)
{
struct rev_entry const *epa = a;
struct rev_entry const *epb = b;
- return utf8_strcasecmp(epa->word, epb->word);
+ return headword_compare_allchars(epa->word, epb->word, closure, 0);
}
static int
@@ -533,8 +564,8 @@ init_suffix_index(struct dictdb *db)
db->suf_index[i].word = p;
db->suf_index[i].ptr = &db->index[i];
}
- qsort(db->suf_index, db->numwords, sizeof(db->suf_index[0]),
- compare_rev_entry);
+ dico_sort(db->suf_index, db->numwords, sizeof(db->suf_index[0]),
+ compare_rev_entry, db);
}
return 0;
}
@@ -580,73 +611,12 @@ register_strategies(void)
}
static int
-compare_allchars(const void *a, const void *b, void *closure)
-{
- const struct index_entry *epa = a;
- const struct index_entry *epb = b;
- compare_count++;
- return utf8_strcmp(epa->word, epb->word);
-}
-
-static int
-compare_allchars_ci(const void *a, const void *b, void *closure)
-{
- const struct index_entry *epa = a;
- const struct index_entry *epb = b;
- compare_count++;
- return utf8_strcasecmp(epa->word, epb->word);
-}
-
-static int
-compare_alnumspace(const void *a, const void *b, void *closure)
-{
- const struct index_entry *epa = a;
- const struct index_entry *epb = b;
- compare_count++;
- return utf8_strcmp_alnumspace(epa->word, epb->word);
-}
-
-static int
-compare_alnumspace_ci(const void *a, const void *b, void *closure)
-{
- const struct index_entry *epa = a;
- const struct index_entry *epb = b;
- compare_count++;
- return utf8_strcasecmp_alnumspace(epa->word, epb->word);
-}
-
-static COMPARATOR
-comparator(struct dictdb *db)
-{
- if (db->flag_allchars) {
- if (db->flag_casesensitive)
- return compare_allchars;
- else
- return compare_allchars_ci;
- } else {
- if (db->flag_casesensitive)
- return compare_alnumspace;
- else
- return compare_alnumspace_ci;
- }
-}
-
-static COMPARATOR
-case_comparator(struct dictdb *db)
-{
- if (db->flag_casesensitive)
- return compare_allchars;
- else
- return compare_allchars_ci;
-}
-
-static int
compare_entry_ptr(const void *a, const void *b, void *closure)
{
const struct index_entry *epa = *(const struct index_entry **)a;
const struct index_entry *epb = *(const struct index_entry **)b;
- COMPARATOR cmp = closure;
- return cmp(epa, epb, NULL);
+ struct dictdb *db = closure;
+ return headword_compare_allchars(epa->word, epb->word, db, 0);
}
static int
@@ -654,11 +624,10 @@ uniq_comp(const void *a, const void *b, void *closure)
{
const struct index_entry *epa = a;
const struct index_entry *epb = b;
- COMPARATOR cmp = closure;
- struct index_entry atmp, btmp;
+ struct dictdb *db = closure;
/* Entries differ if their headwords differ */
- if (utf8_strcasecmp(epa->word, epb->word))
+ if (headword_compare(epa->word, epb->word, db))
return 1;
/* Otherwise, if neither entry has the original headword, they
are equal */
@@ -668,14 +637,12 @@ uniq_comp(const void *a, const void *b, void *closure)
if (!epa->orig || !epb->orig)
return 1;
/* We have both original headwords. Compare them to decide. */
- atmp.word = epa->orig;
- btmp.word = epb->orig;
- return cmp(&atmp, &btmp, NULL);
+ return headword_compare(epa->orig, epb->orig, db);
}
static int
common_match(struct dictdb *db, const char *word,
- COMPARATOR compare,
+ int (*compare)(const void *, const void *, void *),
int unique, struct result *res)
{
struct index_entry x, *ep;
@@ -685,7 +652,7 @@ common_match(struct dictdb *db, const char *word,
x.wordlen = utf8_strlen(word);
compare_count = 0;
ep = dico_bsearch(&x, db->index, db->numwords, sizeof(db->index[0]),
- compare, NULL);
+ compare, db);
if (ep) {
res->type = result_match;
res->db = db;
@@ -696,7 +663,7 @@ common_match(struct dictdb *db, const char *word,
}
res->itr = NULL;
if (unique) {
- dico_list_set_comparator(res->list, uniq_comp, case_comparator(db));
+ dico_list_set_comparator(res->list, uniq_comp, db);
dico_list_set_flags(res->list, DICO_LIST_COMPARE_TAIL);
}
for (; ep < db->index + db->numwords
@@ -713,7 +680,7 @@ common_match(struct dictdb *db, const char *word,
static int
exact_match(struct dictdb *db, const char *word, struct result *res)
{
- return common_match(db, word, comparator(db), 0, res);
+ return common_match(db, word, compare_index_entry, 0, res);
}
static int
@@ -721,11 +688,12 @@ compare_prefix(const void *a, const void *b, void *closure)
{
const struct index_entry *pkey = a;
const struct index_entry *pelt = b;
+ struct dictdb *db = closure;
size_t wordlen = pkey->wordlen;
compare_count++;
if (pelt->wordlen < wordlen)
return -1;
- return utf8_strncasecmp(pkey->word, pelt->word, wordlen);
+ return headword_compare_allchars(pkey->word, pelt->word, db, wordlen);
}
static int
@@ -739,11 +707,12 @@ compare_rev_prefix(const void *a, const void *b, void *closure)
{
const struct rev_entry *pkey = a;
const struct rev_entry *pelt = b;
+ struct dictdb *db = closure;
size_t wordlen = pkey->ptr->wordlen;
if (pelt->ptr->wordlen < wordlen)
wordlen = pelt->ptr->wordlen;
compare_count++;
- return utf8_strncasecmp(pkey->word, pelt->word, wordlen);
+ return headword_compare_allchars(pkey->word, pelt->word, db, wordlen);
}
static int
@@ -771,7 +740,7 @@ suffix_match(struct dictdb *db, const char *word, struct result *res)
compare_count = 0;
ep = dico_bsearch(&x, db->suf_index, db->numwords, sizeof(db->suf_index[0]),
- compare_rev_prefix, NULL);
+ compare_rev_prefix, db);
if (ep) {
struct rev_entry *p;
struct index_entry **tmp;
@@ -796,8 +765,7 @@ suffix_match(struct dictdb *db, const char *word, struct result *res)
tmp[j++] = p->ptr;
count = j;
- dico_sort(tmp, count, sizeof(tmp[0]), compare_entry_ptr,
- case_comparator(db));
+ dico_sort(tmp, count, sizeof(tmp[0]), compare_entry_ptr, db);
list = dico_list_create();
if (!list) {
@@ -806,7 +774,7 @@ suffix_match(struct dictdb *db, const char *word, struct result *res)
free(tmp);
return 1;
}
- dico_list_set_comparator(list, uniq_comp, case_comparator(db));
+ dico_list_set_comparator(list, uniq_comp, db);
dico_list_set_flags(list, DICO_LIST_COMPARE_TAIL);
for (i = 0; i < count; i++)
dico_list_append(list, tmp[i]);
@@ -836,7 +804,7 @@ find_db_entry(struct dictdb *db, const char *name)
x.length = strlen(name);
x.wordlen = utf8_strlen(name);
ep = dico_bsearch(&x, db->index, db->numwords, sizeof(db->index[0]),
- comparator(db), NULL);
+ compare_index_entry, db);
if (!ep)
return NULL;
buf = malloc(ep->size + 1);
@@ -865,7 +833,7 @@ get_db_flag(struct dictdb *db, const char *name)
x.length = strlen(name);
x.wordlen = utf8_strlen(name);
return dico_bsearch(&x, db->index, db->numwords, sizeof(db->index[0]),
- comparator(db), NULL) != NULL;
+ compare_index_entry, db) != NULL;
}
static char *
@@ -937,7 +905,7 @@ _match_all(struct dictdb *db, dico_strategy_t strat, const char *word)
return NULL;
}
- dico_list_set_comparator(list, uniq_comp, case_comparator(db));
+ dico_list_set_comparator(list, uniq_comp, db);
dico_list_set_flags(list, DICO_LIST_COMPARE_TAIL);
if (dico_key_init(&key, strat, word)) {
@@ -998,7 +966,7 @@ mod_define(dico_handle_t hp, const char *word)
if (RESERVED_WORD(db, word))
return NULL;
- rc = common_match(db, word, comparator(db), 0, &res);
+ rc = common_match(db, word, compare_index_entry, 0, &res);
if (rc)
return NULL;
rp = malloc(sizeof(*rp));

Return to:

Send suggestions and report system problems to the System administrator.