diff options
author | Sergey Poznyakoff <gray@gnu.org> | 2018-09-16 10:24:48 +0300 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org> | 2018-09-16 10:24:48 +0300 |
commit | 1c5e14432740e8af3a42e5e83a396e82221499ad (patch) | |
tree | 622ba85b52b1e6c09e7acd01d22419d2df4b5bfc | |
parent | b785e98d2a06eb1f74f71337431e38f896f397a2 (diff) | |
download | dico-1c5e14432740e8af3a42e5e83a396e82221499ad.tar.gz dico-1c5e14432740e8af3a42e5e83a396e82221499ad.tar.bz2 |
utf8_compare: new general-purpose comparator function
* lib/utf8.c (utf8_strcmp_cc)
(utf8_strcmp_alnumspace_cc, utf8_strcmp_alnumspace)
(utf8_strcasecmp_alnumspace): Remove.
(utf8_compare): New function.
(utf8_strcmp, utf8_strcasecmp)
(utf8_strncasecmp): Rewrite as wrappers over utf8_compare.
* include/dico/utf8.h: Update prototypes.
* modules/dict.org/dictorg.c: Use the new UTF-8 comparators.
-rw-r--r-- | include/dico/utf8.h | 18 | ||||
-rw-r--r-- | lib/utf8.c | 125 | ||||
-rw-r--r-- | modules/dict.org/dictorg.c | 154 |
3 files changed, 99 insertions, 198 deletions
diff --git a/include/dico/utf8.h b/include/dico/utf8.h index c100a15..953a4f1 100644 --- a/include/dico/utf8.h +++ b/include/dico/utf8.h @@ -42,15 +42,19 @@ int utf8_wctomb (char *r, unsigned int wc); int utf8_symcmp(char *a, char *b); int utf8_symcasecmp(char *a, char *b); -int utf8_strcmp_cc(char *a, char *b, int ci); +int utf8_strcmp_cc(char const *a, char const *b, int ci); -int utf8_strcmp(char *a, char *b); -int utf8_strcasecmp(char *a, char *b); -int utf8_strncasecmp(char *a, char *b, size_t maxlen); +enum { + case_sensitive, + case_insensitive +}; + +int utf8_compare(char const *a, char const *b, int ci, size_t maxlen, + int (*wcsel)(unsigned)); -int utf8_strcmp_alnumspace_cc(char *a, char *b, int ci); -int utf8_strcmp_alnumspace(char *a, char *b); -int utf8_strcasecmp_alnumspace(char *a, char *b); +int utf8_strcmp(char const *a, char const *b); +int utf8_strcasecmp(char const *a, char const *b); +int utf8_strncasecmp(char const *a, char const *b, size_t maxlen); unsigned utf8_wc_toupper (unsigned wc); int utf8_toupper (char *s); @@ -1748,120 +1748,43 @@ urf8_symcasecmp(char *a, char *b) return 1; return 0; } - -enum { - case_sensitive, - case_insensitive -}; - -int -utf8_strcmp_cc(char *a, char *b, int ci) -{ - int alen, blen; - - for (; *a; a += alen, b += blen) { - unsigned wa, wb; - - if (*b == 0) - return 1; - - alen = utf8_char_width(a); - if (alen == 0) - return -1; - utf8_mbtowc(&wa, a, alen); - blen = utf8_char_width(b); - if (blen == 0) - return 1; - utf8_mbtowc(&wb, b, blen); - if (ci == case_insensitive) { - wa = utf8_wc_toupper(wa); - wb = utf8_wc_toupper(wb); - } - if (wa < wb) - return -1; - if (wa > wb) - return 1; - - } - if (*b) - return -1; - return 0; -} - -int -utf8_strcmp(char *a, char *b) -{ - return utf8_strcmp_cc(a, b, case_sensitive); -} - -int -utf8_strcasecmp(char *a, char *b) -{ - return utf8_strcmp_cc(a, b, case_insensitive); -} - -int -utf8_strncasecmp(char *a, char *b, size_t maxlen) -{ - int alen, blen; - unsigned asz = 0, 
bsz = 0; - - while (asz < maxlen) { - unsigned wa, wb; - - if (*a == 0) - return (*b == 0) ? 0 : -1; - - if (*b == 0) - return 1; - - alen = utf8_char_width(a); - if (alen == 0) - return -1; - utf8_mbtowc(&wa, a, alen); - blen = utf8_char_width(b); - if (blen == 0) - return 1; - utf8_mbtowc(&wb, b, blen); - wa = utf8_wc_toupper(wa); - wb = utf8_wc_toupper(wb); - if (wa < wb) - return -1; - if (wa > wb) - return 1; - a += alen; - b += blen; - asz ++; - bsz ++; - } - return 0; -} -#define is_alnumspace(c) (utf8_wc_is_alnum(c) || utf8_wc_is_space(c)) int -utf8_strcmp_alnumspace_cc(char *a, char *b, int ci) +utf8_compare(char const *a, char const *b, + int ci, size_t maxlen, int (*wcsel)(unsigned)) { int alen, blen; + size_t an = 0, bn = 0; unsigned wa, wb; - while (*a) { + while (1) { + if (maxlen != 0 && an == maxlen) + return 0; + if (*a == 0) + break; + alen = utf8_char_width(a); if (alen == 0) return -1; + utf8_mbtowc(&wa, a, alen); a += alen; + an++; - if (is_alnumspace(wa)) { + if (!wcsel || wcsel(wa)) { if (*b == 0) return 1; while (*b) { + if (maxlen != 0 && bn == maxlen) + return 0; blen = utf8_char_width(b); if (blen == 0) return 1; utf8_mbtowc(&wb, b, blen); b += blen; + bn++; - if (is_alnumspace(wb)) { + if (!wcsel || wcsel(wb)) { if (ci == case_insensitive) { wa = utf8_wc_toupper(wa); wb = utf8_wc_toupper(wb); @@ -1882,23 +1805,29 @@ utf8_strcmp_alnumspace_cc(char *a, char *b, int ci) return 1; utf8_mbtowc(&wb, b, blen); b += blen; - if (is_alnumspace(wb)) + if (!wcsel || wcsel(wb)) return -1; } return 0; } + +int +utf8_strcmp(char const *a, char const *b) +{ + return utf8_compare(a, b, case_sensitive, 0, NULL); +} int -utf8_strcmp_alnumspace(char *a, char *b) +utf8_strcasecmp(char const *a, char const *b) { - return utf8_strcmp_alnumspace_cc(a, b, case_sensitive); + return utf8_compare(a, b, case_insensitive, 0, NULL); } int -utf8_strcasecmp_alnumspace(char *a, char *b) +utf8_strncasecmp(char const *a, char const *b, size_t maxlen) { - return 
utf8_strcmp_alnumspace_cc(a, b, case_insensitive); + return utf8_compare(a, b, case_insensitive, maxlen, NULL); } unsigned diff --git a/modules/dict.org/dictorg.c b/modules/dict.org/dictorg.c index fbaec48..bec219a 100644 --- a/modules/dict.org/dictorg.c +++ b/modules/dict.org/dictorg.c @@ -22,7 +22,42 @@ static int sort_index; static int trim_ws; static int show_dictorg_entries; -typedef int (*COMPARATOR) (const void *, const void *, void *closure); +static int +is_alnumspace(unsigned c) +{ + return utf8_wc_is_alnum(c) || utf8_wc_is_space(c); +} + +static inline int +headword_compare(char const *a, char const *b, struct dictdb *db) +{ + return utf8_compare(a, b, + db->flag_casesensitive + ? case_sensitive : case_insensitive, + 0, + db->flag_allchars ? NULL : is_alnumspace); +} + +static inline int +headword_compare_allchars(char const *a, char const *b, struct dictdb *db, + size_t len) +{ + return utf8_compare(a, b, + db->flag_casesensitive + ? case_sensitive : case_insensitive, + len, + NULL); +} + +static int +compare_index_entry(const void *a, const void *b, void *closure) +{ + const struct index_entry *epa = a; + const struct index_entry *epb = b; + compare_count++; + return headword_compare(epa->word, epb->word, (struct dictdb *)closure); +} + static int get_db_flag(struct dictdb *db, const char *name); static int register_strategies(void); @@ -132,9 +167,6 @@ b64_decode(const char *val, size_t len, size_t *presult) return 0; } -static COMPARATOR comparator(struct dictdb *db); -static COMPARATOR case_comparator(struct dictdb *db); - static int parse_index_entry(const char *filename, size_t line, dico_list_t list, char *buf, int tws) @@ -480,12 +512,11 @@ mod_init_db(const char *dbname, int argc, char **argv) if (sort_option) { /* Sort index entries */ dico_sort(db->index, db->numwords, sizeof(db->index[0]), - comparator(db), NULL); + compare_index_entry, db); } return (dico_handle_t)db; } - static void revert_word(char *dst, const char *src, size_t len) @@ 
-505,11 +536,11 @@ revert_word(char *dst, const char *src, size_t len) } static int -compare_rev_entry(const void *a, const void *b) +compare_rev_entry(const void *a, const void *b, void *closure) { struct rev_entry const *epa = a; struct rev_entry const *epb = b; - return utf8_strcasecmp(epa->word, epb->word); + return headword_compare_allchars(epa->word, epb->word, closure, 0); } static int @@ -533,8 +564,8 @@ init_suffix_index(struct dictdb *db) db->suf_index[i].word = p; db->suf_index[i].ptr = &db->index[i]; } - qsort(db->suf_index, db->numwords, sizeof(db->suf_index[0]), - compare_rev_entry); + dico_sort(db->suf_index, db->numwords, sizeof(db->suf_index[0]), + compare_rev_entry, db); } return 0; } @@ -580,73 +611,12 @@ register_strategies(void) } static int -compare_allchars(const void *a, const void *b, void *closure) -{ - const struct index_entry *epa = a; - const struct index_entry *epb = b; - compare_count++; - return utf8_strcmp(epa->word, epb->word); -} - -static int -compare_allchars_ci(const void *a, const void *b, void *closure) -{ - const struct index_entry *epa = a; - const struct index_entry *epb = b; - compare_count++; - return utf8_strcasecmp(epa->word, epb->word); -} - -static int -compare_alnumspace(const void *a, const void *b, void *closure) -{ - const struct index_entry *epa = a; - const struct index_entry *epb = b; - compare_count++; - return utf8_strcmp_alnumspace(epa->word, epb->word); -} - -static int -compare_alnumspace_ci(const void *a, const void *b, void *closure) -{ - const struct index_entry *epa = a; - const struct index_entry *epb = b; - compare_count++; - return utf8_strcasecmp_alnumspace(epa->word, epb->word); -} - -static COMPARATOR -comparator(struct dictdb *db) -{ - if (db->flag_allchars) { - if (db->flag_casesensitive) - return compare_allchars; - else - return compare_allchars_ci; - } else { - if (db->flag_casesensitive) - return compare_alnumspace; - else - return compare_alnumspace_ci; - } -} - -static COMPARATOR 
-case_comparator(struct dictdb *db) -{ - if (db->flag_casesensitive) - return compare_allchars; - else - return compare_allchars_ci; -} - -static int compare_entry_ptr(const void *a, const void *b, void *closure) { const struct index_entry *epa = *(const struct index_entry **)a; const struct index_entry *epb = *(const struct index_entry **)b; - COMPARATOR cmp = closure; - return cmp(epa, epb, NULL); + struct dictdb *db = closure; + return headword_compare_allchars(epa->word, epb->word, db, 0); } static int @@ -654,11 +624,10 @@ uniq_comp(const void *a, const void *b, void *closure) { const struct index_entry *epa = a; const struct index_entry *epb = b; - COMPARATOR cmp = closure; - struct index_entry atmp, btmp; + struct dictdb *db = closure; /* Entries differ if their headwords differ */ - if (utf8_strcasecmp(epa->word, epb->word)) + if (headword_compare(epa->word, epb->word, db)) return 1; /* Otherwise, if neither entry has the original headword, they are equal */ @@ -668,14 +637,12 @@ uniq_comp(const void *a, const void *b, void *closure) if (!epa->orig || !epb->orig) return 1; /* We have both original headwords. Compare them to decide. 
*/ - atmp.word = epa->orig; - btmp.word = epb->orig; - return cmp(&atmp, &btmp, NULL); + return headword_compare(epa->orig, epb->orig, db); } static int common_match(struct dictdb *db, const char *word, - COMPARATOR compare, + int (*compare)(const void *, const void *, void *), int unique, struct result *res) { struct index_entry x, *ep; @@ -685,7 +652,7 @@ common_match(struct dictdb *db, const char *word, x.wordlen = utf8_strlen(word); compare_count = 0; ep = dico_bsearch(&x, db->index, db->numwords, sizeof(db->index[0]), - compare, NULL); + compare, db); if (ep) { res->type = result_match; res->db = db; @@ -696,7 +663,7 @@ common_match(struct dictdb *db, const char *word, } res->itr = NULL; if (unique) { - dico_list_set_comparator(res->list, uniq_comp, case_comparator(db)); + dico_list_set_comparator(res->list, uniq_comp, db); dico_list_set_flags(res->list, DICO_LIST_COMPARE_TAIL); } for (; ep < db->index + db->numwords @@ -713,7 +680,7 @@ common_match(struct dictdb *db, const char *word, static int exact_match(struct dictdb *db, const char *word, struct result *res) { - return common_match(db, word, comparator(db), 0, res); + return common_match(db, word, compare_index_entry, 0, res); } static int @@ -721,11 +688,12 @@ compare_prefix(const void *a, const void *b, void *closure) { const struct index_entry *pkey = a; const struct index_entry *pelt = b; + struct dictdb *db = closure; size_t wordlen = pkey->wordlen; compare_count++; if (pelt->wordlen < wordlen) return -1; - return utf8_strncasecmp(pkey->word, pelt->word, wordlen); + return headword_compare_allchars(pkey->word, pelt->word, db, wordlen); } static int @@ -739,11 +707,12 @@ compare_rev_prefix(const void *a, const void *b, void *closure) { const struct rev_entry *pkey = a; const struct rev_entry *pelt = b; + struct dictdb *db = closure; size_t wordlen = pkey->ptr->wordlen; if (pelt->ptr->wordlen < wordlen) wordlen = pelt->ptr->wordlen; compare_count++; - return utf8_strncasecmp(pkey->word, pelt->word, 
wordlen); + return headword_compare_allchars(pkey->word, pelt->word, db, wordlen); } static int @@ -771,7 +740,7 @@ suffix_match(struct dictdb *db, const char *word, struct result *res) compare_count = 0; ep = dico_bsearch(&x, db->suf_index, db->numwords, sizeof(db->suf_index[0]), - compare_rev_prefix, NULL); + compare_rev_prefix, db); if (ep) { struct rev_entry *p; struct index_entry **tmp; @@ -796,8 +765,7 @@ suffix_match(struct dictdb *db, const char *word, struct result *res) tmp[j++] = p->ptr; count = j; - dico_sort(tmp, count, sizeof(tmp[0]), compare_entry_ptr, - case_comparator(db)); + dico_sort(tmp, count, sizeof(tmp[0]), compare_entry_ptr, db); list = dico_list_create(); if (!list) { @@ -806,7 +774,7 @@ suffix_match(struct dictdb *db, const char *word, struct result *res) free(tmp); return 1; } - dico_list_set_comparator(list, uniq_comp, case_comparator(db)); + dico_list_set_comparator(list, uniq_comp, db); dico_list_set_flags(list, DICO_LIST_COMPARE_TAIL); for (i = 0; i < count; i++) dico_list_append(list, tmp[i]); @@ -836,7 +804,7 @@ find_db_entry(struct dictdb *db, const char *name) x.length = strlen(name); x.wordlen = utf8_strlen(name); ep = dico_bsearch(&x, db->index, db->numwords, sizeof(db->index[0]), - comparator(db), NULL); + compare_index_entry, db); if (!ep) return NULL; buf = malloc(ep->size + 1); @@ -865,7 +833,7 @@ get_db_flag(struct dictdb *db, const char *name) x.length = strlen(name); x.wordlen = utf8_strlen(name); return dico_bsearch(&x, db->index, db->numwords, sizeof(db->index[0]), - comparator(db), NULL) != NULL; + compare_index_entry, db) != NULL; } static char * @@ -937,7 +905,7 @@ _match_all(struct dictdb *db, dico_strategy_t strat, const char *word) return NULL; } - dico_list_set_comparator(list, uniq_comp, case_comparator(db)); + dico_list_set_comparator(list, uniq_comp, db); dico_list_set_flags(list, DICO_LIST_COMPARE_TAIL); if (dico_key_init(&key, strat, word)) { @@ -998,7 +966,7 @@ mod_define(dico_handle_t hp, const char *word) 
if (RESERVED_WORD(db, word)) return NULL; - rc = common_match(db, word, comparator(db), 0, &res); + rc = common_match(db, word, compare_index_entry, 0, &res); if (rc) return NULL; rp = malloc(sizeof(*rp)); |