utf8_compare: new general-purpose comparator function

* lib/utf8.c (utf8_strcmp_cc, utf8_strcmp_alnumspace_cc) (utf8_strcmp_alnumspace, utf8_strcasecmp_alnumspace): Remove. (utf8_compare): New function. (utf8_strcmp, utf8_strcasecmp) (utf8_strncasecmp): Rewrite as wrappers over utf8_compare.
* include/dico/utf8.h: Update protos.
* modules/dict.org/dictorg.c: Use new UTF-8 comparators.
author: Sergey Poznyakoff <gray@gnu.org> 2018-09-16 10:24:48 +0300
committer: Sergey Poznyakoff <gray@gnu.org> 2018-09-16 10:24:48 +0300
commit: 1c5e14432740e8af3a42e5e83a396e82221499ad (patch)
tree: 622ba85b52b1e6c09e7acd01d22419d2df4b5bfc
parent: b785e98d2a06eb1f74f71337431e38f896f397a2 (diff)
download: dico-1c5e14432740e8af3a42e5e83a396e82221499ad.tar.gz
dico-1c5e14432740e8af3a42e5e83a396e82221499ad.tar.bz2
3 files changed, 99 insertions, 198 deletions
diff --git a/include/dico/utf8.h b/include/dico/utf8.h
index c100a15..953a4f1 100644
--- a/include/dico/utf8.h
+++ b/include/dico/utf8.h
@@ -42,15 +42,19 @@ int utf8_wctomb (char *r, unsigned int wc);
 int utf8_symcmp(char *a, char *b);
 int utf8_symcasecmp(char *a, char *b);
 
-int utf8_strcmp_cc(char *a, char *b, int ci);
+int utf8_strcmp_cc(char const *a, char const *b, int ci);
 
-int utf8_strcmp(char *a, char *b);
-int utf8_strcasecmp(char *a, char *b);
-int utf8_strncasecmp(char *a, char *b, size_t maxlen);
+enum {
+    case_sensitive,
+    case_insensitive
+};
+
+int utf8_compare(char const *a, char const *b, int ci, size_t maxlen,
+		 int (*wcsel)(unsigned));
 
-int utf8_strcmp_alnumspace_cc(char *a, char *b, int ci);
-int utf8_strcmp_alnumspace(char *a, char *b);
-int utf8_strcasecmp_alnumspace(char *a, char *b);
+int utf8_strcmp(char const *a, char const *b);
+int utf8_strcasecmp(char const *a, char const *b);
+int utf8_strncasecmp(char const *a, char const *b, size_t maxlen);
 
 unsigned utf8_wc_toupper (unsigned wc);
 int utf8_toupper (char *s);
diff --git a/lib/utf8.c b/lib/utf8.c
index 837a594..3ba6048 100644
--- a/lib/utf8.c
+++ b/lib/utf8.c
@@ -1748,120 +1748,43 @@ urf8_symcasecmp(char *a, char *b)
 	return 1;
     return 0;
 }
-
-enum {
-    case_sensitive,
-    case_insensitive
-};
-
-int
-utf8_strcmp_cc(char *a, char *b, int ci)
-{
-    int alen, blen;
-
-    for (; *a; a += alen, b += blen) {
-	unsigned wa, wb;
-
-	if (*b == 0)
-	    return 1;
-
-	alen = utf8_char_width(a);
-	if (alen == 0)
-	    return -1;
-	utf8_mbtowc(&wa, a, alen);
-	blen = utf8_char_width(b);
-	if (blen == 0)
-	    return 1;
-	utf8_mbtowc(&wb, b, blen);
-	if (ci == case_insensitive) {
-	    wa = utf8_wc_toupper(wa);
-	    wb = utf8_wc_toupper(wb);
-	}
-	if (wa < wb)
-	    return -1;
-	if (wa > wb)
-	    return 1;
-
-    }
-    if (*b)
-	return -1;
-    return 0;
-}
-
-int
-utf8_strcmp(char *a, char *b)
-{
-    return utf8_strcmp_cc(a, b, case_sensitive);
-}
-
-int
-utf8_strcasecmp(char *a, char *b)
-{
-    return utf8_strcmp_cc(a, b, case_insensitive);
-}
-
-int
-utf8_strncasecmp(char *a, char *b, size_t maxlen)
-{
-    int alen, blen;
-    unsigned asz = 0, bsz = 0;
-
-    while (asz < maxlen) {
-	unsigned wa, wb;
-
-	if (*a == 0)
-	    return (*b == 0) ? 0 : -1;
-
-	if (*b == 0)
-	    return 1;
-
-	alen = utf8_char_width(a);
-	if (alen == 0)
-	    return -1;
-	utf8_mbtowc(&wa, a, alen);
-	blen = utf8_char_width(b);
-	if (blen == 0)
-	    return 1;
-	utf8_mbtowc(&wb, b, blen);
-	wa = utf8_wc_toupper(wa);
-	wb = utf8_wc_toupper(wb);
-	if (wa < wb)
-	    return -1;
-	if (wa > wb)
-	    return 1;
-	a += alen;
-	b += blen;
-	asz ++;
-	bsz ++;
-    }
-    return 0;
-}
 
-#define is_alnumspace(c) (utf8_wc_is_alnum(c) || utf8_wc_is_space(c))
 int
-utf8_strcmp_alnumspace_cc(char *a, char *b, int ci)
+utf8_compare(char const *a, char const *b,
+	     int ci, size_t maxlen, int (*wcsel)(unsigned))
 {
     int alen, blen;
+    size_t an = 0, bn = 0;
     unsigned wa, wb;
 
-    while (*a) {	
+    while (1) {
+	if (maxlen != 0 && an == maxlen)
+	    return 0;
+	if (*a == 0)
+	    break;
+	
 	alen = utf8_char_width(a);
 	if (alen == 0)
 	    return -1;
+	
 	utf8_mbtowc(&wa, a, alen);
 	a += alen;
+	an++;
 
-	if (is_alnumspace(wa)) {
+	if (!wcsel || wcsel(wa)) {
 	    if (*b == 0)
 		return 1;
 	    while (*b) {
+		if (maxlen != 0 && bn == maxlen)
+		    return 0;
 		blen = utf8_char_width(b);
 		if (blen == 0)
 		    return 1;
 		utf8_mbtowc(&wb, b, blen);
 		b += blen;
+		bn++;
 		
-		if (is_alnumspace(wb)) {
+		if (!wcsel || wcsel(wb)) {
 		    if (ci == case_insensitive) {
 			wa = utf8_wc_toupper(wa);
 			wb = utf8_wc_toupper(wb);
@@ -1882,23 +1805,29 @@ utf8_strcmp_alnumspace_cc(char *a, char *b, int ci)
 	    return 1;
 	utf8_mbtowc(&wb, b, blen);
 	b += blen;
-	if (is_alnumspace(wb))
+	if (!wcsel || wcsel(wb))
 	    return -1;
     }
     
     return 0;
 }
+
+int
+utf8_strcmp(char const *a, char const *b)
+{
+    return utf8_compare(a, b, case_sensitive, 0, NULL);
+}
 
 int
-utf8_strcmp_alnumspace(char *a, char *b)
+utf8_strcasecmp(char const *a, char const *b)
 {
-    return utf8_strcmp_alnumspace_cc(a, b, case_sensitive);
+    return utf8_compare(a, b, case_insensitive, 0, NULL);
 }
 
 int
-utf8_strcasecmp_alnumspace(char *a, char *b)
+utf8_strncasecmp(char const *a, char const *b, size_t maxlen)
 {
-    return utf8_strcmp_alnumspace_cc(a, b, case_insensitive);
+    return utf8_compare(a, b, case_insensitive, maxlen, NULL);
 }
 
 unsigned
diff --git a/modules/dict.org/dictorg.c b/modules/dict.org/dictorg.c
index fbaec48..bec219a 100644
--- a/modules/dict.org/dictorg.c
+++ b/modules/dict.org/dictorg.c
@@ -22,7 +22,42 @@ static int sort_index;
 static int trim_ws;
 static int show_dictorg_entries;
 
-typedef int (*COMPARATOR) (const void *, const void *, void *closure);
+static int
+is_alnumspace(unsigned c)
+{
+    return utf8_wc_is_alnum(c) || utf8_wc_is_space(c);
+}
+
+static inline int
+headword_compare(char const *a, char const *b, struct dictdb *db)
+{
+    return utf8_compare(a, b,
+			db->flag_casesensitive
+			  ? case_sensitive : case_insensitive,
+			0,
+			db->flag_allchars ? NULL : is_alnumspace);
+}
+
+static inline int
+headword_compare_allchars(char const *a, char const *b, struct dictdb *db,
+			  size_t len)
+{
+    return utf8_compare(a, b,
+			db->flag_casesensitive
+			  ? case_sensitive : case_insensitive,
+			len,
+			NULL);
+}
+
+static int
+compare_index_entry(const void *a, const void *b, void *closure)
+{
+    const struct index_entry *epa = a;
+    const struct index_entry *epb = b;
+    compare_count++;
+    return headword_compare(epa->word, epb->word, (struct dictdb *)closure);
+}
+
 static int get_db_flag(struct dictdb *db, const char *name);
     
 static int register_strategies(void);
@@ -132,9 +167,6 @@ b64_decode(const char *val, size_t len, size_t *presult)
     return 0;
 }
 
-static COMPARATOR comparator(struct dictdb *db);
-static COMPARATOR case_comparator(struct dictdb *db);
-
 static int
 parse_index_entry(const char *filename, size_t line,
 		  dico_list_t list, char *buf, int tws)
@@ -480,12 +512,11 @@ mod_init_db(const char *dbname, int argc, char **argv)
     if (sort_option) {
 	/* Sort index entries */
 	dico_sort(db->index, db->numwords, sizeof(db->index[0]),
-		  comparator(db), NULL);
+		  compare_index_entry, db);
     }
     
     return (dico_handle_t)db;
 }
-
 
 static void
 revert_word(char *dst, const char *src, size_t len)
@@ -505,11 +536,11 @@ revert_word(char *dst, const char *src, size_t len)
 }
 
 static int
-compare_rev_entry(const void *a, const void *b)
+compare_rev_entry(const void *a, const void *b, void *closure)
 {
     struct rev_entry const *epa = a;
     struct rev_entry const *epb = b;
-    return utf8_strcasecmp(epa->word, epb->word);
+    return headword_compare_allchars(epa->word, epb->word, closure, 0);
 }
 
 static int
@@ -533,8 +564,8 @@ init_suffix_index(struct dictdb *db)
 	    db->suf_index[i].word = p;
 	    db->suf_index[i].ptr = &db->index[i];
 	}
-        qsort(db->suf_index, db->numwords, sizeof(db->suf_index[0]),
-	      compare_rev_entry);
+        dico_sort(db->suf_index, db->numwords, sizeof(db->suf_index[0]),
+		  compare_rev_entry, db);
     }
     return 0;
 }    
@@ -580,73 +611,12 @@ register_strategies(void)
 }
 
 static int
-compare_allchars(const void *a, const void *b, void *closure)
-{
-    const struct index_entry *epa = a;
-    const struct index_entry *epb = b;
-    compare_count++;
-    return utf8_strcmp(epa->word, epb->word);
-}
-
-static int
-compare_allchars_ci(const void *a, const void *b, void *closure)
-{
-    const struct index_entry *epa = a;
-    const struct index_entry *epb = b;
-    compare_count++;
-    return utf8_strcasecmp(epa->word, epb->word);
-}
-
-static int
-compare_alnumspace(const void *a, const void *b, void *closure)
-{
-    const struct index_entry *epa = a;
-    const struct index_entry *epb = b;
-    compare_count++;
-    return utf8_strcmp_alnumspace(epa->word, epb->word);
-}
-
-static int
-compare_alnumspace_ci(const void *a, const void *b, void *closure)
-{
-    const struct index_entry *epa = a;
-    const struct index_entry *epb = b;
-    compare_count++;
-    return utf8_strcasecmp_alnumspace(epa->word, epb->word);
-}
-
-static COMPARATOR
-comparator(struct dictdb *db)
-{
-    if (db->flag_allchars) {
-	if (db->flag_casesensitive)
-	    return compare_allchars;
-	else
-	    return compare_allchars_ci;
-    } else {
-	if (db->flag_casesensitive)
-	    return compare_alnumspace;
-	else
-	    return compare_alnumspace_ci;
-    }
-}
-
-static COMPARATOR
-case_comparator(struct dictdb *db)
-{
-    if (db->flag_casesensitive)
-	return compare_allchars;
-    else
-	return compare_allchars_ci;
-}
-
-static int
 compare_entry_ptr(const void *a, const void *b, void *closure)
 {
     const struct index_entry *epa = *(const struct index_entry **)a;
     const struct index_entry *epb = *(const struct index_entry **)b;
-    COMPARATOR cmp = closure;
-    return cmp(epa, epb, NULL);
+    struct dictdb *db = closure;
+    return headword_compare_allchars(epa->word, epb->word, db, 0);
 }
 
 static int
@@ -654,11 +624,10 @@ uniq_comp(const void *a, const void *b, void *closure)
 {
     const struct index_entry *epa = a;
     const struct index_entry *epb = b;
-    COMPARATOR cmp = closure;
-    struct index_entry atmp, btmp;
+    struct dictdb *db = closure;
     
     /* Entries differ if their headwords differ */
-    if (utf8_strcasecmp(epa->word, epb->word))
+    if (headword_compare(epa->word, epb->word, db))
 	return 1;
     /* Otherwise, if neither entry has the original headword, they
        are equal */
@@ -668,14 +637,12 @@ uniq_comp(const void *a, const void *b, void *closure)
     if (!epa->orig || !epb->orig)
 	return 1;
     /* We have both original headwords. Compare them to decide. */
-    atmp.word = epa->orig;
-    btmp.word = epb->orig;
-    return cmp(&atmp, &btmp, NULL);
+    return headword_compare(epa->orig, epb->orig, db);
 }
 
 static int
 common_match(struct dictdb *db, const char *word,
-	     COMPARATOR compare,
+	     int (*compare)(const void *, const void *, void *),
 	     int unique, struct result *res)
 {
     struct index_entry x, *ep;
@@ -685,7 +652,7 @@ common_match(struct dictdb *db, const char *word,
     x.wordlen = utf8_strlen(word);
     compare_count = 0;
     ep = dico_bsearch(&x, db->index, db->numwords, sizeof(db->index[0]),
-		      compare, NULL);
+		      compare, db);
     if (ep) {
 	res->type = result_match;
 	res->db = db;
@@ -696,7 +663,7 @@ common_match(struct dictdb *db, const char *word,
 	}
 	res->itr = NULL;
 	if (unique) {
-	    dico_list_set_comparator(res->list, uniq_comp, case_comparator(db));
+	    dico_list_set_comparator(res->list, uniq_comp, db);
 	    dico_list_set_flags(res->list, DICO_LIST_COMPARE_TAIL);
 	}
 	for (; ep < db->index + db->numwords
@@ -713,7 +680,7 @@ common_match(struct dictdb *db, const char *word,
 static int
 exact_match(struct dictdb *db, const char *word, struct result *res)
 {
-    return common_match(db, word, comparator(db), 0, res);
+    return common_match(db, word, compare_index_entry, 0, res);
 }
 
 static int
@@ -721,11 +688,12 @@ compare_prefix(const void *a, const void *b, void *closure)
 {
     const struct index_entry *pkey = a;
     const struct index_entry *pelt = b;
+    struct dictdb *db = closure;
     size_t wordlen = pkey->wordlen;
     compare_count++;
     if (pelt->wordlen < wordlen)
 	return -1;
-    return utf8_strncasecmp(pkey->word, pelt->word, wordlen);
+    return headword_compare_allchars(pkey->word, pelt->word, db, wordlen);
 }
 
 static int
@@ -739,11 +707,12 @@ compare_rev_prefix(const void *a, const void *b, void *closure)
 {
     const struct rev_entry *pkey = a;
     const struct rev_entry *pelt = b;
+    struct dictdb *db = closure;
     size_t wordlen = pkey->ptr->wordlen;
     if (pelt->ptr->wordlen < wordlen)
 	wordlen = pelt->ptr->wordlen;
     compare_count++;
-    return utf8_strncasecmp(pkey->word, pelt->word, wordlen);
+    return headword_compare_allchars(pkey->word, pelt->word, db, wordlen);
 }
 
 static int
@@ -771,7 +740,7 @@ suffix_match(struct dictdb *db, const char *word, struct result *res)
     
     compare_count = 0;
     ep = dico_bsearch(&x, db->suf_index, db->numwords, sizeof(db->suf_index[0]),
-		      compare_rev_prefix, NULL);
+		      compare_rev_prefix, db);
     if (ep) {
 	struct rev_entry *p;
 	struct index_entry **tmp;
@@ -796,8 +765,7 @@ suffix_match(struct dictdb *db, const char *word, struct result *res)
 		tmp[j++] = p->ptr;
 	
 	count = j;
-	dico_sort(tmp, count, sizeof(tmp[0]), compare_entry_ptr,
-		  case_comparator(db));
+	dico_sort(tmp, count, sizeof(tmp[0]), compare_entry_ptr, db);
 
 	list = dico_list_create();
 	if (!list) {
@@ -806,7 +774,7 @@ suffix_match(struct dictdb *db, const char *word, struct result *res)
 	    free(tmp);
 	    return 1;
 	}
-	dico_list_set_comparator(list, uniq_comp, case_comparator(db));
+	dico_list_set_comparator(list, uniq_comp, db);
 	dico_list_set_flags(list, DICO_LIST_COMPARE_TAIL);
 	for (i = 0; i < count; i++) 
 	    dico_list_append(list, tmp[i]);
@@ -836,7 +804,7 @@ find_db_entry(struct dictdb *db, const char *name)
     x.length = strlen(name);
     x.wordlen = utf8_strlen(name);
     ep = dico_bsearch(&x, db->index, db->numwords, sizeof(db->index[0]),
-		      comparator(db), NULL);
+		      compare_index_entry, db);
     if (!ep)
 	return NULL;
     buf = malloc(ep->size + 1);
@@ -865,7 +833,7 @@ get_db_flag(struct dictdb *db, const char *name)
     x.length = strlen(name);
     x.wordlen = utf8_strlen(name);
     return dico_bsearch(&x, db->index, db->numwords, sizeof(db->index[0]),
-			comparator(db), NULL) != NULL;
+			compare_index_entry, db) != NULL;
 }
 
 static char *
@@ -937,7 +905,7 @@ _match_all(struct dictdb *db, dico_strategy_t strat, const char *word)
 	return NULL;
     }
 
-    dico_list_set_comparator(list, uniq_comp, case_comparator(db));
+    dico_list_set_comparator(list, uniq_comp, db);
     dico_list_set_flags(list, DICO_LIST_COMPARE_TAIL);
 
     if (dico_key_init(&key, strat, word)) {
@@ -998,7 +966,7 @@ mod_define(dico_handle_t hp, const char *word)
     if (RESERVED_WORD(db, word))
 	return NULL;
     
-    rc = common_match(db, word, comparator(db), 0, &res);
+    rc = common_match(db, word, compare_index_entry, 0, &res);
     if (rc)
 	return NULL;
     rp = malloc(sizeof(*rp));
author	Sergey Poznyakoff <gray@gnu.org>	2018-09-16 10:24:48 +0300
committer	Sergey Poznyakoff <gray@gnu.org>	2018-09-16 10:24:48 +0300
commit	1c5e14432740e8af3a42e5e83a396e82221499ad (patch)
tree	622ba85b52b1e6c09e7acd01d22419d2df4b5bfc
parent	b785e98d2a06eb1f74f71337431e38f896f397a2 (diff)
download	dico-1c5e14432740e8af3a42e5e83a396e82221499ad.tar.gz dico-1c5e14432740e8af3a42e5e83a396e82221499ad.tar.bz2