diff options
author | Sergey Poznyakoff <gray@gnu.org> | 2019-02-18 10:32:43 +0200 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org> | 2019-02-18 10:32:43 +0200 |
commit | 7d66d87691efd2004306c57624d61a0391056cc5 (patch) | |
tree | ee01f286294717504cd2593bc7df75582b349bb3 | |
parent | bbdaff2c31a5cffe362cffdc1147546ac25a6f06 (diff) | |
download | dico-7d66d87691efd2004306c57624d61a0391056cc5.tar.gz dico-7d66d87691efd2004306c57624d61a0391056cc5.tar.bz2 |
Modify utf8 iterator to correctly handle non-zero-terminated strings.
* include/dico/utf8.h (utf8_iterator): New fields: length, err, end.
(utf8_iter_err_p,utf8_iter_init): New prototypes.
* lib/utf8.c (utf8_iter0): Add boundary checking. Set end and error
flags as appropriate.
(utf8_iter_end_p): Rewrite.
(utf8_iter_err_p): New function.
(utf8_iter_init): New function.
(utf8_iter_first): Rewrite using utf8_iter_init.
(utf8_iter_next): Update length.
* lib/linetrimstr.c (_linetrimstr_find_end): Don't assume zero-terminated
string.
* modules/dict.org/dictorg.c (revert_word): Likewise.
* modules/outline/outline.c (revert_word): Likewise.
-rw-r--r-- | include/dico/utf8.h | 5 | ||||
-rw-r--r-- | lib/linetrimstr.c | 21 | ||||
-rw-r--r-- | lib/utf8.c | 37 | ||||
-rw-r--r-- | modules/dict.org/dictorg.c | 2 | ||||
-rw-r--r-- | modules/outline/outline.c | 2 |
5 files changed, 46 insertions, 21 deletions
diff --git a/include/dico/utf8.h b/include/dico/utf8.h index 953a4f1..807f6b7 100644 --- a/include/dico/utf8.h +++ b/include/dico/utf8.h @@ -24,18 +24,23 @@ size_t utf8_strlen (const char *s); size_t utf8_strbytelen (const char *s); struct utf8_iterator { char *string; char *curptr; unsigned curwidth; + size_t length; + int end:1; + int err:1; }; #define utf8_iter_isascii(itr) \ ((itr).curwidth == 1 && isascii((itr).curptr[0])) int utf8_iter_end_p(struct utf8_iterator *itr); +int utf8_iter_err_p(struct utf8_iterator *itr); +int utf8_iter_init(struct utf8_iterator *itr, char *ptr, size_t size); int utf8_iter_first(struct utf8_iterator *itr, char *ptr); int utf8_iter_next(struct utf8_iterator *itr); int utf8_mbtowc_internal (void *data, int (*read) (void*), unsigned int *pwc); int utf8_wctomb (char *r, unsigned int wc); diff --git a/lib/linetrimstr.c b/lib/linetrimstr.c index 3ed8483..7cd100e 100644 --- a/lib/linetrimstr.c +++ b/lib/linetrimstr.c @@ -34,51 +34,44 @@ struct _linetrimstr { #define ISWS(c) ((c)==' '||(c)=='\t'||(c)=='\n') static size_t _linetrimstr_find_end(struct _linetrimstr *s, const char *buf, size_t size, size_t *psize) { - const char *end = buf + size; struct utf8_iterator itr; const char *wordstart = buf; - utf8_iter_first(&itr, (char*)buf); - do { - for (; utf8_iter_isascii(itr) && ISWS(*itr.curptr); + utf8_iter_init(&itr, (char*)buf, size); + while (!utf8_iter_end_p(&itr)) { + for (; !utf8_iter_end_p(&itr) + && utf8_iter_isascii(itr) && ISWS(*itr.curptr); utf8_iter_next(&itr)) { - if (itr.curptr >= end) { - *psize = itr.curptr - buf; - return 0; - } if (*itr.curptr == '\n') s->linelen = 0; else if (++s->linelen >= s->maxlen) { *psize = wordstart > buf ? 
wordstart - buf : itr.curptr - buf; s->linelen = 0; return 1; } } wordstart = itr.curptr; - for (; !(utf8_iter_isascii(itr) && ISWS(*itr.curptr)); + for (; !utf8_iter_end_p(&itr) + && !(utf8_iter_isascii(itr) && ISWS(*itr.curptr)); utf8_iter_next(&itr)) { - if (itr.curptr >= end) { - *psize = itr.curptr - buf; - return 0; - } if (++s->linelen >= s->maxlen) { size_t size = itr.string == wordstart ? itr.curptr - buf : wordstart - itr.string; s->linelen = 0; if (size > 0) { *psize = size; return 1; } } } - } while (itr.curptr < end); + } *psize = itr.curptr - buf; return 0; } static int _linetrimstr_write(void *data, const char *buf, size_t size, size_t *pret) @@ -1537,39 +1537,66 @@ utf8_strlen(const char *s) } static int utf8_iter0(struct utf8_iterator *itr) { - size_t n = utf8_char_width(itr->curptr); - if (n == 0) + size_t n; + + if (itr->length == 0) { + itr->end = 1; + return 1; + } + n = utf8_char_width(itr->curptr); + if (n > itr->length) { + itr->end = 1; + itr->err = 1; return 1; + } + itr->curwidth = n; return 0; } int utf8_iter_end_p(struct utf8_iterator *itr) { - return *itr->curptr == 0; + return itr->end; } int -utf8_iter_first(struct utf8_iterator *itr, char *ptr) +utf8_iter_err_p(struct utf8_iterator *itr) +{ + return itr->end; +} + +int +utf8_iter_init(struct utf8_iterator *itr, char *ptr, size_t size) { itr->string = ptr; itr->curptr = ptr; + itr->length = size; + itr->curwidth = 0; + itr->err = 0; + itr->end = 0; return utf8_iter0(itr); } int +utf8_iter_first(struct utf8_iterator *itr, char *ptr) +{ + return utf8_iter_init(itr, ptr, strlen(ptr)); +} + +int utf8_iter_next(struct utf8_iterator *itr) { - if (*itr->curptr == 0) + if (utf8_iter_end_p(itr)) return -1; itr->curptr += itr->curwidth; + itr->length -= itr->curwidth; return utf8_iter0(itr); } /* Stores the UTF-8 representation of the Unicode character wc in r[0..5]. Returns the number of bytes stored, or -1 if wc is out of range. 
*/ diff --git a/modules/dict.org/dictorg.c b/modules/dict.org/dictorg.c index ee9c26f..7eacfbc 100644 --- a/modules/dict.org/dictorg.c +++ b/modules/dict.org/dictorg.c @@ -524,13 +524,13 @@ static void revert_word(char *dst, const char *src, size_t len) { struct utf8_iterator itr; char *p = dst + len; *p = 0; - for (utf8_iter_first(&itr, (char *)src); + for (utf8_iter_init(&itr, (char *)src, len); !utf8_iter_end_p(&itr); utf8_iter_next(&itr)) { p -= itr.curwidth; if (p < dst) break; memcpy(p, itr.curptr, itr.curwidth); diff --git a/modules/outline/outline.c b/modules/outline/outline.c index 18e7dd3..e27530a 100644 --- a/modules/outline/outline.c +++ b/modules/outline/outline.c @@ -264,13 +264,13 @@ static void revert_word(char *dst, const char *src, size_t len) { struct utf8_iterator itr; char *p = dst + len; *p = 0; - for (utf8_iter_first(&itr, (char *)src); + for (utf8_iter_init(&itr, (char *)src, len); !utf8_iter_end_p(&itr); utf8_iter_next(&itr)) { p -= itr.curwidth; if (p < dst) break; memcpy(p, itr.curptr, itr.curwidth); |