aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSergey Poznyakoff <gray@gnu.org>2019-02-18 10:32:43 +0200
committerSergey Poznyakoff <gray@gnu.org>2019-02-18 10:32:43 +0200
commit7d66d87691efd2004306c57624d61a0391056cc5 (patch)
treeee01f286294717504cd2593bc7df75582b349bb3
parentbbdaff2c31a5cffe362cffdc1147546ac25a6f06 (diff)
downloaddico-7d66d87691efd2004306c57624d61a0391056cc5.tar.gz
dico-7d66d87691efd2004306c57624d61a0391056cc5.tar.bz2
Modify utf8 iterator to correctly handle non-zero-terminated strings.
* include/dico/utf8.h (utf8_iterator): New fields: length, err, end. (utf8_iter_err_p,utf8_iter_init): New prototypes. * lib/utf8.c (utf8_iter0): Add boundary checking. Set end and error flags as appropriate. (utf8_iter_end_p): Rewrite. (utf8_iter_err_p): New function. (utf8_iter_init): New function. (utf8_iter_first): Rewrite using utf8_iter_init. (utf8_iter_next): Update length. * lib/linetrimstr.c (_linetrimstr_find_end): Don't assume zero-terminated string. * modules/dict.org/dictorg.c (revert_word): Likewise. * modules/outline/outline.c (revert_word): Likewise.
-rw-r--r--include/dico/utf8.h5
-rw-r--r--lib/linetrimstr.c21
-rw-r--r--lib/utf8.c37
-rw-r--r--modules/dict.org/dictorg.c2
-rw-r--r--modules/outline/outline.c2
5 files changed, 46 insertions, 21 deletions
diff --git a/include/dico/utf8.h b/include/dico/utf8.h
index 953a4f1..807f6b7 100644
--- a/include/dico/utf8.h
+++ b/include/dico/utf8.h
@@ -24,18 +24,23 @@ size_t utf8_strlen (const char *s);
size_t utf8_strbytelen (const char *s);
struct utf8_iterator {
char *string;
char *curptr;
unsigned curwidth;
+ size_t length; /* bytes remaining, counted from curptr */
+ unsigned end:1; /* iteration finished; unsigned so the flag reads back as 1, not -1 */
+ unsigned err:1; /* stopped because a character did not fit in the remaining bytes */
};
#define utf8_iter_isascii(itr) \
((itr).curwidth == 1 && isascii((itr).curptr[0]))
int utf8_iter_end_p(struct utf8_iterator *itr);
+int utf8_iter_err_p(struct utf8_iterator *itr);
+int utf8_iter_init(struct utf8_iterator *itr, char *ptr, size_t size);
int utf8_iter_first(struct utf8_iterator *itr, char *ptr);
int utf8_iter_next(struct utf8_iterator *itr);
int utf8_mbtowc_internal (void *data, int (*read) (void*), unsigned int *pwc);
int utf8_wctomb (char *r, unsigned int wc);
diff --git a/lib/linetrimstr.c b/lib/linetrimstr.c
index 3ed8483..7cd100e 100644
--- a/lib/linetrimstr.c
+++ b/lib/linetrimstr.c
@@ -34,51 +34,44 @@ struct _linetrimstr {
#define ISWS(c) ((c)==' '||(c)=='\t'||(c)=='\n')
static size_t
_linetrimstr_find_end(struct _linetrimstr *s, const char *buf, size_t size,
size_t *psize)
{
- const char *end = buf + size;
struct utf8_iterator itr;
const char *wordstart = buf;
- utf8_iter_first(&itr, (char*)buf);
- do {
- for (; utf8_iter_isascii(itr) && ISWS(*itr.curptr);
+ utf8_iter_init(&itr, (char*)buf, size);
+ while (!utf8_iter_end_p(&itr)) {
+ for (; !utf8_iter_end_p(&itr)
+ && utf8_iter_isascii(itr) && ISWS(*itr.curptr);
utf8_iter_next(&itr)) {
- if (itr.curptr >= end) {
- *psize = itr.curptr - buf;
- return 0;
- }
if (*itr.curptr == '\n')
s->linelen = 0;
else if (++s->linelen >= s->maxlen) {
*psize = wordstart > buf ? wordstart - buf : itr.curptr - buf;
s->linelen = 0;
return 1;
}
}
wordstart = itr.curptr;
- for (; !(utf8_iter_isascii(itr) && ISWS(*itr.curptr));
+ for (; !utf8_iter_end_p(&itr)
+ && !(utf8_iter_isascii(itr) && ISWS(*itr.curptr));
utf8_iter_next(&itr)) {
- if (itr.curptr >= end) {
- *psize = itr.curptr - buf;
- return 0;
- }
if (++s->linelen >= s->maxlen) {
size_t size = itr.string == wordstart ?
itr.curptr - buf : wordstart - itr.string;
s->linelen = 0;
if (size > 0) {
*psize = size;
return 1;
}
}
}
- } while (itr.curptr < end);
+ }
*psize = itr.curptr - buf;
return 0;
}
static int
_linetrimstr_write(void *data, const char *buf, size_t size, size_t *pret)
diff --git a/lib/utf8.c b/lib/utf8.c
index 3ba6048..999bce1 100644
--- a/lib/utf8.c
+++ b/lib/utf8.c
@@ -1537,39 +1537,66 @@ utf8_strlen(const char *s)
}
static int
utf8_iter0(struct utf8_iterator *itr)
{
- size_t n = utf8_char_width(itr->curptr);
- if (n == 0)
+ size_t n;
+
+ if (itr->length == 0) {
+ itr->end = 1;
+ return 1;
+ }
+ n = utf8_char_width(itr->curptr);
+ if (n == 0 || n > itr->length) { /* a zero width would never advance the iterator */
+ itr->end = 1;
+ itr->err = 1;
return 1;
+ }
+
itr->curwidth = n;
return 0;
}
int
utf8_iter_end_p(struct utf8_iterator *itr)
{
- return *itr->curptr == 0;
+ return itr->end;
}
int
-utf8_iter_first(struct utf8_iterator *itr, char *ptr)
+utf8_iter_err_p(struct utf8_iterator *itr)
+{
+ return itr->err; /* the error flag, not the end flag */
+}
+
+int
+utf8_iter_init(struct utf8_iterator *itr, char *ptr, size_t size)
{
itr->string = ptr;
itr->curptr = ptr;
+ itr->length = size;
+ itr->curwidth = 0;
+ itr->err = 0;
+ itr->end = 0;
return utf8_iter0(itr);
}
int
+utf8_iter_first(struct utf8_iterator *itr, char *ptr)
+{
+ return utf8_iter_init(itr, ptr, strlen(ptr));
+}
+
+int
utf8_iter_next(struct utf8_iterator *itr)
{
- if (*itr->curptr == 0)
+ if (utf8_iter_end_p(itr))
return -1;
itr->curptr += itr->curwidth;
+ itr->length -= itr->curwidth;
return utf8_iter0(itr);
}
/* Stores the UTF-8 representation of the Unicode character wc in r[0..5].
Returns the number of bytes stored, or -1 if wc is out of range. */
diff --git a/modules/dict.org/dictorg.c b/modules/dict.org/dictorg.c
index ee9c26f..7eacfbc 100644
--- a/modules/dict.org/dictorg.c
+++ b/modules/dict.org/dictorg.c
@@ -524,13 +524,13 @@ static void
revert_word(char *dst, const char *src, size_t len)
{
struct utf8_iterator itr;
char *p = dst + len;
*p = 0;
- for (utf8_iter_first(&itr, (char *)src);
+ for (utf8_iter_init(&itr, (char *)src, len);
!utf8_iter_end_p(&itr);
utf8_iter_next(&itr)) {
p -= itr.curwidth;
if (p < dst)
break;
memcpy(p, itr.curptr, itr.curwidth);
diff --git a/modules/outline/outline.c b/modules/outline/outline.c
index 18e7dd3..e27530a 100644
--- a/modules/outline/outline.c
+++ b/modules/outline/outline.c
@@ -264,13 +264,13 @@ static void
revert_word(char *dst, const char *src, size_t len)
{
struct utf8_iterator itr;
char *p = dst + len;
*p = 0;
- for (utf8_iter_first(&itr, (char *)src);
+ for (utf8_iter_init(&itr, (char *)src, len);
!utf8_iter_end_p(&itr);
utf8_iter_next(&itr)) {
p -= itr.curwidth;
if (p < dst)
break;
memcpy(p, itr.curptr, itr.curwidth);

Return to:

Send suggestions and report system problems to the System administrator.