diff options
Diffstat (limited to 'src/ellinika/utf8scm.c')
-rw-r--r-- | src/ellinika/utf8scm.c | 149 |
1 files changed, 149 insertions, 0 deletions
diff --git a/src/ellinika/utf8scm.c b/src/ellinika/utf8scm.c new file mode 100644 index 0000000..89f5fba --- /dev/null +++ b/src/ellinika/utf8scm.c @@ -0,0 +1,149 @@ +/* This file is part of Ellinika project. + Copyright (C) 2011 Sergey Poznyakoff + + Ellinika is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3 of the License, or + (at your option) any later version. + + Ellinika is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif +#include <errno.h> +#include <stdlib.h> +#include <libguile.h> +#include "utf8.h" + + +SCM_DEFINE_PUBLIC(scm_utf8_toupper, "utf8-toupper", 1, 0, 0, + (SCM string), +"Convert STRING to uppercase\n") +#define FUNC_NAME s_scm_utf8_toupper +{ + char *str; + SCM scm; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + str = scm_to_locale_string(string); + if (utf8_toupper(str, strlen(str))) + scm_misc_error(FUNC_NAME, + "cannot convert to upper case: ~S", + scm_list_1(string)); + scm = scm_from_locale_string(str); + free(str); + return scm; +} +#undef FUNC_NAME + +SCM_DEFINE_PUBLIC(scm_utf8_tolower, "utf8-tolower", 1, 0, 0, + (SCM string), +"Convert STRING to lowercase\n") +#define FUNC_NAME s_scm_utf8_tolower +{ + char *str; + SCM scm; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + str = scm_to_locale_string(string); + if (utf8_tolower(str, strlen(str))) + scm_misc_error(FUNC_NAME, + "cannot convert to lower case: ~S", + scm_list_1(string)); + scm = scm_from_locale_string(str); + free(str); + return scm; +} +#undef FUNC_NAME + +static int +memberof(unsigned *w, size_t len, unsigned c) +{ + while (len--) + if (*w++ == c) + return 1; + return 0; +} + +SCM_DEFINE_PUBLIC(scm_utf8_escape, "utf8-escape", 1, 1, 0, + (SCM string, SCM escapable), +"Prefix with \\ each occurrence of ESCAPABLE chars in STRING\n") +#define FUNC_NAME s_scm_utf8_escape +{ + SCM scm; + unsigned *escptr, *escbase; + size_t esclen; + char *s; + unsigned *wptr, *nptr; + size_t wlen; + size_t incr, i; + + SCM_ASSERT(scm_is_string(string), string, SCM_ARG1, FUNC_NAME); + s = scm_to_locale_string(string); + + if (utf8_mbstr_to_wc(s, &wptr, &wlen)) + scm_misc_error(FUNC_NAME, + "cannot convert ~S to UTF-8", + scm_list_1(string)); + free(s); + + if (SCM_UNBNDP(escapable)) { + static unsigned default_escapable[] = { '\"', '\\' }; + escbase = NULL; + escptr = default_escapable; + esclen = 2; + } else { + SCM_ASSERT(scm_is_string(escapable), escapable, + SCM_ARG2, FUNC_NAME); + s = scm_to_locale_string(escapable); + if (utf8_mbstr_to_wc(s, &escbase, &esclen)) { + free(wptr); + scm_misc_error(FUNC_NAME, + "cannot convert ~S to UTF-8", + scm_list_1(escapable)); + } + escptr = escbase; + free(s); + } + + incr = 0; + for (i = 0; i < wlen; i++) + if (memberof(escptr, esclen, wptr[i])) + incr++; + + + nptr = calloc(sizeof(nptr[0]), wlen + incr); + if (!nptr) + scm_memory_error(FUNC_NAME); + + for (i = incr = 0; i < wlen; i++) { + if (memberof(escptr, esclen, wptr[i])) + nptr[i + incr++] = '\\'; + nptr[i + incr] = wptr[i]; + } + + free(wptr); + free(escbase); + + if (utf8_wc_to_mbstr(nptr, wlen + incr, &s)) + scm_misc_error(FUNC_NAME, + "cannot convert UTF-8 to Scheme", + SCM_EOL); + scm = scm_from_locale_string(s); + free(s); + return scm; +} +#undef FUNC_NAME + +void +elmorph_utf8scm_init() +{ +#include "utf8scm.x" +} |