diff options
author | Sergey Poznyakoff <gray@gnu.org> | 2018-12-08 12:03:40 +0200 |
---|---|---|
committer | Sergey Poznyakoff <gray@gnu.org> | 2018-12-08 12:03:40 +0200 |
commit | 602f4d93070ac0e762e0cbe3ef72ba792f9c4811 (patch) | |
tree | 8047168c6b23de38973b494b6ec794a81faf3576 | |
parent | 2a684f1cdd7723c2ded277ea2c7e66227b6f3ae1 (diff) | |
download | vmod-dbrw-602f4d93070ac0e762e0cbe3ef72ba792f9c4811.tar.gz vmod-dbrw-602f4d93070ac0e762e0cbe3ef72ba792f9c4811.tar.bz2 |
Implement the $(urlprefixes) built-in function.
* NEWS: Update.
* README: Update.
* configure.ac: Version 2.2.91
* doc/vmod-dbrw.3: Document the $(urlprefixes) built-in function.
* doc/vmod-dbrw.texi: Likewise.
* src/vmod_dbrw.c (parse_flags): Make sure status string is null-terminated.
(do_rewrite): Expand built-in functions in $(). Support urlprefixes.
On debug_level=100, produce a detailed trace of expansions.
* src/wordsplit.c: Pull from grecs commit 9097d529.
* src/wordsplit.h: Likewise.
* tests/initdb.at (rewrite): Change the url column.
* tests/rewrite01.at: Use $(urlprefixes) in the SQL templates.
* tests/rewrite02.at: Likewise.
* tests/rewrite03.at: Likewise.
* tests/rewrite04.at: Likewise.
* tests/rewrite05.at: Likewise.
* tests/rewrite06.at: Likewise.
-rw-r--r-- | NEWS | 31 | ||||
-rw-r--r-- | README | 3 | ||||
-rw-r--r-- | configure.ac | 2 | ||||
-rw-r--r-- | doc/vmod-dbrw.3 | 34 | ||||
-rw-r--r-- | doc/vmod-dbrw.texi | 16 | ||||
-rw-r--r-- | src/vmod_dbrw.c | 127 | ||||
-rw-r--r-- | src/wordsplit.c | 1646 | ||||
-rw-r--r-- | src/wordsplit.h | 214 | ||||
-rw-r--r-- | tests/initdb.at | 10 | ||||
-rw-r--r-- | tests/rewrite01.at | 5 | ||||
-rw-r--r-- | tests/rewrite02.at | 5 | ||||
-rw-r--r-- | tests/rewrite03.at | 5 | ||||
-rw-r--r-- | tests/rewrite04.at | 5 | ||||
-rw-r--r-- | tests/rewrite05.at | 5 | ||||
-rw-r--r-- | tests/rewrite06.at | 5 |
15 files changed, 1664 insertions, 449 deletions
@@ -1,12 +1,12 @@ -vmod-dbrw -- history of user-visible changes. 2018-01-30 +vmod-dbrw -- history of user-visible changes. 2018-12-08 See the end of file for copying conditions. Please send vmod-dbrw bug reports to <gray@gnu.org> -Version 2.2.90 (Git) +Version 2.2.91 (Git) * SQL idle timeout For MySQL backend, the default connection idle timeout is set equal to the value of the MySQL variable 'wait_timeout'. For Postgres, default idle timeout is not yet implemented. @@ -15,12 +15,39 @@ Idle timeout can be configured using the timeout configuration option, e.g.: dbrw.config("mysql", "database=dbrw;user=proxy;timeout=600", {"select dest,pattern,value,flags from rewrite where locate(url,'$url') = 1 order by weight asc;"}); +* The $() functions in SQL templates + +The SQL templates support the use of $() constructs for invoking +built-in functions. So far one function is implemented: + + $(urlprefixes PATH) + +It expands to comma-separated list of properly quoted pathname +prefixes, constructed from its argument. Optional query part is +stripped off the argument prior to expansion. For example + + $(urlprefixes "/local/user/local?a=1") + +expands to: + + '/local/user/local','/local/user','/local' + +This construct is intended for use in SQL IN conditionals, for +example: + + SELECT dest,pattern,value,flags + FROM rewrite + WHERE host='$host' + AND url IN ($(urlprefixes $url)) + ORDER BY length(dest),value,weight DESC + + Version 2.2, 2017-08-10 * Support for Varnish 5.1 @@ -1,8 +1,7 @@ Vmod-dbrw README -Copyright (C) 2013-2017 Sergey Poznyakoff See the end of file for copying conditions. * Introduction This file contains brief information about configuring, testing and running vmod-dbrw. It is *not* intended as a replacement @@ -220,13 +219,13 @@ This way you won't need to supply them to `make check'. 
Send bug reports and suggestions to <gray@gnu.org> * Copyright information: -Copyright (C) 2013-2017 Sergey Poznyakoff +Copyright (C) 2013-2018 Sergey Poznyakoff Permission is granted to anyone to make or distribute verbatim copies of this document as received, in any medium, provided that the copyright notice and this permission notice are preserved, thus giving the recipient permission to redistribute in turn. diff --git a/configure.ac b/configure.ac index 7a1272c..1212a37 100644 --- a/configure.ac +++ b/configure.ac @@ -11,13 +11,13 @@ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with vmod-dbrw. If not, see <http://www.gnu.org/licenses/>. AC_PREREQ(2.69) -AC_INIT([vmod-dbrw], 2.2.90, [gray@gnu.org]) +AC_INIT([vmod-dbrw], 2.2.91, [gray@gnu.org]) AC_CONFIG_AUX_DIR([build-aux]) AC_CONFIG_MACRO_DIR([m4]) AC_CONFIG_SRCDIR(src/vmod_dbrw.vcc) AM_CONFIG_HEADER(config.h) AC_SUBST([AC_VMOD_BASENAME],[dbrw]) diff --git a/doc/vmod-dbrw.3 b/doc/vmod-dbrw.3 index 6f1dba8..4760b6b 100644 --- a/doc/vmod-dbrw.3 +++ b/doc/vmod-dbrw.3 @@ -1,8 +1,8 @@ .\" This file is part of Vmod-dbrw -*- nroff -*- -.\" Copyright (C) 2013-2017 Sergey Poznyakoff +.\" Copyright (C) 2013-2018 Sergey Poznyakoff .\" .\" Vmod-dbrw is free software; you can redistribute it and/or modify .\" it under the terms of the GNU General Public License as published by .\" the Free Software Foundation; either version 3, or (at your option) .\" any later version. .\" @@ -10,13 +10,13 @@ .\" but WITHOUT ANY WARRANTY; without even the implied warranty of .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the .\" GNU General Public License for more details. .\" .\" You should have received a copy of the GNU General Public License .\" along with vmod-dbrw. If not, see <http://www.gnu.org/licenses/>. 
-.TH VMOD-DBRW 1 "January 30, 2018" "VMOD-DBRW" "User Reference" +.TH VMOD-DBRW 1 "December 8, 2018" "VMOD-DBRW" "User Reference" .SH NAME vmod-dbrw \- Database-driven rewrite rules for Varnish Cache .SH SYNOPSIS .B import dbrw; .PP .BI "VOID dbrw.config(STRING " dbtype ", STRING " params ", STRING " query ");" @@ -131,12 +131,38 @@ construct (a \fBvariable reference\fR) with the corresponding \fIVALUE\fR from its argument. Similarly to the shell syntax, the variable reference can be written as \fB${\fINAME\fB}\fR. This form can be used in contexts where the variable name is immediately followed by another letter, to prevent it from being counted as a part of the name. .PP +The special construct +.sp +.EX +$(urlprefixes \fIPATH\fR) +.EE +.sp +expands to a comma-separated list of all possible path prefixes in +\fIPATH\fR. Each element in the list is quoted, so the result can +safely be used in SQL statements. For example, +.sp +.EX +$(urlprefixes "/local/user/login") +.EE +.sp +produces +.sp +.EX + '/local/user/login','/local/user','/local' +.EE +.PP +This statement is usually used in \fBIN\fR SQL constructs, e.g. +.sp +.EX +SELECT * FROM table WHERE url IN ($(urlprefixes $url)) +.EE +.PP The expanded query is then sent to the database server. Handling of the return value depends on the number of fields it contains. .SS Strict matches If the returned set consists of one or two columns, only the first tuple is used and the value of its first column is returned. The second column (if present) is ignored. @@ -245,13 +271,15 @@ The VCL: .EX sub vcl_recv { # It is supposed that the url column contains an SQL-style # wildcard pattern. 
dbrw.config("mysql", "database=varnish;user=varnish;debug=10", {"SELECT dest,pattern,value,flags FROM rewrite - WHERE host='$host' and '$url' like url"}); + WHERE host='$host' + AND url IN ($(urlprefixes $url)) + ORDER BY LENGTH(dest),value DESC"}); set req.http.X-Redirect-To = dbrw.rewrite("host=" + req.http.Host + ";" + "url=" + req.url); if (req.http.X-Redirect-To != "") { return(synth(750, "Redirect")); } diff --git a/doc/vmod-dbrw.texi b/doc/vmod-dbrw.texi index 40fc84d..31b15d1 100644 --- a/doc/vmod-dbrw.texi +++ b/doc/vmod-dbrw.texi @@ -493,25 +493,35 @@ WHERE host='$host' AND LOCATE(url,'$url')==1 ORDER BY weight @end group @end example @noindent -Furthermore, the @code{url} column can contain a SQL wildcard pattern, -in which case the query will look like: +Furthermore, the @code{url} column can contain a path prefix, +which can be matched using the @code{IN} conditional: @example @group SELECT dest,pattern,value,flags FROM rewrite WHERE host='$host' -AND '$url' like $url +AND url IN ($(urlprefixes $url)) ORDER BY weight @end group @end example +Notice the use of the @samp{$(urlprefixes $url)}. This invokes the built-in +@dfn{function} @code{urlprefixes}, which expands to comma-separated +list of properly quoted pathname prefixes, constructed from its +argument. For example, if @samp{$url} is @samp{/local/user/local?a=1}, +then the expansion of @samp{$(urlprefixes $url)} is: + +@example +'/local/user/local','/local/user','/local' +@end example + @node Rewrite @chapter The @code{rewrite} Function @deftypefn {function} string rewrite (string @var{args}) This function is the working horse of the module. 
It rewrites its argument using the database configured in the previous call to diff --git a/src/vmod_dbrw.c b/src/vmod_dbrw.c index d6785d7..63d4ea1 100644 --- a/src/vmod_dbrw.c +++ b/src/vmod_dbrw.c @@ -186,23 +186,27 @@ parse_flags(const char *arg, int *qdisp, int *flags, char status[]) *qdisp = QDISP_DISCARD; else if (strncmp(ws.ws_wordv[i], "redirect=", 9) == 0) { if (!is_http_status(ws.ws_wordv[i] + 9)) { dbrw_error("invalid status code: %s", ws.ws_wordv[i] + 9); rc = 1; - } else + } else { strncpy(status, ws.ws_wordv[i] + 9, HTTP_STATUS_LEN); + status[HTTP_STATUS_LEN] = 0; + } } else if (strncmp(ws.ws_wordv[i], "R=", 2) == 0) { if (!is_http_status(ws.ws_wordv[i] + 2)) { dbrw_error("invalid status code: %s", ws.ws_wordv[i] + 2); rc = 1; - } else + } else { strncpy(status, ws.ws_wordv[i] + 2, HTTP_STATUS_LEN); + status[HTTP_STATUS_LEN] = 0; + } } else { dbrw_error("unrecognized flag: %s", ws.ws_wordv[i]); rc = 1; } } @@ -500,19 +504,119 @@ findmatch(VRT_CTX, struct dbrw_connection *conn, char **param) if (wsflags & WRDSF_REUSE) wordsplit_free(&ws); return res; } +static int +expand_error(char **ret, char const *func, char const *msg) +{ + static char delim[] = ": "; + *ret = malloc(strlen(func) + strlen(msg) + 1); + if (*ret) { + strcat(strcat(strcpy(*ret, func), delim), msg); + return WRDSE_USERERR; + } else + return WRDSE_NOSPACE; +} + +static int +expand_urlprefixes(struct dbrw_connection *cp, char **argv, char **ret) +{ + char *arg; + size_t n, len, i, j; + char *q, *res; + + if (argv[1] == NULL || argv[2] != NULL) + return expand_error(ret, argv[0], "bad arguments"); + + /* Create a copy of the argument */ + if (cp->conf->backend->sql_escape) { + arg = sql_escape(cp, argv[1]); + } else { + arg = strdup(argv[1]); + } + if (!arg) + return WRDSE_NOSPACE; + + /* Cut off eventual query */ + i = j = strcspn(arg, "?"); + arg[i] = 0; + + /* Compute the resulting length */ + len = i; + n = 1; + for (; i > 0; i--) { + if (arg[i] == '/') { + len += i; + n++; + } + } 
+ + /* Count quotes around each member */ + len += n * 2 + n - 1; + + /* Allocate the result */ + res = malloc(len + 1); + if (!res) { + free(arg); + return WRDSE_NOSPACE; + } + + /* Format the result */ + q = res; + i = j; + while (i) { + if (q > res) + *q++ = ','; + *q++ = '\''; + memcpy(q, arg, i); + q += i; + *q++ = '\''; + i--; + while (i > 0 && arg[i] != '/') + i--; + } + *q = 0; + *ret = res; + free(arg); + + return WRDSE_OK; +} + +static struct expcom { + char *com; + int (*exp) (struct dbrw_connection *, char **, char **); +} expcomtab[] = { + { "urlprefixes", expand_urlprefixes }, + { NULL } +}; + +static int +query_command_expand(char **ret, const char *cmd, size_t len, char **argv, + void *clos) +{ + struct expcom *ec; + static char diagmsg[] = "unknown command: "; + + for (ec = expcomtab; ec->com; ec++) { + if (strcmp(ec->com, argv[0]) == 0) + return ec->exp(clos, argv, ret); + } + + return expand_error(ret, argv[0], "unknown command"); +} + static char * do_rewrite(VRT_CTX, struct dbrw_connection *cp, VCL_STRING arg) { struct wordsplit ws, wsenv; int i, rc; char *res; - + int wsflags; + if (sql_connect(cp) || cp->state != state_connected) return NULL; debug(cp->conf, 2, ("vmod_rewrite: splitting arg")); wsenv.ws_delim = ";"; if (wordsplit(arg, &wsenv, WRDSF_NOVAR|WRDSF_NOCMD|WRDSF_DELIM)) { @@ -533,18 +637,25 @@ do_rewrite(VRT_CTX, struct dbrw_connection *cp, VCL_STRING arg) free(wsenv.ws_wordv[i]); wsenv.ws_wordv[i] = p; debug(cp->conf, 3, ("%d: %s",i,p)); } } - debug(cp->conf, 2, ("expanding query")); + debug(cp->conf, 2, ("expanding query {\"%s\"}", cp->conf->query)); ws.ws_env = (const char **)wsenv.ws_wordv; - rc = wordsplit(cp->conf->query, &ws, - WRDSF_NOCMD | WRDSF_QUOTE | - WRDSF_NOSPLIT | - WRDSF_ENV | WRDSF_UNDEF); + ws.ws_command = query_command_expand; + ws.ws_closure = cp; + wsflags = WRDSF_NOSPLIT | WRDSF_CLOSURE | WRDSF_ENV | WRDSF_UNDEF; + + if (cp->conf->debug_level == 100) { + ws.ws_debug = dbrw_debug; + wsflags |= WRDSF_DEBUG | 
WRDSF_SHOWDBG; + } + + rc = wordsplit(cp->conf->query, &ws, wsflags); + if (rc) { dbrw_error("cannot expand query `%s': %s", cp->conf->query, wordsplit_strerror(&ws)); wordsplit_free(&wsenv); return NULL; } diff --git a/src/wordsplit.c b/src/wordsplit.c index f4740bf..bad59b1 100644 --- a/src/wordsplit.c +++ b/src/wordsplit.c @@ -1,8 +1,8 @@ /* wordsplit - a word splitter - Copyright (C) 2009-2014 Sergey Poznyakoff + Copyright (C) 2009-2018 Sergey Poznyakoff This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. @@ -22,12 +22,14 @@ #include <ctype.h> #include <unistd.h> #include <stdlib.h> #include <string.h> #include <stdio.h> #include <stdarg.h> +#include <pwd.h> +#include <glob.h> #if ENABLE_NLS # include <gettext.h> #else # define gettext(msgid) msgid #endif @@ -45,23 +47,29 @@ #define ISALPHA(c) (ISUPPER(c) || ISLOWER(c)) #define ISDIGIT(c) ('0' <= ((unsigned) (c)) && ((unsigned) (c)) <= '9') #define ISXDIGIT(c) (strchr("abcdefABCDEF", c)!=NULL) #define ISALNUM(c) (ISALPHA(c) || ISDIGIT(c)) #define ISPRINT(c) (' ' <= ((unsigned) (c)) && ((unsigned) (c)) <= 127) +#define ISVARBEG(c) (ISALPHA(c) || c == '_') +#define ISVARCHR(c) (ISALNUM(c) || c == '_') + +#define WSP_RETURN_DELIMS(wsp) \ + ((wsp)->ws_flags & WRDSF_RETURN_DELIMS || ((wsp)->ws_options & WRDSO_MAXWORDS)) + #define ALLOC_INIT 128 #define ALLOC_INCR 128 static void _wsplt_alloc_die (struct wordsplit *wsp) { - wsp->ws_error (_("memory exhausted")); + wsp->ws_error ("%s", _("memory exhausted")); abort (); } -static void +static void _wsplt_error (const char *fmt, ...) { va_list ap; va_start (ap, fmt); vfprintf (stderr, fmt, ap); @@ -69,12 +77,21 @@ _wsplt_error (const char *fmt, ...) 
fputc ('\n', stderr); } static void wordsplit_free_nodes (struct wordsplit *); static int +_wsplt_seterr (struct wordsplit *wsp, int ec) +{ + wsp->ws_errno = ec; + if (wsp->ws_flags & WRDSF_SHOWERR) + wordsplit_perror (wsp); + return ec; +} + +static int _wsplt_nomem (struct wordsplit *wsp) { errno = ENOMEM; wsp->ws_errno = WRDSE_NOSPACE; if (wsp->ws_flags & WRDSF_ENOMEMABRT) wsp->ws_alloc_die (wsp); @@ -83,59 +100,137 @@ _wsplt_nomem (struct wordsplit *wsp) if (!(wsp->ws_flags & WRDSF_REUSE)) wordsplit_free (wsp); wordsplit_free_nodes (wsp); return wsp->ws_errno; } +static int wordsplit_run (const char *command, size_t length, + struct wordsplit *wsp, + int flags, int lvl); + +static int wordsplit_init (struct wordsplit *wsp, const char *input, size_t len, + int flags); +static int wordsplit_process_list (struct wordsplit *wsp, size_t start); +static int wordsplit_finish (struct wordsplit *wsp); + +static int +_wsplt_subsplit (struct wordsplit *wsp, struct wordsplit *wss, + char const *str, int len, + int flags, int finalize) +{ + int rc; + + wss->ws_delim = wsp->ws_delim; + wss->ws_debug = wsp->ws_debug; + wss->ws_error = wsp->ws_error; + wss->ws_alloc_die = wsp->ws_alloc_die; + + if (!(flags & WRDSF_NOVAR)) + { + wss->ws_env = wsp->ws_env; + wss->ws_getvar = wsp->ws_getvar; + flags |= wsp->ws_flags & (WRDSF_ENV | WRDSF_ENV_KV | WRDSF_GETVAR); + } + if (!(flags & WRDSF_NOCMD)) + { + wss->ws_command = wsp->ws_command; + } + + if ((flags & (WRDSF_NOVAR|WRDSF_NOCMD)) != (WRDSF_NOVAR|WRDSF_NOCMD)) + { + wss->ws_closure = wsp->ws_closure; + flags |= wsp->ws_flags & WRDSF_CLOSURE; + } + + wss->ws_options = wsp->ws_options; + + flags |= WRDSF_DELIM + | WRDSF_ALLOC_DIE + | WRDSF_ERROR + | WRDSF_DEBUG + | (wsp->ws_flags & (WRDSF_SHOWDBG | WRDSF_SHOWERR | WRDSF_OPTIONS)); + + rc = wordsplit_init (wss, str, len, flags); + if (rc) + return rc; + wss->ws_lvl = wsp->ws_lvl + 1; + rc = wordsplit_process_list (wss, 0); + if (rc) + { + wordsplit_free_nodes (wss); + return rc; + } 
+ if (finalize) + { + rc = wordsplit_finish (wss); + wordsplit_free_nodes (wss); + } + return rc; +} + +static void +_wsplt_seterr_sub (struct wordsplit *wsp, struct wordsplit *wss) +{ + if (wsp->ws_errno == WRDSE_USERERR) + free (wsp->ws_usererr); + wsp->ws_errno = wss->ws_errno; + if (wss->ws_errno == WRDSE_USERERR) + { + wsp->ws_usererr = wss->ws_usererr; + wss->ws_errno = WRDSE_EOF; + wss->ws_usererr = NULL; + } +} + static void wordsplit_init0 (struct wordsplit *wsp) { if (wsp->ws_flags & WRDSF_REUSE) { if (!(wsp->ws_flags & WRDSF_APPEND)) wordsplit_free_words (wsp); + wordsplit_clearerr (wsp); } else { wsp->ws_wordv = NULL; wsp->ws_wordc = 0; wsp->ws_wordn = 0; } wsp->ws_errno = 0; - wsp->ws_head = wsp->ws_tail = NULL; } +char wordsplit_c_escape_tab[] = "\\\\\"\"a\ab\bf\fn\nr\rt\tv\v"; + static int wordsplit_init (struct wordsplit *wsp, const char *input, size_t len, int flags) { wsp->ws_flags = flags; if (!(wsp->ws_flags & WRDSF_ALLOC_DIE)) wsp->ws_alloc_die = _wsplt_alloc_die; if (!(wsp->ws_flags & WRDSF_ERROR)) wsp->ws_error = _wsplt_error; - if (!(wsp->ws_flags & WRDSF_NOVAR) - && !(wsp->ws_flags & (WRDSF_ENV | WRDSF_GETVAR))) + if (!(wsp->ws_flags & WRDSF_NOVAR)) { - errno = EINVAL; - wsp->ws_errno = WRDSE_USAGE; - if (wsp->ws_flags & WRDSF_SHOWERR) - wordsplit_perror (wsp); - return wsp->ws_errno; + /* These will be initialized on first variable assignment */ + wsp->ws_envidx = wsp->ws_envsiz = 0; + wsp->ws_envbuf = NULL; } if (!(wsp->ws_flags & WRDSF_NOCMD)) { - errno = EINVAL; - wsp->ws_errno = WRDSE_NOSUPP; - if (wsp->ws_flags & WRDSF_SHOWERR) - wordsplit_perror (wsp); - return wsp->ws_errno; + if (!wsp->ws_command) + { + _wsplt_seterr (wsp, WRDSE_USAGE); + errno = EINVAL; + return wsp->ws_errno; + } } if (wsp->ws_flags & WRDSF_SHOWDBG) { if (!(wsp->ws_flags & WRDSF_DEBUG)) { @@ -160,16 +255,48 @@ wordsplit_init (struct wordsplit *wsp, const char *input, size_t len, if (!(wsp->ws_flags & WRDSF_COMMENT)) wsp->ws_comment = NULL; if (!(wsp->ws_flags & 
WRDSF_CLOSURE)) wsp->ws_closure = NULL; + if (!(wsp->ws_flags & WRDSF_OPTIONS)) + wsp->ws_options = 0; + + if (wsp->ws_flags & WRDSF_ESCAPE) + { + if (!wsp->ws_escape[WRDSX_WORD]) + wsp->ws_escape[WRDSX_WORD] = ""; + if (!wsp->ws_escape[WRDSX_QUOTE]) + wsp->ws_escape[WRDSX_QUOTE] = ""; + } + else + { + if (wsp->ws_flags & WRDSF_CESCAPES) + { + wsp->ws_escape[WRDSX_WORD] = wordsplit_c_escape_tab; + wsp->ws_escape[WRDSX_QUOTE] = wordsplit_c_escape_tab; + wsp->ws_options |= WRDSO_OESC_QUOTE | WRDSO_OESC_WORD + | WRDSO_XESC_QUOTE | WRDSO_XESC_WORD; + } + else + { + wsp->ws_escape[WRDSX_WORD] = ""; + wsp->ws_escape[WRDSX_QUOTE] = "\\\\\"\""; + wsp->ws_options |= WRDSO_BSKEEP_QUOTE; + } + } + wsp->ws_endp = 0; + wsp->ws_wordi = 0; + if (wsp->ws_flags & WRDSF_REUSE) + wordsplit_free_nodes (wsp); + wsp->ws_head = wsp->ws_tail = NULL; + wordsplit_init0 (wsp); - + return 0; } static int alloc_space (struct wordsplit *wsp, size_t count) { @@ -206,12 +333,13 @@ alloc_space (struct wordsplit *wsp, size_t count) #define _WSNF_NULL 0x01 /* null node (a noop) */ #define _WSNF_WORD 0x02 /* node contains word in v.word */ #define _WSNF_QUOTE 0x04 /* text is quoted */ #define _WSNF_NOEXPAND 0x08 /* text is not subject to expansion */ #define _WSNF_JOIN 0x10 /* node must be joined with the next node */ #define _WSNF_SEXP 0x20 /* is a sed expression */ +#define _WSNF_DELIM 0x40 /* node is a delimiter */ #define _WSNF_EMPTYOK 0x0100 /* special flag indicating that wordsplit_add_segm must add the segment even if it is empty */ struct wordsplit_node @@ -230,13 +358,13 @@ struct wordsplit_node } v; }; static const char * wsnode_flagstr (int flags) { - static char retbuf[6]; + static char retbuf[7]; char *p = retbuf; if (flags & _WSNF_WORD) *p++ = 'w'; else if (flags & _WSNF_NULL) *p++ = 'n'; @@ -255,12 +383,16 @@ wsnode_flagstr (int flags) else *p++ = '-'; if (flags & _WSNF_SEXP) *p++ = 's'; else *p++ = '-'; + if (flags & _WSNF_DELIM) + *p++ = 'd'; + else + *p++ = '-'; *p = 0; return 
retbuf; } static const char * wsnode_ptr (struct wordsplit *wsp, struct wordsplit_node *p) @@ -335,12 +467,20 @@ wsnode_remove (struct wordsplit *wsp, struct wordsplit_node *node) else wsp->ws_tail = node->prev; node->next = node->prev = NULL; } +static struct wordsplit_node * +wsnode_tail (struct wordsplit_node *p) +{ + while (p && p->next) + p = p->next; + return p; +} + static void wsnode_insert (struct wordsplit *wsp, struct wordsplit_node *node, struct wordsplit_node *anchor, int before) { if (!wsp->ws_head) { @@ -350,28 +490,30 @@ wsnode_insert (struct wordsplit *wsp, struct wordsplit_node *node, else if (before) { if (anchor->prev) wsnode_insert (wsp, node, anchor->prev, 0); else { + struct wordsplit_node *tail = wsnode_tail (node); node->prev = NULL; - node->next = anchor; - anchor->prev = node; + tail->next = anchor; + anchor->prev = tail; wsp->ws_head = node; } } else { struct wordsplit_node *p; + struct wordsplit_node *tail = wsnode_tail (node); p = anchor->next; if (p) - p->prev = node; + p->prev = tail; else - wsp->ws_tail = node; - node->next = p; + wsp->ws_tail = tail; + tail->next = p; node->prev = anchor; anchor->next = node; } } static int @@ -412,16 +554,18 @@ wordsplit_dump_nodes (struct wordsplit *wsp) struct wordsplit_node *p; int n = 0; for (p = wsp->ws_head, n = 0; p; p = p->next, n++) { if (p->flags & _WSNF_WORD) - wsp->ws_debug ("%4d: %p: %#04x (%s):%s;", + wsp->ws_debug ("(%02d) %4d: %p: %#04x (%s):%s;", + wsp->ws_lvl, n, p, p->flags, wsnode_flagstr (p->flags), p->v.word); else - wsp->ws_debug ("%4d: %p: %#04x (%s):%.*s;", + wsp->ws_debug ("(%02d) %4d: %p: %#04x (%s):%.*s;", + wsp->ws_lvl, n, p, p->flags, wsnode_flagstr (p->flags), (int) (p->v.segm.end - p->v.segm.beg), wsp->ws_input + p->v.segm.beg); } } @@ -430,12 +574,15 @@ coalesce_segment (struct wordsplit *wsp, struct wordsplit_node *node) { struct wordsplit_node *p, *end; size_t len = 0; char *buf, *cur; int stop; + if (!(node->flags & _WSNF_JOIN)) + return 0; + for (p = node; p && 
(p->flags & _WSNF_JOIN); p = p->next) { len += wsnode_len (p); } if (p) len += wsnode_len (p); @@ -454,12 +601,13 @@ coalesce_segment (struct wordsplit *wsp, struct wordsplit_node *node) size_t slen = wsnode_len (p); memcpy (cur, str, slen); cur += slen; if (p != node) { + node->flags |= p->flags & _WSNF_QUOTE; wsnode_remove (wsp, p); stop = p == end; wsnode_free (p); } p = next; } @@ -473,30 +621,29 @@ coalesce_segment (struct wordsplit *wsp, struct wordsplit_node *node) else node->flags |= _WSNF_WORD; node->v.word = buf; return 0; } +static void wordsplit_string_unquote_copy (struct wordsplit *ws, int inquote, + char *dst, const char *src, + size_t n); + static int wsnode_quoteremoval (struct wordsplit *wsp) { struct wordsplit_node *p; - void (*uqfn) (char *, const char *, size_t) = - (wsp->ws_flags & WRDSF_CESCAPES) ? - wordsplit_c_unquote_copy : wordsplit_sh_unquote_copy; for (p = wsp->ws_head; p; p = p->next) { const char *str = wsnode_ptr (wsp, p); size_t slen = wsnode_len (p); int unquote; if (wsp->ws_flags & WRDSF_QUOTE) - { - unquote = !(p->flags & _WSNF_NOEXPAND); - } + unquote = !(p->flags & _WSNF_NOEXPAND); else unquote = 0; if (unquote) { if (!(p->flags & _WSNF_WORD)) @@ -507,17 +654,14 @@ wsnode_quoteremoval (struct wordsplit *wsp) memcpy (newstr, str, slen); newstr[slen] = 0; p->v.word = newstr; p->flags |= _WSNF_WORD; } - if (wsp->ws_flags & WRDSF_ESCAPE) - wordsplit_general_unquote_copy (p->v.word, str, slen, - wsp->ws_escape); - else - uqfn (p->v.word, str, slen); + wordsplit_string_unquote_copy (wsp, p->flags & _WSNF_QUOTE, + p->v.word, str, slen); } } return 0; } static int @@ -532,48 +676,218 @@ wsnode_coalesce (struct wordsplit *wsp) return 1; } return 0; } static int +wsnode_tail_coalesce (struct wordsplit *wsp, struct wordsplit_node *p) +{ + if (p->next) + { + struct wordsplit_node *np = p; + while (np && np->next) + { + np->flags |= _WSNF_JOIN; + np = np->next; + } + if (coalesce_segment (wsp, p)) + return 1; + } + return 0; +} + +static 
size_t skip_delim (struct wordsplit *wsp); + +static int wordsplit_finish (struct wordsplit *wsp) { struct wordsplit_node *p; size_t n; + int delim; - n = 0; + /* Postprocess delimiters. It would be rather simple, if it weren't for + the incremental operation. - for (p = wsp->ws_head; p; p = p->next) - n++; + Nodes of type _WSNF_DELIM get inserted to the node list if either + WRDSF_RETURN_DELIMS flag or WRDSO_MAXWORDS option is set. + + The following cases should be distinguished: + + 1. If both WRDSF_SQUEEZE_DELIMS and WRDSF_RETURN_DELIMS are set, compress + any runs of similar delimiter nodes to a single node. The nodes are + 'similar' if they point to the same delimiter character. + + If WRDSO_MAXWORDS option is set, stop compressing when + ws_wordi + 1 == ws_maxwords, and coalesce the rest of nodes into + a single last node. + + 2. If WRDSO_MAXWORDS option is set, but WRDSF_RETURN_DELIMS is not, + remove any delimiter nodes. Stop operation when + ws_wordi + 1 == ws_maxwords, and coalesce the rest of nodes into + a single last node. + + 3. If incremental operation is in progress, restart the loop any time + a delimiter node is about to be returned, unless WRDSF_RETURN_DELIMS + is set. 
+ */ + again: + delim = 0; /* Delimiter being processed (if any) */ + n = 0; /* Number of words processed so far */ + p = wsp->ws_head; /* Current node */ + + while (p) + { + struct wordsplit_node *next = p->next; + if (p->flags & _WSNF_DELIM) + { + if (wsp->ws_flags & WRDSF_RETURN_DELIMS) + { + if (wsp->ws_flags & WRDSF_SQUEEZE_DELIMS) + { + char const *s = wsnode_ptr (wsp, p); + if (delim) + { + if (delim == *s) + { + wsnode_remove (wsp, p); + p = next; + continue; + } + else + { + delim = 0; + n++; /* Count this node; it will be returned */ + } + } + else + { + delim = *s; + p = next; + continue; + } + } + } + else if (wsp->ws_options & WRDSO_MAXWORDS) + { + wsnode_remove (wsp, p); + p = next; + continue; + } + } + else + { + if (delim) + { + /* Last node was a delimiter or a compressed run of delimiters; + Count it, and clear the delimiter marker */ + n++; + delim = 0; + } + if (wsp->ws_options & WRDSO_MAXWORDS) + { + if (wsp->ws_wordi + n + 1 == wsp->ws_maxwords) + break; + } + } + n++; + if (wsp->ws_flags & WRDSF_INCREMENTAL) + p = NULL; /* Break the loop */ + else + p = next; + } + + if (p) + { + /* We're here if WRDSO_MAXWORDS is in effect and wsp->ws_maxwords + words have already been collected. Reconstruct a single final + node from the remaining nodes. |