Fix default escape settings. [HEAD, master]

* wordsplit.c (wordsplit_escape): New global. (wordsplit_init): Backslash interpretation is disabled if not explicitly configured. (wsnode_quoteremoval): Unquote unless _WSNF_NOEXPAND is set. (scan_word): Fix backslash handling if WRDSF_QUOTE flags are set. * wsp.c: Fix option handling. * wordsplit.at: Test handling of C-style escapes. * README: Document changes. * wordsplit.3: Likewise.
author: Sergey Poznyakoff <gray@gnu.org> 2023-06-21 16:01:32 +0300
committer: Sergey Poznyakoff <gray@gnu.org> 2023-06-22 17:12:46 +0300
commit: e2f0c64db92c3865cfe606959885221e55f79ace (patch)
tree: 943ed5e04b339a0461cd18dff6349edd74072a95
parent: 403b1c769fbb85d29f110742d9989eed9c2daac2 (diff)
download: wordsplit-master.tar.gz
wordsplit-master.tar.bz2
6 files changed, 147 insertions, 79 deletions
diff --git a/README b/README
index ecdcda2..2d315ae 100644
--- a/README
+++ b/README
@@ -1,10 +1,13 @@
+README file for the wordsplit library
+See the end of file for copying conditions.
+
 * Overview
 
 This package provides a set of C functions for parsing input strings.
-Default parsing rules are are similar to those used in Bourne shell.
+Default parsing rules are similar to those used in Bourne shell.
 This includes tilde expansion, variable expansion, quote removal, word
 splitting, command substitution, and path expansion.  Parsing is
 controlled by a number of settings which allow the caller to alter
 processing at each of these phases or even to disable any of them.
 Thus, wordsplit can be used for parsing inputs in different formats,
 from simple character-delimited entries, as in /etc/passwd, and up to
@@ -43,21 +46,21 @@ The package is designed as a drop-in facility for use in larger
 programs.  It consists of the following files:
 
   wordsplit.h   - Interface header.
   wordsplit.c   - Main source file.
   wordsplit.3   - Documentation.
 
-For most uses, you will need only these three.  The rest of files
+For most uses, you will need only these three.  The remaining files
 are for building the autotest-based testsuite:
 
   wsp.c         - Auxiliary test program.
   wordsplit.at  - The source for the testsuite.
 
 * Incorporating wordsplit into your project
 
-The project is designed to be used as a git submodule.  To incorporate
+Wordsplit is designed to be used as a git submodule.  To incorporate
 it into your project, first select the location for the wordsplit
 directory within your project.  Then add the submodule at this
 location.  The rest is quite straightforward: you need to add
 wordsplit.c to your sources and add both wordsplit.c and wordsplit.h
 to the distributed files.
 
@@ -114,13 +117,13 @@ Modify the VPATH variable in your Makefile.am:
 Add wordsplit.c to the nodist_program_SOURCES variable:
 
   nodist_program_SOURCES = wordsplit.c
 
 The nodist_ prefix is necessary to prevent Make from trying to
 distribute this file from the current directory (where it doesn't
-exist of course).  During compilation it will be located using VPATH.
+exist, of course).  During compilation it will be located using VPATH.
 
 Finally, add both wordsplit/wordsplit.c and wordsplit/wordsplit.h to
 the EXTRA_DIST variable and modify AM_CPPFLAGS as shown in the
 previous section.
 
 An example Makefile.am:
@@ -210,24 +213,24 @@ Then, add the following fragment to build the auxiliary files:
   nodist_wsp_SOURCES = wsp.c
   wsp.o: $(srcdir)/wordsplit-version.h
   VPATH = $(srcdir):$(top_srcdir)/wordsplit
 
 * History
 
-First version of wordsplit appeared in March 2009 as a part of the
+First version of wordsplit appeared in March 2009 as part of the
 Wydawca[1] project.  Its main usage was to assist in configuration
 file parsing.  The parser subsystem proved to be quite useful and
 soon evolved into a separate project - Grecs[2].  This package had been
 since used (as a git submodule) in a number of other projects, such as
 GNU Dico[3] and Direvent[4], to name a few.
 
-In 2010 the wordsplit sources were incorporated to the GNU
-Mailutils[5] package, where they replaced the obsolete argcv module.
-Mailutils uses its own configuration package, which meant that using
-Grecs was not expedient.  Therefore the sources had been exported from
-Grecs. Since then both Mailutils and Grecs versions are periodically
+In 2010 wordsplit sources were incorporated into the GNU Mailutils[5]
+package, where they replaced the obsolete argcv module.  Mailutils
+uses its own configuration package, which meant that using Grecs was
+not expedient.  Therefore the sources had been exported from
+Grecs. Since then both Mailutils and Grecs versions have been periodically
 synchronized.
 
 Several other projects, such as GNU Rush[6] and fileserv[7], followed
 suit.  It was therefore decided that it would be advisable to
 have wordsplit as a separate package which could be easily included in
 another project without incurring unnecessary overhead.
@@ -272,13 +275,13 @@ the following information:
   2. Input string.
   3. Produced output.
   4. Expected output.
 
 * Copying
 
-Copyright (C) 2009-2021 Sergey Poznyakoff
+Copyright (C) 2009-2023 Sergey Poznyakoff
 
 Permission is granted to anyone to make or distribute verbatim copies
 of this document as received, in any medium, provided that the
 copyright notice and this permission notice are preserved,
 thus giving the recipient permission to redistribute in turn.
 
diff --git a/wordsplit.3 b/wordsplit.3
index 401cad8..4f86f1a 100644
--- a/wordsplit.3
+++ b/wordsplit.3
@@ -11,13 +11,13 @@
 .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 .\" GNU General Public License for more details.
 .\"
 .\" You should have received a copy of the GNU General Public License
 .\" along with wordsplit.  If not, see <http://www.gnu.org/licenses/>.
 .\"
-.TH WORDSPLIT 3 "July 24, 2019" "WORDSPLIT" "Wordsplit User Reference"
+.TH WORDSPLIT 3 "June 22, 2023" "WORDSPLIT" "Wordsplit User Reference"
 .SH NAME
 wordsplit \- split string into words 
 .SH SYNOPSIS
 .B #include <wordsplit.h>
 .sp
 \fBint wordsplit (const char *\fIs\fB,\
@@ -296,22 +296,21 @@ Recognition of single quoted strings is enabled by the
 \fBWRDSF_SQUOTE\fR flag.  Recognition of double quotes is enabled by
 the \fBWRDSF_DQUOTE\fR flag.  The macro \fBWRDSF_QUOTE\fR enables both.
 .SS Backslash interpretation
 Backslash interpretation translates unquoted
 .I escape sequences
 into corresponding characters.  An escape sequence is a backslash followed
-by one or more characters.  By default, each sequence \fB\\\fIC\fR
-appearing in unquoted words is replaced with the character \fIC\fR.  In
-doubly-quoted strings, two backslash sequences are recognized:
-\fB\\\\\fR translates to a single backslash, and \fB\\\(dq\fR
-translates to a double-quote.
-.PP
-Two flags are provided to modify this behavior.  If
-.I WRDSF_CESCAPES
-flag is set, the following escape sequences are recognized:
-.sp
+by one or more characters.  By default, that is, if no flags are
+supplied, no escape sequences are defined, and each sequence
+\fB\\\fIC\fR is reproduced verbatim.
+.PP
+There are several ways to enable backslash interpretation and to
+define escape sequences.  The simplest one is to use the
+\fBWRDSF_CESCAPES\fR flag.  This flag defines the C-like escape
+sequences:
+.PP
 .nf
 .ta 8n 18n 42n
 .ul
 	Sequence	Expansion	ASCII
 	\fB\\\\\fR	\fB\\\fR	134
 	\fB\\\(dq\fR	\fB\(dq\fR	042
@@ -326,25 +325,65 @@ flag is set, the following escape sequences are recognized:
 .sp
 The sequence \fB\\x\fINN\fR or \fB\\X\fINN\fR, where \fINN\fR stands
 for a two-digit hex number is replaced with ASCII character \fINN\fR.
 The sequence \fB\\0\fINNN\fR, where \fINNN\fR stands for a three-digit
 octal number is replaced with ASCII character whose code is \fINNN\fR.
 .PP
-The \fBWRDSF_ESCAPE\fR flag allows the caller to customize escape
-sequences.  If it is set, the \fBws_escape\fR member must be
+Additionally, outside of quoted strings (if these are enabled by the
+use of \fBWRDSF_DQUOTE\fR flag) backslash character can be used to
+escape horizontal whitespace: horizontal space (ASCII 32) and
+tab (ASCII 9) characters.
+.PP
+The \fBWRDSF_CESCAPES\fR bit is included in the default flag
+set \fBWRDSF_DEFFLAGS\fR.
+.PP
+The \fBWRDSF_ESCAPE\fR flag provides a more elaborate way of defining
+escape sequences.  If it is set, the \fBws_escape\fR member must be
 initialized.  This member provides escape tables for unquoted words
-(\fBws_escape[0]\fR) and quoted strings (\fBws_escape[1]\fR).  Each
-table is a string consisting of an even number of characters.  In each
-pair of characters, the first one is a character that can appear after
-backslash, and the following one is its translation.  For example, the
-above table of C escapes is represented as 
-\fB\(dq\\\\\\\\"\\"a\\ab\\bf\\fn\\nr\\rt\\tv\\v\(dq\fR.
-.PP
-It is valid to initialize \fBws_escape\fR elements to zero.  In this
+(\fBws_escape[WRDSX_WORD]\fR) and quoted strings
+(\fBws_escape[WRDSX_QUOTE]\fR).  Each table is a string consisting of
+an even number of characters.  In each pair of characters, the first
+one is a character that can appear after backslash, and the following
+one is its translation.  For example, the table of C escapes is
+represented as follows:
+.TP
+\fB\(dq\\\\\\\\"\\"a\\ab\\bf\\fn\\nr\\rt\\tv\\v\(dq\fR
+.PP
+It is valid to initialize \fBws_escape\fR elements to NULL.  In this
 case, no backslash translation occurs.
 .PP
+For convenience, the global variable
+.B wordsplit_escape
+defines several most often used escape translation tables:
+.PP
+.EX
+extern char const *wordsplit_escape[];
+.EE
+.PP
+It is indexed by the following constants:
+.TP
+.B WS_ESC_C
+C-style escapes, the definition of which is shown above.  This is the
+translation table that is used within quoted strings when
+.B WRDSF_CESCAPES
+is in effect.
+.TP
+.B WS_ESC_C_WS
+The \fBWS_ESC_C\fR table augmented by two entries: for horizontal tab
+character and whitespace.  This is the table that is used for unquoted
+words when
+.B WRDSF_CESCAPES
+is in effect.
+.TP
+.B WS_ESC_DQ
+Backslash character escapes double-quote and itself.  Useful for
+handling doubly-quoted strings in various Internet protocols.
+.TP
+.B WS_ESC_DQ_WS
+Escape double-quote, backslash, horizontal tab and whitespace characters.
+.PP
 Interpretation of octal and hex escapes is controlled by the following
 bits in \fBws_options\fR:
 .TP
 .B WRDSO_BSKEEP_WORD
 When an unrecognized escape sequence is encountered in a word,
 preserve it on output.  If that bit is not set, the backslash is
diff --git a/wordsplit.at b/wordsplit.at
index 293bc9a..38114c5 100644
--- a/wordsplit.at
+++ b/wordsplit.at
@@ -1,8 +1,8 @@
 # Test suite for wordsplit -*- Autotest -*-
-# Copyright (C) 2014-2021 Sergey Poznyakoff
+# Copyright (C) 2014-2023 Sergey Poznyakoff
 #
 # Wordsplit is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 3, or (at your option)
 # any later version.
 #
@@ -520,32 +520,41 @@ TESTWSP([sed expressions],[],[-sed],
 2: arg2
 TOTAL: 3
 ])
 
 WSPGROUP()
 
-TESTWSP([C escapes on],[wcp-c-escape],[-cescapes],
-[a\ttab form\ffeed and new\nline],
+TESTWSP([C escapes on],[wcp-c-escape],[-nodefault -dquote -cescapes],
+["a\ttab" "form\ffeed" and "new\nline"],
 [NF: 4
 0: a\ttab
 1: form\ffeed
 2: and
 3: new\nline
 TOTAL: 4
 ])
 
-TESTWSP([C escapes off],[wcp-c-escape-off],[-nocescapes],
-[a\ttab form\ffeed and new\nline],
+TESTWSP([C escapes off],[wcp-c-escape-off],[-nodefault -dquote -nocescapes],
+["a\ttab" "form\ffeed" and "new\nline"],
 [NF: 4
 0: attab
 1: formffeed
 2: and
 3: newnline
 TOTAL: 4
 ])
 
+TESTWSP([C escapes on (unquoted)],[wcp-c-escape],[-nodefault -cescapes],
+[a\ttab \"form\ffeed\" and\ new\\nline],
+[NF: 3
+0: a\ttab
+1: "\"form\ffeed\""
+2: "and new\\nline"
+TOTAL: 3
+])
+
 TESTWSP([ws elimination],[wsp-ws-elim],[-delim ' ()' -ws -return_delims],
 [( list  items  )],
 [NF: 4
 0: (
 1: list
 2: items
diff --git a/wordsplit.c b/wordsplit.c
index aca63df..9139e85 100644
--- a/wordsplit.c
+++ b/wordsplit.c
@@ -1,8 +1,8 @@
 /* wordsplit - a word splitter
-   Copyright (C) 2009-2021 Sergey Poznyakoff
+   Copyright (C) 2009-2023 Sergey Poznyakoff
 
    This program is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published by the
    Free Software Foundation; either version 3 of the License, or (at your
    option) any later version.
 
@@ -64,12 +64,14 @@ is_name_char (struct wordsplit *wsp, int c)
 #define WSP_RETURN_DELIMS(wsp) \
  ((wsp)->ws_flags & WRDSF_RETURN_DELIMS || ((wsp)->ws_options & WRDSO_MAXWORDS))
 
 #define to_num(c) \
   (ISDIGIT(c) ? c - '0' : (ISXDIGIT(c) ? toupper(c) - 'A' + 10 : 255 ))
 
+static int wsplt_unquote_char (const char *transtab, int c);
+
 #define ALLOC_INIT 128
 #define ALLOC_INCR 128
 
 static void
 _wsplt_alloc_die (struct wordsplit *wsp)
 {
@@ -244,13 +246,22 @@ wordsplit_init0 (struct wordsplit *wsp)
       wsp->ws_wordn = 0;
     }
 
   wsp->ws_errno = 0;
 }
 
-char wordsplit_c_escape_tab[] = "\\\\\"\"a\ab\bf\fn\nr\rt\tv\v";
+char const *wordsplit_escape[] = {
+	/* C-style escapes, for quoted strings */
+	[WS_ESC_C]     = "\\\\\"\"a\ab\bf\fn\nr\rt\tv\v",
+	/* C-style escapes, outside of quoted strings */
+	[WS_ESC_C_WS]  = "\\\\\"\"a\ab\bf\fn\nr\rt\tv\v  \t\t",
+	/* Escape double-quote and backslash. */
+	[WS_ESC_DQ]    = "\\\\\"\"",
+	/* Escape double-quote, backslash, and whitespace. */
+	[WS_ESC_DQ_WS] = "\\\\\"\"  \t\t"
+};
 
 static int
 wordsplit_init (struct wordsplit *wsp, const char *input, size_t len,
 		int flags)
 {
   wsp->ws_flags = flags;
@@ -311,27 +322,23 @@ wordsplit_init (struct wordsplit *wsp, const char *input, size_t len,
     {
       if (!wsp->ws_escape[WRDSX_WORD])
 	wsp->ws_escape[WRDSX_WORD] = "";
       if (!wsp->ws_escape[WRDSX_QUOTE])
 	wsp->ws_escape[WRDSX_QUOTE] = "";
     }
+  else if (wsp->ws_flags & WRDSF_CESCAPES)
+    {
+      wsp->ws_escape[WRDSX_WORD] = wordsplit_escape[WS_ESC_C_WS];
+      wsp->ws_escape[WRDSX_QUOTE] = wordsplit_escape[WS_ESC_C];
+      wsp->ws_options |= WRDSO_OESC_QUOTE | WRDSO_OESC_WORD
+	              | WRDSO_XESC_QUOTE | WRDSO_XESC_WORD;
+    }
   else
     {
-      if (wsp->ws_flags & WRDSF_CESCAPES)
-	{
-	  wsp->ws_escape[WRDSX_WORD] = wordsplit_c_escape_tab;
-	  wsp->ws_escape[WRDSX_QUOTE] = wordsplit_c_escape_tab;
-	  wsp->ws_options |= WRDSO_OESC_QUOTE | WRDSO_OESC_WORD
-			     | WRDSO_XESC_QUOTE | WRDSO_XESC_WORD;
-	}
-      else
-	{
-	  wsp->ws_escape[WRDSX_WORD] = "";
-	  wsp->ws_escape[WRDSX_QUOTE] = "\\\\\"\"";
-	  wsp->ws_options |= WRDSO_BSKEEP_QUOTE;
-	}
+      wsp->ws_escape[WRDSX_WORD] = "";
+      wsp->ws_escape[WRDSX_QUOTE] = "";
     }
 
   if (!(wsp->ws_options & WRDSO_PARAMV))
     {
       wsp->ws_paramv = NULL;
       wsp->ws_paramc = 0;
@@ -697,20 +704,14 @@ wsnode_quoteremoval (struct wordsplit *wsp)
   struct wordsplit_node *p;
 
   for (p = wsp->ws_head; p; p = p->next)
     {
       const char *str = wsnode_ptr (wsp, p);
       size_t slen = wsnode_len (p);
-      int unquote;
-
-      if (wsp->ws_flags & WRDSF_QUOTE)
-	unquote = !(p->flags & _WSNF_NOEXPAND);
-      else
-	unquote = 0;
 
-      if (unquote)
+      if (!(p->flags & _WSNF_NOEXPAND))
 	{
 	  if (!(p->flags & _WSNF_WORD))
 	    {
 	      char *newstr = malloc (slen + 1);
 	      if (!newstr)
 		return _wsplt_nomem (wsp);
@@ -2300,36 +2301,40 @@ scan_word (struct wordsplit *wsp, size_t start, int consume_all)
 	      if (wordsplit_add_segm (wsp, start, i, 0))
 		return _WRDS_ERR;
 	      wsp->ws_endp = j;
 	      return _WRDS_OK;
 	    }
 
-	  if (wsp->ws_flags & WRDSF_QUOTE)
+	  if (command[i] == '\\')
 	    {
-	      if (command[i] == '\\')
+	      if (i + 1 == len)
 		{
-		  if (++i == len)
-		    break;
 		  i++;
-		  continue;
+		  break;
 		}
-
-	      if (((wsp->ws_flags & WRDSF_SQUOTE) && command[i] == '\'') ||
-		  ((wsp->ws_flags & WRDSF_DQUOTE) && command[i] == '"'))
+	      if (wsplt_unquote_char (wsp->ws_escape[WRDSX_WORD], command[i+1]))
 		{
-		  if (join && wsp->ws_tail)
-		    wsp->ws_tail->flags |= _WSNF_JOIN;
-		  if (wordsplit_add_segm (wsp, start, i, _WSNF_JOIN))
-		    return _WRDS_ERR;
-		  if (scan_qstring (wsp, i, &i))
-		    return _WRDS_ERR;
-		  start = i + 1;
-		  join = 1;
+		  i += 2;
+		  continue;
 		}
 	    }
 
+	  if ((wsp->ws_flags & WRDSF_QUOTE) &&
+	      (((wsp->ws_flags & WRDSF_SQUOTE) && command[i] == '\'') ||
+	       ((wsp->ws_flags & WRDSF_DQUOTE) && command[i] == '"')))
+	    {
+	      if (join && wsp->ws_tail)
+		wsp->ws_tail->flags |= _WSNF_JOIN;
+	      if (wordsplit_add_segm (wsp, start, i, _WSNF_JOIN))
+		return _WRDS_ERR;
+	      if (scan_qstring (wsp, i, &i))
+		return _WRDS_ERR;
+	      start = i + 1;
+	      join = 1;
+	    }
+
 	  if (command[i] == '$')
 	    {
 	      if ((!(wsp->ws_flags & WRDSF_NOVAR)
 		   || (wsp->ws_options & WRDSO_NOVARSPLIT))
 		  && command[i+1] == '{'
 		  && find_closing_paren (command, i + 2, len, &i, "{}") == 0)
@@ -2446,19 +2451,19 @@ wsplt_quote_char (const char *transtab, int c)
   return 0;
 }
 
 int
 wordsplit_c_unquote_char (int c)
 {
-  return wsplt_unquote_char (wordsplit_c_escape_tab, c);
+  return wsplt_unquote_char (wordsplit_escape[WS_ESC_C], c);
 }
 
 int
 wordsplit_c_quote_char (int c)
 {
-  return wsplt_quote_char (wordsplit_c_escape_tab, c);
+  return wsplt_quote_char (wordsplit_escape[WS_ESC_C], c);
 }
 
 void
 wordsplit_string_unquote_copy (struct wordsplit *ws, int inquote,
 			       char *dst, const char *src, size_t n)
 {
diff --git a/wordsplit.h b/wordsplit.h
index 794207a..768df34 100644
--- a/wordsplit.h
+++ b/wordsplit.h
@@ -1,8 +1,8 @@
 /* wordsplit - a word splitter
-   Copyright (C) 2009-2021 Sergey Poznyakoff
+   Copyright (C) 2009-2023 Sergey Poznyakoff
 
    This program is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published by the
    Free Software Foundation; either version 3 of the License, or (at your
    option) any later version.
 
@@ -306,7 +306,17 @@ void wordsplit_c_quote_copy (char *dst, const char *src, int quote_hex);
 
 void wordsplit_perror (wordsplit_t *ws);
 const char *wordsplit_strerror (wordsplit_t *ws);
 
 void wordsplit_clearerr (wordsplit_t *ws);
 
+enum
+{
+  WS_ESC_C,     /* C-style escapes, for quoted strings */
+  WS_ESC_C_WS,  /* C-style escapes plus whitespace.  For unquoted words */
+  WS_ESC_DQ,    /* Escape double-quote and backslash. */
+  WS_ESC_DQ_WS, /* Escape double-quote, backslash, and whitespace. */
+};
+
+extern char const *wordsplit_escape[];
+
 #endif
diff --git a/wsp.c b/wsp.c
index 38420b3..144f728 100644
--- a/wsp.c
+++ b/wsp.c
@@ -1,8 +1,8 @@
 /* wsp - test program for wordsplit
-   Copyright (C) 2014-2021 Sergey Poznyakoff
+   Copyright (C) 2014-2023 Sergey Poznyakoff
 
    Wordsplit is free software; you can redistribute it and/or modify it
    under the terms of the GNU General Public License as published by the
    Free Software Foundation; either version 3 of the License, or (at your
    option) any later version.
 
@@ -218,12 +218,14 @@ getwsopt (int argc, char **argv, struct wsopt *wso, struct wsclosure *wsc)
 		  fprintf (stderr, "%s: missing arguments for -%s\n",
 			   progname, opt);
 		  exit (1);
 		}
 	      arg = argv[wsoptind++];
 	    }
+          else
+            arg = NULL;
 	  wso->setfn (wso->tok, negate, arg, wsc);
 	}
       return 0;
     }
 
   fprintf (stderr, "%s: unrecognized option: -%s\n",
author	Sergey Poznyakoff <gray@gnu.org>	2023-06-21 16:01:32 +0300
committer	Sergey Poznyakoff <gray@gnu.org>	2023-06-22 17:12:46 +0300
commit	e2f0c64db92c3865cfe606959885221e55f79ace (patch)
tree	943ed5e04b339a0461cd18dff6349edd74072a95
parent	403b1c769fbb85d29f110742d9989eed9c2daac2 (diff)
download	wordsplit-master.tar.gz wordsplit-master.tar.bz2