Relax syntax requirements for MIME structured headers.

* libmailutils/base/ctparse.c (content_type_parse): Rewrite the value splitting on type and subtype. Allow for optional whitespace at both sides of "/". Both type and subtype can contain arbitrary characters (except for "/"). This is not right according to RFC 2045, but reportedly such cases exist in old mails.
* libmailutils/mailbox/bodystruct.c (bodystructure_fill): Fix parsing of the Content-Type header. Fix storing the pointer to an automatic variable in assoc object. Unfold the Content-Disposition value.
* libmailutils/mime/mimehdr.c (_mime_header_parse): Return in pvalue entire prefix part up to the first semicolon, with leading and trailing whitespace removed. Allow for both output parameters to be NULL.
* libmailutils/tests/content-type.at: Add new test case.
* libmailutils/tests/conttype.c: Ignore empty lines in input.
* libmailutils/tests/mimehdr.at: Change mimehdr16 and mimehdr17 tests: this syntax is accepted by the relaxed rules of the modified parser.
author: Sergey Poznyakoff <gray@gnu.org> 2020-07-23 13:42:23 +0300
committer: Sergey Poznyakoff <gray@gnu.org> 2020-07-23 14:07:58 +0300
commit: ff9c0a396c14c219532dd667d49ebc7a3f376f9b (patch)
tree: a0d9a259072018dd97b936042df7d89daa3c8b41 /libmailutils
parent: 78c28187ffb8f6ae8aa0662e6be193d23ae7808b (diff)
download: mailutils-ff9c0a396c14c219532dd667d49ebc7a3f376f9b.tar.gz
mailutils-ff9c0a396c14c219532dd667d49ebc7a3f376f9b.tar.bz2
6 files changed, 134 insertions, 94 deletions
diff --git a/libmailutils/base/ctparse.c b/libmailutils/base/ctparse.c
index a6c93f9f1..b5bed3e86 100644
--- a/libmailutils/base/ctparse.c
+++ b/libmailutils/base/ctparse.c
@@ -29,45 +29,67 @@
 #include <mailutils/cctype.h>
 #include <mailutils/cstr.h>
 
-  
+/* Parse the content type header value in INPUT.  If CHARSET is not
+   NULL, convert textual parameters to this charset.
+
+   Store the result in CT.
+
+   In case of error, CT is left partially constructed.  The caller
+   must free it.
+   
+   Parsing of the type/subtype value is relaxed: any characters are
+   allowed in either part (except for "/", which can't appear in type).
+   Although RFC 2045 forbids that, mails with such content types reportedly
+   exist (see conversation with Karl Berry on 2020-07-21, particularly
+   <202007220115.06M1FuTh001462@freefriends.org> and my reply
+   <20200722133251.8412@ulysses.gnu.org.ua>).
+
+   Type must not be empty, but empty subtype is allowed.
+*/   
 static int
 content_type_parse (const char *input, const char *charset,
 		    mu_content_type_t ct)
 {
   int rc;
   char *value, *p;
-  
+
   rc = mu_mime_header_parse (input, charset, &value, &ct->param);
   if (rc)
     return rc;
+
   p = strchr (value, '/');
   if (p)
     {
       size_t len = p - value;
+      while (len > 0 && mu_isspace (value[len-1]))
+	len--;
+      if (len == 0)
+	{
+	  rc = MU_ERR_PARSE;
+	  goto end;
+	}
+      
+      p = mu_str_skip_class (p + 1, MU_CTYPE_SPACE);
+      
       ct->type = malloc (len + 1);
       if (!ct->type)
 	{
 	  rc = errno;
-	  free (value);
-	  return rc;
+	  goto end;
 	}
+      
       memcpy (ct->type, value, len);
       ct->type[len] = 0;
 
-      ct->subtype = strdup (p + 1);
-      free (value);
-
+      ct->subtype = strdup (p);
       if (!ct->subtype)
-	{
-	  rc = errno;
-	  return rc;
-	}
+	rc = errno;
     }
   else
-    {
-      return MU_ERR_PARSE;
-    }
-  return 0;
+    rc = MU_ERR_PARSE;
+ end:
+  free (value);
+  return rc;
 }
 
 int
diff --git a/libmailutils/mailbox/bodystruct.c b/libmailutils/mailbox/bodystruct.c
index cffe503d9..2d48205a9 100644
--- a/libmailutils/mailbox/bodystruct.c
+++ b/libmailutils/mailbox/bodystruct.c
@@ -31,6 +31,7 @@
 #include <mailutils/nls.h>
 #include <mailutils/cstr.h>
 #include <mailutils/body.h>
+#include <mailutils/util.h>
 
 void
 mu_list_free_bodystructure (void *item)
@@ -94,59 +95,48 @@ static int
 bodystructure_fill (mu_message_t msg, struct mu_bodystructure *bs)
 {
   mu_header_t header = NULL;
-  const char *buffer = NULL;
+  char *buffer = NULL;
   mu_body_t body = NULL;
-  int rc;
   int is_multipart = 0;
+  int rc;
 
   rc = mu_message_get_header (msg, &header);
   if (rc)
     return rc;
   
-  if (mu_header_sget_value (header, MU_HEADER_CONTENT_TYPE, &buffer) == 0)
+  if (mu_header_aget_value_unfold (header, MU_HEADER_CONTENT_TYPE, &buffer) == 0)
     {
-      char *value;
-      char *p;
-      size_t len;
-      
-      rc = mu_mime_header_parse (buffer, "UTF-8", &value, &bs->body_param);
-      if (rc)
-	return rc;
-
-      len = strcspn (value, "/");
+      mu_content_type_t ct;
 
-      if (mu_c_strcasecmp (value, "MESSAGE/RFC822") == 0)
-        bs->body_message_type = mu_message_rfc822;
-      else if (mu_c_strncasecmp (value, "TEXT", len) == 0)
-        bs->body_message_type = mu_message_text;
-
-      p = malloc (len + 1);
-      if (!p)
-	return ENOMEM;
-      memcpy (p, value, len);
-      p[len] = 0;
-      
-      bs->body_type = p;
-      mu_strupper (bs->body_type);
-      if (value[len])
+      rc = mu_content_type_parse (buffer, "UTF-8", &ct);
+      if (rc == 0)
 	{
-	  bs->body_subtype = strdup (value + len + 1);
-	  if (!bs->body_subtype)
-	    return ENOMEM;
+	  if (mu_c_strcasecmp (ct->type, "MESSAGE") == 0 &&
+	      mu_c_strcasecmp (ct->subtype, "RFC822") == 0)
+	    bs->body_message_type = mu_message_rfc822;
+	  else if (mu_c_strcasecmp (ct->type, "TEXT") == 0)
+	    bs->body_message_type = mu_message_text;
+	  
+	  bs->body_type = ct->type;
+	  ct->type = NULL;
+	  mu_strupper (bs->body_type);
+	  bs->body_subtype = ct->subtype;
+	  ct->subtype = NULL;
 	  mu_strupper (bs->body_subtype);
-	}
+	  bs->body_param = ct->param;
+	  ct->param = NULL;
+	  mu_content_type_destroy (&ct);
       
-      /* body parameter parenthesized list: Content-type attributes */
-
-      rc = mu_message_is_multipart (msg, &is_multipart);
-      if (rc)
-	return rc;
-      if (is_multipart)
-	bs->body_message_type = mu_message_multipart;
+	  /* body parameter parenthesized list: Content-type attributes */
+	  mu_message_is_multipart (msg, &is_multipart);
+	  if (is_multipart)
+	    bs->body_message_type = mu_message_multipart;
+	}
+      free (buffer);
     }
   else
     {
-      struct mu_mime_param param;
+      struct mu_mime_param *param;
       
       /* Default? If Content-Type is not present consider as text/plain.  */
       bs->body_type = strdup ("TEXT");
@@ -161,21 +151,22 @@ bodystructure_fill (mu_message_t msg, struct mu_bodystructure *bs)
       rc = mu_mime_param_assoc_create (&bs->body_param);
       if (rc)
 	return rc;
-      memset (&param, 0, sizeof (param));
-      param.value = strdup ("US-ASCII");
-      if (!param.value)
-        {
-          free (bs->body_type);
-          free (bs->body_subtype);
-          return ENOMEM;
-        }
-      rc = mu_assoc_install (bs->body_param, "CHARSET", &param);
-      if (rc)
+      param = calloc (1, sizeof (*param));
+      if (param && (param->value = strdup ("US-ASCII")) != NULL)
 	{
-	  free (param.value);
-	  return rc;
+	  rc = mu_assoc_install (bs->body_param, "CHARSET", param);
+	  if (rc)
+	    {
+	      mu_mime_param_free (param);
+	      return rc;
+	    }
+	  bs->body_message_type = mu_message_text;
 	}
-      bs->body_message_type = mu_message_text;
+      else
+	{
+	  free (param);
+	  return ENOMEM;
+	} 
     }
 
   if (is_multipart)
@@ -283,12 +274,13 @@ bodystructure_fill (mu_message_t msg, struct mu_bodystructure *bs)
     return rc;
   
   /* body disposition: Content-Disposition.  */
-  rc = mu_header_sget_value (header, MU_HEADER_CONTENT_DISPOSITION,
-			     &buffer);
+  rc = mu_header_aget_value_unfold (header, MU_HEADER_CONTENT_DISPOSITION,
+				    &buffer);
   if (rc == 0)
     {
       rc = mu_mime_header_parse (buffer, "UTF-8", &bs->body_disposition,
 				 &bs->body_disp_param);
+      free (buffer);
       if (rc)
 	return rc;
     }
diff --git a/libmailutils/mime/mimehdr.c b/libmailutils/mime/mimehdr.c
index ec883329b..b057a48a5 100644
--- a/libmailutils/mime/mimehdr.c
+++ b/libmailutils/mime/mimehdr.c
@@ -570,8 +570,14 @@ parse_param (struct mu_wordsplit *ws, size_t *pi, mu_assoc_t assoc,
              success.
     ASSOC  - Unless NULL, parameters are stored here.	     
 
-   Either PVALUE or ASSOC (but not both) can be NULL, meaning that the
-   corresponding data are of no interest to the caller.
+   Both output pointers can be NULL, meaning that the corresponding data
+   are of no interest to the caller.
+
+   The value returned in PVALUE is the initial part of TEXT up to the
+   start of parameters (i.e. to the first semicolon) with leading and
+   trailing whitespace removed.  No other syntactic checking is done on
+   the value.  It is the responsibility of the caller to verify that it
+   complies to the syntax of the particular header.
 */
 static int
 _mime_header_parse (const char *text, char **pvalue,
@@ -581,7 +587,28 @@ _mime_header_parse (const char *text, char **pvalue,
   struct mu_wordsplit ws;
   struct param_continuation cont;
   size_t i;
+  char *value = NULL;
+  size_t val_len;
 
+  val_len = strcspn (text, ";");
+  if (pvalue)
+    {
+      value = malloc (val_len + 1);
+      if (!value)
+	return ENOMEM;
+      memcpy (value, text, val_len);
+      value[val_len] = 0;
+      mu_rtrim_class (value, MU_CTYPE_SPACE);
+      mu_ltrim_class (value, MU_CTYPE_SPACE);
+      if (value[0] == 0)
+	{
+	  free (value);
+	  return MU_ERR_PARSE;
+	}
+    }
+  
+  text += val_len;
+  
   ws.ws_delim = " \t\r\n;";
   ws.ws_escape[0] = ws.ws_escape[1] = "\\\\\"\"";
   ws.ws_options = 0;
@@ -596,28 +623,20 @@ _mime_header_parse (const char *text, char **pvalue,
       mu_debug (MU_DEBCAT_MIME, MU_DEBUG_ERROR,
 		(_("wordsplit: %s"), mu_wordsplit_strerror (&ws)));
       mu_wordsplit_free (&ws);
+      free (value);
       return MU_ERR_PARSE;
     }
 
-  if (ws.ws_wordc == 0)
-    {
-      mu_wordsplit_free (&ws);
-      return MU_ERR_PARSE;
-    }
-  
   if (!assoc)
     {
-      if (!pvalue)
-	return MU_ERR_OUT_PTR_NULL;
-      *pvalue = strdup (ws.ws_wordv[0]);
+      if (pvalue)
+	*pvalue = value;
       mu_wordsplit_free (&ws);
-      if (!*pvalue)
-	return ENOMEM;
       return 0;
     }
     
   memset (&cont, 0, sizeof (cont));
-  for (i = 1; (rc = parse_param (&ws, &i, assoc, &cont, outcharset, subset)) == 0;)
+  for (i = 0; (rc = parse_param (&ws, &i, assoc, &cont, outcharset, subset)) == 0;)
     ;
   if (rc == MU_ERR_USER0)
     rc = 0;
@@ -627,13 +646,10 @@ _mime_header_parse (const char *text, char **pvalue,
   if (rc == 0)
     {
       if (pvalue)
-	{
-	  *pvalue = strdup (ws.ws_wordv[0]);
-	  if (!*pvalue)
-	    rc = ENOMEM;
-	}
+	*pvalue = value;
     }
-
+  else
+    free (value);
   mu_wordsplit_free (&ws);
 
   if (subset)
diff --git a/libmailutils/tests/content-type.at b/libmailutils/tests/content-type.at
index adeeeadeb..7dbda629f 100644
--- a/libmailutils/tests/content-type.at
+++ b/libmailutils/tests/content-type.at
@@ -49,5 +49,14 @@ CTHDR([missing subtype],[ctparse content-type],
 [conttype: Parse error
 ])
 
+CTHDR([whitespace],[ctparse content-type],
+[  text  /  plain  ; charset = utf-8;p =foo],
+[0],
+[type = text
+subtype = plain
+ 0: charset=utf-8
+ 1: p=foo
+])
+
 m4_popdef([CTHDR])
 
diff --git a/libmailutils/tests/conttype.c b/libmailutils/tests/conttype.c
index 1f3920b13..1cbd695c3 100644
--- a/libmailutils/tests/conttype.c
+++ b/libmailutils/tests/conttype.c
@@ -53,6 +53,8 @@ main (int argc, char **argv)
   while ((rc = mu_stream_getline (mu_strin, &buf, &size, &n)) == 0 && n > 0)
     {
       mu_rtrim_class (buf, MU_CTYPE_ENDLN);
+      if (buf[0] == 0)
+	continue;
       if (parse (buf))
 	result = 1;
     }
diff --git a/libmailutils/tests/mimehdr.at b/libmailutils/tests/mimehdr.at
index cef3a6e9b..ea6fab5f4 100644
--- a/libmailutils/tests/mimehdr.at
+++ b/libmailutils/tests/mimehdr.at
@@ -277,18 +277,17 @@ MIMEHDR_FAIL([empty input],[mimehdr15],
 [mimehdr: mu_mime_header_parse() failed: Parse error
 ])
 
-MIMEHDR_FAIL([missing semicolon after type],[mimehdr16],
+MIMEHDR([missing semicolon after type],[mimehdr16],
 [],
 [message name="foo"],
-[],
-[mimehdr: mu_mime_header_parse() failed: Parse error
+[message name="foo"
 ])
 
-MIMEHDR_FAIL([whitespace in type],[mimehdr17],
+MIMEHDR([whitespace in type],[mimehdr17],
 [],
 [TeX file/plain; name=foo],
-[],
-[mimehdr: mu_mime_header_parse() failed: Parse error
+[TeX file/plain
+name=foo
 ])
 
 m4_popdef([MIMEHDR])
author	Sergey Poznyakoff <gray@gnu.org>	2020-07-23 13:42:23 +0300
committer	Sergey Poznyakoff <gray@gnu.org>	2020-07-23 14:07:58 +0300
commit	ff9c0a396c14c219532dd667d49ebc7a3f376f9b (patch)
tree	a0d9a259072018dd97b936042df7d89daa3c8b41 /libmailutils
parent	78c28187ffb8f6ae8aa0662e6be193d23ae7808b (diff)
download	mailutils-ff9c0a396c14c219532dd667d49ebc7a3f376f9b.tar.gz mailutils-ff9c0a396c14c219532dd667d49ebc7a3f376f9b.tar.bz2