webcit/decode.c

   1 /*
   2  * Copyright (c) 1996-2012 by the citadel.org team
   3  *
   4  * This program is open source software.  You can redistribute it and/or
   5  * modify it under the terms of the GNU General Public License, version 3.
   6  *
   7  * This program is distributed in the hope that it will be useful,
   8  * but WITHOUT ANY WARRANTY; without even the implied warranty of
   9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  10  * GNU General Public License for more details.
  11  */
  12
  13 #include "webcit.h"
  14 #ifdef HAVE_ICONV
  15
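  /*
   * Wrapper around iconv_open()
   * Our version adds aliases for non-standard Microsoft charsets
   * such as 'MS950', aliasing them to names like 'CP950'
   *
   * tocode       Target encoding
   * fromcode     Source encoding
   *
   * NOTE(review): the marker on the next line has a space between '*' and
   * '/', so it does not end this comment.  The two-argument definition that
   * follows therefore stays commented out up to the closing marker after its
   * body.  The live code in this file calls a three-argument
   * ctdl_iconv_open(tocode, fromcode, &ic), presumably provided elsewhere;
   * do not "repair" the marker without checking for a conflicting definition.
   * /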
  24 iconv_t ctdl_iconv_open(const char *tocode, const char *fromcode)
  25 {
  26         iconv_t ic = (iconv_t)(-1) ;
  27         ic = iconv_open(tocode, fromcode);
  28         if (ic == (iconv_t)(-1) ) {
  29                 char alias_fromcode[64];
  30                 if ( (strlen(fromcode) == 5) && (!strncasecmp(fromcode, "MS", 2)) ) {
  31                         safestrncpy(alias_fromcode, fromcode, sizeof alias_fromcode);
  32                         alias_fromcode[0] = 'C';
  33                         alias_fromcode[1] = 'P';
  34                         ic = iconv_open(tocode, alias_fromcode);
  35                 }
  36         }
  37         return(ic);
  38 }
  39 */
  40
  41
  /*
   * Locate the "?=" that closes the RFC2047 encoded-word starting at bptr.
   *
   * bptr         Points at the "=?" that opens the encoded-word
   *
   * If the word has the regular "=?charset?X?text?=" layout (X being B or Q,
   * in either case) the search for "?=" starts after the "?X?" marker, so a
   * payload that begins with '=' is not mistaken for the terminator.
   * Otherwise the first "?=" found anywhere from bptr on is used as a guess.
   *
   * Returns a pointer to the closing "?=", or NULL if there is none.
   */
  static inline char *FindNextEnd (char *bptr)
  {
          char *marker;
          char enc;

          /* bptr + 2 only lies inside the string if it has two characters */
          if ((bptr == NULL) || (bptr[0] == '\0') || (bptr[1] == '\0'))
                  return NULL;

          /* Find the next ?Q? */
          marker = strchr(bptr + 2, '?');
          if (marker == NULL)
                  return NULL;

          /* marker[1] is at worst the terminator; marker[2] is only read
           * once marker[1] is known to be a real character. */
          enc = marker[1];
          if (((enc == 'B') || (enc == 'b') || (enc == 'Q') || (enc == 'q')) &&
              (marker[2] == '?')) {
                  /* skip on to the end of the cluster, the next ?= */
                  return strstr(marker + 3, "?=");
          }

          /* sort of half valid encoding, try to find an end. */
          return strstr(bptr, "?=");
  }
  58
  59 /*
  60  * Handle subjects with RFC2047 encoding such as:
  61  * =?koi8-r?B?78bP0s3Mxc7JxSDXz9rE1dvO2c3JINvB0sHNySDP?=
  62  */
  63 void utf8ify_rfc822_string(char **buf) {
  64         char *start, *end, *next, *nextend, *ptr;
  65         char newbuf[1024];
  66         char charset[128];
  67         char encoding[16];
  68         char istr[1024];
  69         iconv_t ic = (iconv_t)(-1) ;
  70         char *ibuf;                     /**< Buffer of characters to be converted */
  71         char *obuf;                     /**< Buffer for converted characters */
  72         size_t ibuflen;                 /**< Length of input buffer */
  73         size_t obuflen;                 /**< Length of output buffer */
  74         char *isav;                     /**< Saved pointer to input buffer */
  75         char *osav;                     /**< Saved pointer to output buffer */
  76         int passes = 0;
  77         int i, len, delta;
  78         int illegal_non_rfc2047_encoding = 0;
  79
  80         /* Sometimes, badly formed messages contain strings which were simply
  81          *  written out directly in some foreign character set instead of
  82          *  using RFC2047 encoding.  This is illegal but we will attempt to
  83          *  handle it anyway by converting from a user-specified default
  84          *  charset to UTF-8 if we see any nonprintable characters.
  85          */
  86         len = strlen(*buf);
  87         for (i=0; i<len; ++i) {
  88                 if (((*buf)[i] < 32) || ((*buf)[i] > 126)) {
  89                         illegal_non_rfc2047_encoding = 1;
  90                         i = len; /*< take a shortcut, it won't be more than one. */
  91                 }
  92         }
  93         if (illegal_non_rfc2047_encoding) {
  94                 StrBuf *default_header_charset;
  95                 get_preference("default_header_charset", &default_header_charset);
  96                 if ( (strcasecmp(ChrPtr(default_header_charset), "UTF-8")) &&
  97                      (strcasecmp(ChrPtr(default_header_charset), "us-ascii")) ) {
  98                         ctdl_iconv_open("UTF-8", ChrPtr(default_header_charset), &ic);
  99                         if (ic != (iconv_t)(-1) ) {
 100                                 ibuf = malloc(1024);
 101                                 isav = ibuf;
 102                                 safestrncpy(ibuf, *buf, 1023);
 103                                 ibuflen = strlen(ibuf);
 104                                 obuflen = 1024;
 105                                 obuf = (char *) malloc(obuflen);
 106                                 osav = obuf;
 107                                 iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
 108                                 osav[1023-obuflen] = 0;
 109                                 free(*buf);
 110                                 *buf = osav;
 111                                 iconv_close(ic);
 112                                 free(isav);
 113                         }
 114                 }
 115         }
 116
 117         /* pre evaluate the first pair */
 118         nextend = end = NULL;
 119         len = strlen(*buf);
 120         start = strstr(*buf, "=?");
 121         if (start != NULL)
 122                 end = FindNextEnd (start);
 123
 124         while ((start != NULL) && (end != NULL))
 125         {
 126                 next = strstr(end, "=?");
 127                 if (next != NULL)
 128                         nextend = FindNextEnd(next);
 129                 if (nextend == NULL)
 130                         next = NULL;
 131
 132                 /* did we find two partitions */
 133                 if ((next != NULL) &&
 134                     ((next - end) > 2))
 135                 {
 136                         ptr = end + 2;
 137                         while ((ptr < next) &&
 138                                (isspace(*ptr) ||
 139                                 (*ptr == '\r') ||
 140                                 (*ptr == '\n') ||
 141                                 (*ptr == '\t')))
 142                                 ptr ++;
 143                         /* did we find a gab just filled with blanks? */
 144                         if (ptr == next)
 145                         {
 146                                 memmove (end + 2,
 147                                          next,
 148                                          len - (next - start));
 149
 150                                 /* now terminate the gab at the end */
 151                                 delta = (next - end) - 2;
 152                                 len -= delta;
 153                                 (*buf)[len] = '\0';
 154
 155                                 /* move next to its new location. */
 156                                 next -= delta;
 157                                 nextend -= delta;
 158                         }
 159                 }
 160                 /* our next-pair is our new first pair now. */
 161                 start = next;
 162                 end = nextend;
 163         }
 164
 165         /* Now we handle foreign character sets properly encoded
 166          * in RFC2047 format.
 167          */
 168         while (start=strstr((*buf), "=?"), end=FindNextEnd((start != NULL)? start : (*buf)),
 169                 ((start != NULL) && (end != NULL) && (end > start)) )
 170         {
 171                 extract_token(charset, start, 1, '?', sizeof charset);
 172                 extract_token(encoding, start, 2, '?', sizeof encoding);
 173                 extract_token(istr, start, 3, '?', sizeof istr);
 174
 175                 ibuf = malloc(1024);
 176                 isav = ibuf;
 177                 if (!strcasecmp(encoding, "B")) {       /**< base64 */
 178                         ibuflen = CtdlDecodeBase64(ibuf, istr, strlen(istr));
 179                 }
 180                 else if (!strcasecmp(encoding, "Q")) {  /**< quoted-printable */
 181                         size_t len;
 182                         long pos;
 183
 184                         len = strlen(istr);
 185                         pos = 0;
 186                         while (pos < len)
 187                         {
 188                                 if (istr[pos] == '_') istr[pos] = ' ';
 189                                 pos++;
 190                         }
 191
 192                         ibuflen = CtdlDecodeQuotedPrintable(ibuf, istr, len);
 193                 }
 194                 else {
 195                         strcpy(ibuf, istr);             /**< unknown encoding */
 196                         ibuflen = strlen(istr);
 197                 }
 198
 199                 ctdl_iconv_open("UTF-8", charset, &ic);
 200                 if (ic != (iconv_t)(-1) ) {
 201                         obuflen = 1024;
 202                         obuf = (char *) malloc(obuflen);
 203                         osav = obuf;
 204                         iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
 205                         osav[1024-obuflen] = 0;
 206
 207                         end = start;
 208                         end++;
 209                         strcpy(start, "");
 210                         remove_token(end, 0, '?');
 211                         remove_token(end, 0, '?');
 212                         remove_token(end, 0, '?');
 213                         remove_token(end, 0, '?');
 214                         strcpy(end, &end[1]);
 215
 216                         snprintf(newbuf, sizeof newbuf, "%s%s%s", *buf, osav, end);
 217                         strcpy(*buf, newbuf);
 218
 219                         free(osav);
 220                         iconv_close(ic);
 221                 }
 222                 else {
 223                         end = start;
 224                         end++;
 225                         strcpy(start, "");
 226                         remove_token(end, 0, '?');
 227                         remove_token(end, 0, '?');
 228                         remove_token(end, 0, '?');
 229                         remove_token(end, 0, '?');
 230                         strcpy(end, &end[1]);
 231
 232                         snprintf(newbuf, sizeof newbuf, "%s(unreadable)%s", *buf, end);
 233                         strcpy(*buf, newbuf);
 234                 }
 235
 236                 free(isav);
 237
 238                 /*
 239                  * Since spammers will go to all sorts of absurd lengths to get their
 240                  * messages through, there are LOTS of corrupt headers out there.
 241                  * So, prevent a really badly formed RFC2047 header from throwing
 242                  * this function into an infinite loop.
 243                  */
 244                 ++passes;
 245                 if (passes > 20) return;
 246         }
 247
 248 }
 249 #else
 /*
  * Built without HAVE_ICONV: no charset conversion is available, so the
  * header is left exactly as it was passed in.
  *
  * This is an ordinary external definition on purpose.  A plain 'inline'
  * definition emits no out-of-line symbol under C99/C11 rules, which can
  * leave callers with an undefined reference.
  *
  * a            Address of the header string (unused)
  */
 void utf8ify_rfc822_string(char **a)
 {
         (void) a;
 }
 251
 252 #endif
 253
 254
 255
 256
 /**
  * \brief       RFC2047-encode a header field if necessary.
  *              If no non-ASCII characters are found, the string
  *              will be copied verbatim without encoding.
  *
  *              Otherwise the result is a single "=?UTF-8?Q?...?=" word.
  *              Bytes below 33 (space included), bytes above 126 and
  *              the characters '=', '?' and '_' are written as "=XX"
  *              (upper-case hex), because RFC2047 gives them a special
  *              meaning inside an encoded-word.
  *
  * \param       target          Target buffer.
  * \param       maxlen          Maximum size of target buffer, including
  *                              the terminating NUL.
  * \param       source          Source string to be encoded.
  * \param       SourceLen       Length of the source string
  * \returns     encoded length (without the NUL); -1 if non success,
  *              i.e. on NULL pointers, negative length, or a result that
  *              does not fit into maxlen bytes.
  */
 int webcit_rfc2047encode(char *target, int maxlen, char *source, long SourceLen)
 {
         static const char header[] = "=?UTF-8?Q?";
         static const char trailer[] = "?=";
         static const char hexdigits[] = "0123456789ABCDEF";
         const long header_len = (long) (sizeof header - 1);
         const long trailer_len = (long) (sizeof trailer - 1);
         int need_to_encode = 0;
         long i;
         long len;

         /* SourceLen == maxlen would leave no room for the terminator */
         if ((source == NULL) ||
             (target == NULL) ||
             (SourceLen < 0) ||
             (SourceLen >= maxlen)) return -1;

         /* Look for the first byte outside printable ASCII; the scan stops
          * at a NUL or at SourceLen, whichever comes first. */
         for (i = 0; (i < SourceLen) && (source[i] != '\0'); ++i) {
                 unsigned char ch = (unsigned char) source[i];
                 if ((ch < 32) || (ch > 126)) {
                         need_to_encode = 1;
                         break;
                 }
         }

         if (!need_to_encode) {
                 memcpy (target, source, (size_t) SourceLen);
                 target[SourceLen] = '\0';
                 return (int) SourceLen;
         }

         /* Smallest possible result: header + one byte per character +
          * trailer + NUL.  Written as a subtraction to avoid overflow. */
         if (SourceLen > maxlen - (header_len + trailer_len + 1))
                 return -1;

         memcpy (target, header, (size_t) header_len);
         len = header_len;
         for (i = 0; i < SourceLen; ++i) {
                 unsigned char ch = (unsigned char) source[i];
                 int escape = (ch <= 32) || (ch > 126) ||
                              (ch == '=') || (ch == '?') || (ch == '_');
                 long need = escape ? 3 : 1;

                 /* always keep room for the trailer and the terminator */
                 if (len + need + trailer_len + 1 > maxlen) {
                         target[len] = '\0';
                         return -1;
                 }
                 if (escape) {
                         target[len++] = '=';
                         target[len++] = hexdigits[ch >> 4];
                         target[len++] = hexdigits[ch & 0x0F];
                 }
                 else {
                         target[len++] = (char) ch;
                 }
         }

         /* copies the trailer together with its terminating NUL */
         memcpy (target + len, trailer, (size_t) trailer_len + 1);
         len += trailer_len;
         return (int) len;
 }
 320