citadel/mime_parser.c

   1 /*
   2  * $Id$
   3  *
   4  * This is the MIME parser for Citadel.
   5  *
   6  * Copyright (c) 1998-2006 by Art Cancro
   7  * This code is distributed under the GNU General Public License v2.
   8  *
   9  */
  10
  11 #include <stdlib.h>
  12 #include <unistd.h>
  13 #include <stdio.h>
  14 #include <signal.h>
  15 #include <sys/types.h>
  16 #include <ctype.h>
  17 #include <string.h>
  18 #include <sys/stat.h>
  19 #include <errno.h>
  20
  21 #include "citadel.h"
  22 #include "server.h"
  23 #include "sysdep_decls.h"
  24 #include "tools.h"
  25
  26 #include "mime_parser.h"
  27
  28
  29 void extract_key(char *target, char *source, char *key)
  30 {
  31         char *ptr;
  32         char looking_for[256];
  33         int double_quotes = 0;
  34
  35         snprintf(looking_for, sizeof looking_for, "%s=", key);
  36
  37         ptr = bmstrcasestr(source, looking_for);
  38         if (ptr == NULL) {
  39                 strcpy(target, "");
  40                 return;
  41         }
  42         strcpy(target, (ptr + strlen(looking_for)));
  43
  44         for (ptr=target; (*ptr != 0); ++ptr) {
  45
  46                 /* A semicolon means we've hit the end of the key, unless we're inside double quotes */
  47                 if ( (double_quotes != 1) && (*ptr == ';')) {
  48                         *ptr = 0;
  49                 }
  50
  51                 /* if we find double quotes, we've got a great set of string boundaries */
  52                 if (*ptr == '\"') {
  53                         ++double_quotes;
  54                         if (double_quotes == 1) {
  55                                 strcpy(ptr, ptr+1);
  56                         }
  57                         else {
  58                                 *ptr = 0;
  59                         }
  60                 }
  61         }
  62 }
  63
  64
  65 /*
  66  * For non-multipart messages, we need to generate a quickie partnum of "1"
  67  * to return to callback functions.  Some callbacks demand it.
  68  */
  69 char *fixed_partnum(char *supplied_partnum) {
  70         if (supplied_partnum == NULL) return "1";
  71         if (strlen(supplied_partnum)==0) return "1";
  72         return supplied_partnum;
  73 }
  74
  75
  76
  77 /*
  78  * Given a message or message-part body and a length, handle any necessary
  79  * decoding and pass the request up the stack.
  80  */
  81 void mime_decode(char *partnum,
  82                  char *part_start, size_t length,
  83                  char *content_type, char *charset, char *encoding,
  84                  char *disposition,
  85                  char *name, char *filename,
  86                  void (*CallBack)
  87                   (char *cbname,
  88                    char *cbfilename,
  89                    char *cbpartnum,
  90                    char *cbdisp,
  91                    void *cbcontent,
  92                    char *cbtype,
  93                    char *cbcharset,
  94                    size_t cblength,
  95                    char *cbencoding,
  96                    void *cbuserdata),
  97                  void (*PreMultiPartCallBack)
  98                   (char *cbname,
  99                    char *cbfilename,
 100                    char *cbpartnum,
 101                    char *cbdisp,
 102                    void *cbcontent,
 103                    char *cbtype,
 104                    char *cbcharset,
 105                    size_t cblength,
 106                    char *cbencoding,
 107                    void *cbuserdata),
 108                  void (*PostMultiPartCallBack)
 109                   (char *cbname,
 110                    char *cbfilename,
 111                    char *cbpartnum,
 112                    char *cbdisp,
 113                    void *cbcontent,
 114                    char *cbtype,
 115                    char *cbcharset,
 116                    size_t cblength,
 117                    char *cbencoding,
 118                    void *cbuserdata),
 119                   void *userdata,
 120                   int dont_decode
 121 )
 122 {
 123
 124         char *decoded;
 125         size_t bytes_decoded = 0;
 126
 127         /* Some encodings aren't really encodings */
 128         if (!strcasecmp(encoding, "7bit"))
 129                 strcpy(encoding, "");
 130         if (!strcasecmp(encoding, "8bit"))
 131                 strcpy(encoding, "");
 132         if (!strcasecmp(encoding, "binary"))
 133                 strcpy(encoding, "");
 134
 135         /* If this part is not encoded, send as-is */
 136         if ( (strlen(encoding) == 0) || (dont_decode)) {
 137                 if (CallBack != NULL) {
 138                         CallBack(name, filename, fixed_partnum(partnum),
 139                                 disposition, part_start,
 140                                 content_type, charset, length, encoding, userdata);
 141                         }
 142                 return;
 143         }
 144
 145         /* Fail silently if we hit an unknown encoding. */
 146         if ((strcasecmp(encoding, "base64"))
 147             && (strcasecmp(encoding, "quoted-printable"))) {
 148                 return;
 149         }
 150
 151         /*
 152          * Allocate a buffer for the decoded data.  The output buffer is slightly
 153          * larger than the input buffer; this assumes that the decoded data
 154          * will never be significantly larger than the encoded data.  This is a
 155          * safe assumption with base64, uuencode, and quoted-printable.
 156          */
 157         decoded = malloc(length + 32768);
 158         if (decoded == NULL) {
 159                 return;
 160         }
 161
 162         if (!strcasecmp(encoding, "base64")) {
 163                 bytes_decoded = CtdlDecodeBase64(decoded, part_start, length);
 164         }
 165         else if (!strcasecmp(encoding, "quoted-printable")) {
 166                 bytes_decoded = CtdlDecodeQuotedPrintable(decoded, part_start, length);
 167         }
 168
 169         if (bytes_decoded > 0) if (CallBack != NULL) {
 170                 CallBack(name, filename, fixed_partnum(partnum),
 171                         disposition, decoded,
 172                         content_type, charset, bytes_decoded, "binary", userdata);
 173         }
 174
 175         free(decoded);
 176 }
 177
 178 /*
 179  * Break out the components of a multipart message
 180  * (This function expects to be fed HEADERS + CONTENT)
 181  * Note: NULL can be supplied as content_end; in this case, the message is
 182  * considered to have ended when the parser encounters a 0x00 byte.
 183  */
 184 void the_mime_parser(char *partnum,
 185                      char *content_start, char *content_end,
 186                      void (*CallBack)
 187                       (char *cbname,
 188                        char *cbfilename,
 189                        char *cbpartnum,
 190                        char *cbdisp,
 191                        void *cbcontent,
 192                        char *cbtype,
 193                        char *cbcharset,
 194                        size_t cblength,
 195                        char *cbencoding,
 196                        void *cbuserdata),
 197                      void (*PreMultiPartCallBack)
 198                       (char *cbname,
 199                        char *cbfilename,
 200                        char *cbpartnum,
 201                        char *cbdisp,
 202                        void *cbcontent,
 203                        char *cbtype,
 204                        char *cbcharset,
 205                        size_t cblength,
 206                        char *cbencoding,
 207                        void *cbuserdata),
 208                      void (*PostMultiPartCallBack)
 209                       (char *cbname,
 210                        char *cbfilename,
 211                        char *cbpartnum,
 212                        char *cbdisp,
 213                        void *cbcontent,
 214                        char *cbtype,
 215                        char *cbcharset,
 216                        size_t cblength,
 217                        char *cbencoding,
 218                        void *cbuserdata),
 219                       void *userdata,
 220                       int dont_decode
 221 )
 222 {
 223
 224         char *ptr;
 225         char *srch = NULL;
 226         char *part_start, *part_end = NULL;
 227         char buf[SIZ];
 228         char *header;
 229         char *boundary;
 230         char *startary;
 231         size_t startary_len = 0;
 232         char *endary;
 233         char *next_boundary;
 234         char *content_type;
 235         char *charset;
 236         size_t content_length;
 237         char *encoding;
 238         char *disposition;
 239         char *name = NULL;
 240         char *content_type_name;
 241         char *content_disposition_name;
 242         char *filename;
 243         int is_multipart;
 244         int part_seq = 0;
 245         int i;
 246         size_t length;
 247         char nested_partnum[256];
 248         int crlf_in_use = 0;
 249         char *evaluate_crlf_ptr = NULL;
 250         int buflen = 0;
 251         int headerlen = 0;
 252
 253         ptr = content_start;
 254         content_length = 0;
 255
 256         boundary = malloc(SIZ);
 257         memset(boundary, 0, SIZ);
 258
 259         startary = malloc(SIZ);
 260         memset(startary, 0, SIZ);
 261
 262         endary = malloc(SIZ);
 263         memset(endary, 0, SIZ);
 264
 265         header = malloc(SIZ);
 266         memset(header, 0, SIZ);
 267
 268         content_type = malloc(SIZ);
 269         memset(content_type, 0, SIZ);
 270
 271         charset = malloc(SIZ);
 272         memset(charset, 0, SIZ);
 273
 274         encoding = malloc(SIZ);
 275         memset(encoding, 0, SIZ);
 276
 277         content_type_name = malloc(SIZ);
 278         memset(content_type_name, 0, SIZ);
 279
 280         content_disposition_name = malloc(SIZ);
 281         memset(content_disposition_name, 0, SIZ);
 282
 283         filename = malloc(SIZ);
 284         memset(filename, 0, SIZ);
 285
 286         disposition = malloc(SIZ);
 287         memset(disposition, 0, SIZ);
 288
 289         /* If the caller didn't supply an endpointer, generate one by measure */
 290         if (content_end == NULL) {
 291                 content_end = &content_start[strlen(content_start)];
 292         }
 293
 294         /* Learn interesting things from the headers */
 295         strcpy(header, "");
 296         headerlen = 0;
 297         do {
 298                 ptr = memreadlinelen(ptr, buf, SIZ, &buflen);
 299                 if (ptr >= content_end) {
 300                         goto end_parser;
 301                 }
 302
 303                 for (i = 0; i < buflen; ++i) {
 304                         if (isspace(buf[i])) {
 305                                 buf[i] = ' ';
 306                         }
 307                 }
 308
 309                 if (!isspace(buf[0])) {
 310                         if (!strncasecmp(header, "Content-type:", 13)) {
 311                                 strcpy(content_type, &header[13]);
 312                                 striplt(content_type);
 313                                 extract_key(content_type_name, content_type, "name");
 314                                 extract_key(charset, content_type, "charset");
 315                                 extract_key(boundary, header, "boundary");
 316                                 /* Deal with weird headers */
 317                                 if (strchr(content_type, ' '))
 318                                         *(strchr(content_type, ' ')) = '\0';
 319                                 if (strchr(content_type, ';'))
 320                                         *(strchr(content_type, ';')) = '\0';
 321                         }
 322                         if (!strncasecmp(header, "Content-Disposition:", 20)) {
 323                                 strcpy(disposition, &header[20]);
 324                                 striplt(disposition);
 325                                 extract_key(content_disposition_name, disposition, "name");
 326                                 extract_key(filename, disposition, "filename");
 327                         }
 328                         if (!strncasecmp(header, "Content-length: ", 15)) {
 329                                 char clbuf[10];
 330                                 safestrncpy(clbuf, &header[15], sizeof clbuf);
 331                                 striplt(clbuf);
 332                                 content_length = (size_t) atol(clbuf);
 333                         }
 334                         if (!strncasecmp(header, "Content-transfer-encoding: ", 26)) {
 335                                 strcpy(encoding, &header[26]);
 336                                 striplt(encoding);
 337                         }
 338                         strcpy(header, "");
 339                         headerlen = 0;
 340                 }
 341                 if ((headerlen + buflen + 2) < SIZ) {
 342                         memcpy(&header[headerlen], buf, buflen);
 343                         headerlen += buflen;
 344                         header[headerlen] = '\0';
 345                 }
 346         } while ((!IsEmptyStr(buf)) && (*ptr != 0));
 347
 348         if (strchr(disposition, ';'))
 349                 *(strchr(disposition, ';')) = '\0';
 350         striplt(disposition);
 351         if (strchr(content_type, ';'))
 352                 *(strchr(content_type, ';')) = '\0';
 353         striplt(content_type);
 354
 355         if (!IsEmptyStr(boundary)) {
 356                 is_multipart = 1;
 357         } else {
 358                 is_multipart = 0;
 359         }
 360
 361         /* If this is a multipart message, then recursively process it */
 362         part_start = NULL;
 363         if (is_multipart) {
 364
 365                 /* Tell the client about this message's multipartedness */
 366                 if (PreMultiPartCallBack != NULL) {
 367                         PreMultiPartCallBack("", "", partnum, "",
 368                                 NULL, content_type, charset,
 369                                 0, encoding, userdata);
 370                 }
 371
 372                 /* Figure out where the boundaries are */
 373                 snprintf(startary, SIZ, "--%s", boundary);
 374                 snprintf(endary, SIZ, "--%s--", boundary);
 375                 startary_len = strlen(startary);
 376
 377                 part_start = NULL;
 378                 do {
 379                         next_boundary = NULL;
 380                         for (srch=ptr; srch<content_end; ++srch) {
 381                                 if (!memcmp(srch, startary, startary_len)) {
 382                                         next_boundary = srch;
 383                                         srch = content_end;
 384                                 }
 385                         }
 386
 387                         if ( (part_start != NULL) && (next_boundary != NULL) ) {
 388                                 part_end = next_boundary;
 389                                 --part_end;             /* omit the trailing LF */
 390                                 if (crlf_in_use) {
 391                                         --part_end;     /* omit the trailing CR */
 392                                 }
 393
 394                                 if (!IsEmptyStr(partnum)) {
 395                                         snprintf(nested_partnum,
 396                                                  sizeof nested_partnum,
 397                                                  "%s.%d", partnum,
 398                                                  ++part_seq);
 399                                 }
 400                                 else {
 401                                         snprintf(nested_partnum,
 402                                                  sizeof nested_partnum,
 403                                                  "%d", ++part_seq);
 404                                 }
 405                                 the_mime_parser(nested_partnum,
 406                                             part_start, part_end,
 407                                                 CallBack,
 408                                                 PreMultiPartCallBack,
 409                                                 PostMultiPartCallBack,
 410                                                 userdata,
 411                                                 dont_decode);
 412                         }
 413
 414                         if (next_boundary != NULL) {
 415                                 /* If we pass out of scope, don't attempt to
 416                                  * read past the end boundary. */
 417                                 if (!strcmp(next_boundary, endary)) {
 418                                         ptr = content_end;
 419                                 }
 420                                 else {
 421                                         /* Set up for the next part. */
 422                                         part_start = strstr(next_boundary, "\n");
 423
 424                                         /* Determine whether newlines are LF or CRLF */
 425                                         evaluate_crlf_ptr = part_start;
 426                                         --evaluate_crlf_ptr;
 427                                         if (!memcmp(evaluate_crlf_ptr, "\r\n", 2)) {
 428                                                 crlf_in_use = 1;
 429                                         }
 430                                         else {
 431                                                 crlf_in_use = 0;
 432                                         }
 433
 434                                         /* Advance past the LF ... now we're in the next part */
 435                                         ++part_start;
 436                                         ptr = part_start;
 437                                 }
 438                         }
 439                         else {
 440                                 /* Invalid end of multipart.  Bail out! */
 441                                 ptr = content_end;
 442                         }
 443                 } while ( (ptr < content_end) && (next_boundary != NULL) );
 444
 445                 if (PostMultiPartCallBack != NULL) {
 446                         PostMultiPartCallBack("", "", partnum, "", NULL,
 447                                 content_type, charset, 0, encoding, userdata);
 448                 }
 449                 goto end_parser;
 450         }
 451
 452         /* If it's not a multipart message, then do something with it */
 453         if (!is_multipart) {
 454                 part_start = ptr;
 455                 length = 0;
 456                 while (ptr < content_end) {
 457                         ++ptr;
 458                         ++length;
 459                 }
 460                 part_end = content_end;
 461
 462                 /******
 463                  * I thought there was an off-by-one error here, but there isn't.
 464                  * This probably means that there's an off-by-one error somewhere
 465                  * else ... or maybe only in certain messages?
 466                 --part_end;
 467                 --length;
 468                 ******/
 469
 470                 /* Truncate if the header told us to */
 471                 if ( (content_length > 0) && (length > content_length) ) {
 472                         length = content_length;
 473                 }
 474
 475                 /* Sometimes the "name" field is tacked on to Content-type,
 476                  * and sometimes it's tacked on to Content-disposition.  Use
 477                  * whichever one we have.
 478                  */
 479                 if (strlen(content_disposition_name) > strlen(content_type_name)) {
 480                         name = content_disposition_name;
 481                 }
 482                 else {
 483                         name = content_type_name;
 484                 }
 485
 486                 /* lprintf(CTDL_DEBUG, "mime_decode part=%s, len=%d, type=%s, charset=%s, encoding=%s\n",
 487                         partnum, length, content_type, charset, encoding); */
 488
 489                 /* Ok, we've got a non-multipart part here, so do something with it.
 490                  */
 491                 mime_decode(partnum,
 492                         part_start, length,
 493                         content_type, charset, encoding, disposition,
 494                         name, filename,
 495                         CallBack, NULL, NULL,
 496                         userdata, dont_decode
 497                 );
 498
 499                 /*
 500                  * Now if it's an encapsulated message/rfc822 then we have to recurse into it
 501                  */
 502                 if (!strcasecmp(content_type, "message/rfc822")) {
 503
 504                         if (PreMultiPartCallBack != NULL) {
 505                                 PreMultiPartCallBack("", "", partnum, "",
 506                                         NULL, content_type, charset,
 507                                         0, encoding, userdata);
 508                         }
 509                         if (CallBack != NULL) {
 510                                 if (strlen(partnum) > 0) {
 511                                         snprintf(nested_partnum,
 512                                                  sizeof nested_partnum,
 513                                                  "%s.%d", partnum,
 514                                                  ++part_seq);
 515                                 }
 516                                 else {
 517                                         snprintf(nested_partnum,
 518                                                  sizeof nested_partnum,
 519                                                  "%d", ++part_seq);
 520                                 }
 521                                 the_mime_parser(nested_partnum,
 522                                         part_start, part_end,
 523                                         CallBack,
 524                                         PreMultiPartCallBack,
 525                                         PostMultiPartCallBack,
 526                                         userdata,
 527                                         dont_decode
 528                                 );
 529                         }
 530                         if (PostMultiPartCallBack != NULL) {
 531                                 PostMultiPartCallBack("", "", partnum, "", NULL,
 532                                         content_type, charset, 0, encoding, userdata);
 533                         }
 534
 535
 536                 }
 537
 538         }
 539
 540 end_parser:     /* free the buffers!  end the oppression!! */
 541         free(boundary);
 542         free(startary);
 543         free(endary);
 544         free(header);
 545         free(content_type);
 546         free(charset);
 547         free(encoding);
 548         free(content_type_name);
 549         free(content_disposition_name);
 550         free(filename);
 551         free(disposition);
 552 }
 553
 554
 555
 556 /*
 557  * Entry point for the MIME parser.
 558  * (This function expects to be fed HEADERS + CONTENT)
 559  * Note: NULL can be supplied as content_end; in this case, the message is
 560  * considered to have ended when the parser encounters a 0x00 byte.
 561  */
 562 void mime_parser(char *content_start,
 563                 char *content_end,
 564
 565                  void (*CallBack)
 566                   (char *cbname,
 567                    char *cbfilename,
 568                    char *cbpartnum,
 569                    char *cbdisp,
 570                    void *cbcontent,
 571                    char *cbtype,
 572                    char *cbcharset,
 573                    size_t cblength,
 574                    char *cbencoding,
 575                    void *cbuserdata),
 576
 577                  void (*PreMultiPartCallBack)
 578                   (char *cbname,
 579                    char *cbfilename,
 580                    char *cbpartnum,
 581                    char *cbdisp,
 582                    void *cbcontent,
 583                    char *cbtype,
 584                    char *cbcharset,
 585                    size_t cblength,
 586                    char *cbencoding,
 587                    void *cbuserdata),
 588
 589                  void (*PostMultiPartCallBack)
 590                   (char *cbname,
 591                    char *cbfilename,
 592                    char *cbpartnum,
 593                    char *cbdisp,
 594                    void *cbcontent,
 595                    char *cbtype,
 596                    char *cbcharset,
 597                    size_t cblength,
 598                    char *cbencoding,
 599                    void *cbuserdata),
 600
 601                   void *userdata,
 602                   int dont_decode
 603 )
 604 {
 605
 606         the_mime_parser("", content_start, content_end,
 607                         CallBack,
 608                         PreMultiPartCallBack,
 609                         PostMultiPartCallBack,
 610                         userdata, dont_decode);
 611 }