* tiny tool for message retrieval, first draft.
[citadel.git] / webcit / mime_parser.c
1 /*
2  * $Id$
3  */
4 /**
5  * \defgroup MIME This is the MIME parser for Citadel.
6  *
7  * Copyright (c) 1998-2005 by Art Cancro
8  * This code is distributed under the terms of the GNU General Public License.
9  * \ingroup WebcitHttpServer
10  */
11 /*@{*/
12 #include "webcit.h"
13 #include "webserver.h"
14 #include "mime_parser.h"
15
/*
 * Pull the value of a "key=value" parameter out of a header line.
 *
 * target  receives the extracted value as a NUL-terminated string; it is set
 *         to the empty string when the key is not present.  The caller must
 *         supply a buffer at least as large as the remainder of source, as
 *         no length is passed in.
 * source  the header text to search (not modified).
 * key     the parameter name, without the trailing '='.
 *
 * The key is located with a case-insensitive substring search, so it will
 * also match when it is the tail of a longer parameter name.
 * NOTE(review): that means "name" matches inside "filename="; callers in this
 * file appear to tolerate this -- confirm before tightening the match.
 *
 * The value ends at the first semicolon outside double quotes, or at the
 * closing double quote when the value is quoted.  The opening quote is
 * removed from the result.
 */
void extract_key(char *target, char *source, char *key)
{
	char looking_for[256];
	char *value;
	char *ptr;
	int in_quotes = 0;

	snprintf(looking_for, sizeof looking_for, "%s=", key);

	value = bmstrcasestr(source, looking_for);
	if (value == NULL) {
		target[0] = '\0';
		return;
	}
	value += strlen(looking_for);

	/* memmove() rather than strcpy(): stays defined even if the buffers overlap */
	memmove(target, value, strlen(value) + 1);

	ptr = target;
	while (*ptr != '\0') {

		if (*ptr == '\"') {
			if (!in_quotes) {
				/*
				 * Opening quote: drop it by shifting the rest of the
				 * string down one byte.  The regions overlap, so this
				 * must be memmove().  Do not advance: the character now
				 * under ptr has not been examined yet (it may already be
				 * the closing quote of an empty value).
				 */
				in_quotes = 1;
				memmove(ptr, ptr + 1, strlen(ptr + 1) + 1);
				continue;
			}

			/* Closing quote: the value ends here */
			*ptr = '\0';
			break;
		}

		/* A semicolon ends the value, unless we're inside double quotes */
		if (!in_quotes && (*ptr == ';')) {
			*ptr = '\0';
			break;
		}

		++ptr;
	}
}
50
51
/*
 * Callbacks always receive a part number, but a message that is not
 * multipart has none of its own.  Substitute "1" whenever the supplied
 * part number is NULL or empty; otherwise hand back the caller's pointer
 * unchanged.
 */
char *fixed_partnum(char *supplied_partnum) {
	int have_partnum = (supplied_partnum != NULL) && (supplied_partnum[0] != '\0');

	return have_partnum ? supplied_partnum : "1";
}
61
62
63
/*
 * Deliver one message part to CallBack, undoing its transfer encoding first
 * when that is both necessary and requested.
 *
 * partnum       part number of this part; NULL/empty is reported as "1".
 * part_start    first byte of the (possibly encoded) part body.
 * length        number of bytes in the part body.
 * content_type, charset, disposition, name, filename
 *               passed through to CallBack untouched.
 * encoding      the Content-transfer-encoding value.  This buffer is
 *               rewritten to "" when it names an identity encoding
 *               ("7bit", "8bit" or "binary").
 * CallBack      receives the part; may be NULL, in which case nothing is
 *               delivered.
 * PreMultiPartCallBack, PostMultiPartCallBack
 *               accepted for signature symmetry; not used here.
 * userdata      opaque pointer handed to CallBack.
 * dont_decode   nonzero means deliver the body as-is even if it is encoded.
 *
 * Parts with an encoding other than base64 or quoted-printable are dropped
 * silently, as are parts for which the decoder reports zero bytes or for
 * which the output buffer cannot be allocated.
 */
void mime_decode(char *partnum,
		 char *part_start, size_t length,
		 char *content_type, char *charset, char *encoding,
		 char *disposition,
		 char *name, char *filename,
		 void (*CallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),
		 void (*PreMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),
		 void (*PostMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),
		  void *userdata,
		  int dont_decode
)
{
	char *outbuf;
	size_t outlen = 0;
	int is_base64;
	int is_qp;

	/* "7bit", "8bit" and "binary" all mean "no encoding applied" */
	if (   (strcasecmp(encoding, "7bit") == 0)
	    || (strcasecmp(encoding, "8bit") == 0)
	    || (strcasecmp(encoding, "binary") == 0) ) {
		strcpy(encoding, "");
	}

	/* Nothing to undo (or the caller wants the raw bytes): pass it straight through */
	if ( (encoding[0] == '\0') || dont_decode ) {
		if (CallBack != NULL) {
			CallBack(name, filename, fixed_partnum(partnum),
				disposition, part_start,
				content_type, charset, length, encoding, userdata);
		}
		return;
	}

	/* Only two real encodings are understood; anything else is ignored */
	is_base64 = (strcasecmp(encoding, "base64") == 0);
	is_qp = (strcasecmp(encoding, "quoted-printable") == 0);
	if (!is_base64 && !is_qp) {
		return;
	}

	/*
	 * Decoded output is never meaningfully larger than its encoded input
	 * for the encodings handled here, so the input length plus some slack
	 * is enough room.
	 */
	outbuf = malloc(length + 32768);
	if (outbuf == NULL) {
		return;
	}

	if (is_base64) {
		outlen = CtdlDecodeBase64(outbuf, part_start, length);
	}
	else if (is_qp) {
		outlen = CtdlDecodeQuotedPrintable(outbuf, part_start, length);
	}

	/* Decoded data is reported to the callback with encoding "binary" */
	if ( (outlen > 0) && (CallBack != NULL) ) {
		CallBack(name, filename, fixed_partnum(partnum),
			disposition, outbuf,
			content_type, charset, outlen, "binary", userdata);
	}

	free(outbuf);
}
164
165 /*
166  * Break out the components of a multipart message
167  * (This function expects to be fed HEADERS + CONTENT)
168  * Note: NULL can be supplied as content_end; in this case, the message is
169  * considered to have ended when the parser encounters a 0x00 byte.
170  */
171 void the_mime_parser(char *partnum,
172                      char *content_start, char *content_end,
173                      void (*CallBack)
174                       (char *cbname,
175                        char *cbfilename,
176                        char *cbpartnum,
177                        char *cbdisp,
178                        void *cbcontent,
179                        char *cbtype,
180                        char *cbcharset,
181                        size_t cblength,
182                        char *cbencoding,
183                        void *cbuserdata),
184                      void (*PreMultiPartCallBack)
185                       (char *cbname,
186                        char *cbfilename,
187                        char *cbpartnum,
188                        char *cbdisp,
189                        void *cbcontent,
190                        char *cbtype,
191                        char *cbcharset,
192                        size_t cblength,
193                        char *cbencoding,
194                        void *cbuserdata),
195                      void (*PostMultiPartCallBack)
196                       (char *cbname,
197                        char *cbfilename,
198                        char *cbpartnum,
199                        char *cbdisp,
200                        void *cbcontent,
201                        char *cbtype,
202                        char *cbcharset,
203                        size_t cblength,
204                        char *cbencoding,
205                        void *cbuserdata),
206                       void *userdata,
207                       int dont_decode
208 )
209 {
210
211         char *ptr;
212         char *srch = NULL;
213         char *part_start, *part_end = NULL;
214         char buf[SIZ];
215         char *header;
216         char *boundary;
217         char *startary;
218         size_t startary_len = 0;
219         char *endary;
220         char *next_boundary;
221         char *content_type;
222         char *charset;
223         size_t content_length;
224         char *encoding;
225         char *disposition;
226         char *name = NULL;
227         char *content_type_name;
228         char *content_disposition_name;
229         char *filename;
230         int is_multipart;
231         int part_seq = 0;
232         int i;
233         size_t length;
234         char nested_partnum[256];
235         int crlf_in_use = 0;
236         char *evaluate_crlf_ptr = NULL;
237         int buflen = 0;
238         int headerlen = 0;
239
240         ptr = content_start;
241         content_length = 0;
242
243         boundary = malloc(SIZ);
244         memset(boundary, 0, SIZ);
245
246         startary = malloc(SIZ);
247         memset(startary, 0, SIZ);
248
249         endary = malloc(SIZ);
250         memset(endary, 0, SIZ);
251
252         header = malloc(SIZ);
253         memset(header, 0, SIZ);
254
255         content_type = malloc(SIZ);
256         memset(content_type, 0, SIZ);
257
258         charset = malloc(SIZ);
259         memset(charset, 0, SIZ);
260
261         encoding = malloc(SIZ);
262         memset(encoding, 0, SIZ);
263
264         content_type_name = malloc(SIZ);
265         memset(content_type_name, 0, SIZ);
266
267         content_disposition_name = malloc(SIZ);
268         memset(content_disposition_name, 0, SIZ);
269
270         filename = malloc(SIZ);
271         memset(filename, 0, SIZ);
272
273         disposition = malloc(SIZ);
274         memset(disposition, 0, SIZ);
275
276         /* If the caller didn't supply an endpointer, generate one by measure */
277         if (content_end == NULL) {
278                 content_end = &content_start[strlen(content_start)];
279         }
280
281         /* Learn interesting things from the headers */
282         strcpy(header, "");
283         headerlen = 0;
284         do {
285                 ptr = memreadlinelen(ptr, buf, SIZ, &buflen);
286                 if (ptr >= content_end) {
287                         goto end_parser;
288                 }
289
290                 for (i = 0; i < buflen; ++i) {
291                         if (isspace(buf[i])) {
292                                 buf[i] = ' ';
293                         }
294                 }
295
296                 if (!isspace(buf[0])) {
297                         if (!strncasecmp(header, "Content-type:", 13)) {
298                                 strcpy(content_type, &header[13]);
299                                 striplt(content_type);
300                                 extract_key(content_type_name, content_type, "name");
301                                 extract_key(charset, content_type, "charset");
302                                 extract_key(boundary, header, "boundary");
303                                 /* Deal with weird headers */
304                                 if (strchr(content_type, ' '))
305                                         *(strchr(content_type, ' ')) = '\0';
306                                 if (strchr(content_type, ';'))
307                                         *(strchr(content_type, ';')) = '\0';
308                         }
309                         if (!strncasecmp(header, "Content-Disposition:", 20)) {
310                                 strcpy(disposition, &header[20]);
311                                 striplt(disposition);
312                                 extract_key(content_disposition_name, disposition, "name");
313                                 extract_key(filename, disposition, "filename");
314                         }
315                         if (!strncasecmp(header, "Content-length: ", 15)) {
316                                 char clbuf[10];
317                                 safestrncpy(clbuf, &header[15], sizeof clbuf);
318                                 striplt(clbuf);
319                                 content_length = (size_t) atol(clbuf);
320                         }
321                         if (!strncasecmp(header, "Content-transfer-encoding: ", 26)) {
322                                 strcpy(encoding, &header[26]);
323                                 striplt(encoding);
324                         }
325                         strcpy(header, "");
326                         headerlen = 0;
327                 }
328                 if ((headerlen + buflen + 2) < SIZ) {
329                         memcpy(&header[headerlen], buf, buflen);
330                         headerlen += buflen;
331                         header[headerlen] = '\0';
332                 }
333         } while ((!IsEmptyStr(buf)) && (*ptr != 0));
334
335         if (strchr(disposition, ';'))
336                 *(strchr(disposition, ';')) = '\0';
337         striplt(disposition);
338         if (strchr(content_type, ';'))
339                 *(strchr(content_type, ';')) = '\0';
340         striplt(content_type);
341
342         if (!IsEmptyStr(boundary)) {
343                 is_multipart = 1;
344         } else {
345                 is_multipart = 0;
346         }
347
348         /* If this is a multipart message, then recursively process it */
349         part_start = NULL;
350         if (is_multipart) {
351
352                 /* Tell the client about this message's multipartedness */
353                 if (PreMultiPartCallBack != NULL) {
354                         PreMultiPartCallBack("", "", partnum, "",
355                                 NULL, content_type, charset,
356                                 0, encoding, userdata);
357                 }
358
359                 /* Figure out where the boundaries are */
360                 snprintf(startary, SIZ, "--%s", boundary);
361                 snprintf(endary, SIZ, "--%s--", boundary);
362                 startary_len = strlen(startary);
363
364                 part_start = NULL;
365                 do {
366                         next_boundary = NULL;
367                         for (srch=ptr; srch<content_end; ++srch) {
368                                 if (!memcmp(srch, startary, startary_len)) {
369                                         next_boundary = srch;
370                                         srch = content_end;
371                                 }
372                         }
373
374                         if ( (part_start != NULL) && (next_boundary != NULL) ) {
375                                 part_end = next_boundary;
376                                 --part_end;             /* omit the trailing LF */
377                                 if (crlf_in_use) {
378                                         --part_end;     /* omit the trailing CR */
379                                 }
380
381                                 if (!IsEmptyStr(partnum)) {
382                                         snprintf(nested_partnum,
383                                                  sizeof nested_partnum,
384                                                  "%s.%d", partnum,
385                                                  ++part_seq);
386                                 }
387                                 else {
388                                         snprintf(nested_partnum,
389                                                  sizeof nested_partnum,
390                                                  "%d", ++part_seq);
391                                 }
392                                 the_mime_parser(nested_partnum,
393                                             part_start, part_end,
394                                                 CallBack,
395                                                 PreMultiPartCallBack,
396                                                 PostMultiPartCallBack,
397                                                 userdata,
398                                                 dont_decode);
399                         }
400
401                         if (next_boundary != NULL) {
402                                 /* If we pass out of scope, don't attempt to
403                                  * read past the end boundary. */
404                                 if (!strcmp(next_boundary, endary)) {
405                                         ptr = content_end;
406                                 }
407                                 else {
408                                         /* Set up for the next part. */
409                                         part_start = strstr(next_boundary, "\n");
410                                         
411                                         /* Determine whether newlines are LF or CRLF */
412                                         evaluate_crlf_ptr = part_start;
413                                         --evaluate_crlf_ptr;
414                                         if (!memcmp(evaluate_crlf_ptr, "\r\n", 2)) {
415                                                 crlf_in_use = 1;
416                                         }
417                                         else {
418                                                 crlf_in_use = 0;
419                                         }
420
421                                         /* Advance past the LF ... now we're in the next part */
422                                         ++part_start;
423                                         ptr = part_start;
424                                 }
425                         }
426                         else {
427                                 /* Invalid end of multipart.  Bail out! */
428                                 ptr = content_end;
429                         }
430                 } while ( (ptr < content_end) && (next_boundary != NULL) );
431
432                 if (PostMultiPartCallBack != NULL) {
433                         PostMultiPartCallBack("", "", partnum, "", NULL,
434                                 content_type, charset, 0, encoding, userdata);
435                 }
436                 goto end_parser;
437         }
438
439         /* If it's not a multipart message, then do something with it */
440         if (!is_multipart) {
441                 part_start = ptr;
442                 length = 0;
443                 while (ptr < content_end) {
444                         ++ptr;
445                         ++length;
446                 }
447                 part_end = content_end;
448
449                 /******
450                  * I thought there was an off-by-one error here, but there isn't.
451                  * This probably means that there's an off-by-one error somewhere
452                  * else ... or maybe only in certain messages?
453                 --part_end;
454                 --length;
455                 ******/
456                 
457                 /* Truncate if the header told us to */
458                 if ( (content_length > 0) && (length > content_length) ) {
459                         length = content_length;
460                 }
461
462                 /* Sometimes the "name" field is tacked on to Content-type,
463                  * and sometimes it's tacked on to Content-disposition.  Use
464                  * whichever one we have.
465                  */
466                 if (strlen(content_disposition_name) > strlen(content_type_name)) {
467                         name = content_disposition_name;
468                 }
469                 else {
470                         name = content_type_name;
471                 }
472         
473                 /* lprintf(CTDL_DEBUG, "mime_decode part=%s, len=%d, type=%s, charset=%s, encoding=%s\n",
474                         partnum, length, content_type, charset, encoding); */
475
476                 /* Ok, we've got a non-multipart part here, so do something with it.
477                  */
478                 mime_decode(partnum,
479                         part_start, length,
480                         content_type, charset, encoding, disposition,
481                         name, filename,
482                         CallBack, NULL, NULL,
483                         userdata, dont_decode
484                 );
485
486                 /*
487                  * Now if it's an encapsulated message/rfc822 then we have to recurse into it
488                  */
489                 if (!strcasecmp(content_type, "message/rfc822")) {
490
491                         if (PreMultiPartCallBack != NULL) {
492                                 PreMultiPartCallBack("", "", partnum, "",
493                                         NULL, content_type, charset,
494                                         0, encoding, userdata);
495                         }
496                         if (CallBack != NULL) {
497                                 if (strlen(partnum) > 0) {
498                                         snprintf(nested_partnum,
499                                                  sizeof nested_partnum,
500                                                  "%s.%d", partnum,
501                                                  ++part_seq);
502                                 }
503                                 else {
504                                         snprintf(nested_partnum,
505                                                  sizeof nested_partnum,
506                                                  "%d", ++part_seq);
507                                 }
508                                 the_mime_parser(nested_partnum,
509                                         part_start, part_end,
510                                         CallBack,
511                                         PreMultiPartCallBack,
512                                         PostMultiPartCallBack,
513                                         userdata,
514                                         dont_decode
515                                 );
516                         }
517                         if (PostMultiPartCallBack != NULL) {
518                                 PostMultiPartCallBack("", "", partnum, "", NULL,
519                                         content_type, charset, 0, encoding, userdata);
520                         }
521
522
523                 }
524
525         }
526
527 end_parser:     /* free the buffers!  end the oppression!! */
528         free(boundary);
529         free(startary);
530         free(endary);   
531         free(header);
532         free(content_type);
533         free(charset);
534         free(encoding);
535         free(content_type_name);
536         free(content_disposition_name);
537         free(filename);
538         free(disposition);
539 }
540
541
542
/*
 * Public entry point for the MIME parser.
 *
 * Feed it HEADERS + CONTENT.  content_end may be NULL, in which case the
 * message is taken to end at the first 0x00 byte.  All arguments are handed
 * to the_mime_parser() unchanged, with an empty part number to mark the
 * top level of the message.
 */
void mime_parser(char *content_start,
		char *content_end,

		 void (*CallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),

		 void (*PreMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),

		 void (*PostMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),

		  void *userdata,
		  int dont_decode
)
{
	/* The outermost entity has no part number of its own */
	char *top_level_partnum = "";

	the_mime_parser(top_level_partnum,
			content_start, content_end,
			CallBack, PreMultiPartCallBack, PostMultiPartCallBack,
			userdata,
			dont_decode);
}
597                         userdata, dont_decode);
598 }