* tiny tool for message retrieval, first draft.
[citadel.git] / citadel / mime_parser.c
1 /*
2  * $Id$
3  *
4  * This is the MIME parser for Citadel.
5  *
6  * Copyright (c) 1998-2006 by Art Cancro
7  * This code is distributed under the GNU General Public License v2.
8  *
9  */
10
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <stdio.h>
14 #include <signal.h>
15 #include <sys/types.h>
16 #include <ctype.h>
17 #include <string.h>
18 #include <sys/stat.h>
19 #include <errno.h>
20
21 #include "citadel.h"
22 #include "server.h"
23 #include "sysdep_decls.h"
24 #include "tools.h"
25
26 #include "mime_parser.h"
27
28
/*
 * Extract the value of a "key=value" parameter from a MIME header field.
 *
 * target  receives the NUL-terminated value, or an empty string when the key
 *         is not present.  The caller must supply a buffer large enough to
 *         hold the value (never more than strlen(source) + 1 bytes are
 *         written).
 * source  header text to search, e.g. "text/plain; charset=\"utf-8\"".
 * key     parameter name without the trailing '='.
 *
 * The value ends at the first semicolon that is not inside double quotes, at
 * the closing double quote, or at the end of the string.  The opening double
 * quote is dropped from the output.
 *
 * NOTE(review): the key is located with bmstrcasestr(), presumably a
 * case-insensitive substring search, so a key such as "name" would also
 * match inside "filename=" -- confirm whether callers rely on that.
 */
void extract_key(char *target, char *source, char *key)
{
        char looking_for[256];
        char *src;
        char *dst = target;
        int double_quotes = 0;

        snprintf(looking_for, sizeof looking_for, "%s=", key);

        src = bmstrcasestr(source, looking_for);
        if (src == NULL) {
                *target = '\0';
                return;
        }
        src += strlen(looking_for);

        /*
         * Copy the value one character at a time instead of copying the
         * whole remainder and shifting it in place.  This avoids an
         * overlapping strcpy() and guarantees that every character is
         * examined, including the one directly after the opening quote
         * (so an empty quoted value "" yields an empty string).
         */
        for (; *src != '\0'; ++src) {

                /* Double quotes give us a reliable set of string boundaries */
                if (*src == '\"') {
                        ++double_quotes;
                        if (double_quotes == 1) {
                                continue;       /* opening quote: drop it */
                        }
                        break;                  /* closing quote: value ends */
                }

                /* A semicolon ends the value unless we are inside quotes */
                if ((*src == ';') && (double_quotes != 1)) {
                        break;
                }

                *dst++ = *src;
        }
        *dst = '\0';
}
63
64
/*
 * Callbacks always expect a part number.  A non-multipart message has none,
 * so substitute "1" whenever the supplied part number is NULL or empty;
 * otherwise hand back the caller's own string unchanged.
 */
char *fixed_partnum(char *supplied_partnum) {
        if ((supplied_partnum != NULL) && (supplied_partnum[0] != '\0')) {
                return supplied_partnum;
        }
        return "1";
}
74
75
76
/*
 * Deliver one message part to CallBack, decoding its transfer encoding
 * first when that is both necessary and permitted.
 *
 * - "7bit", "8bit" and "binary" are identity encodings; the caller's
 *   encoding buffer is rewritten to an empty string for them.
 * - An empty encoding, or a nonzero dont_decode, passes part_start/length
 *   to CallBack untouched.
 * - "base64" and "quoted-printable" are decoded into a temporary buffer
 *   which is passed to CallBack (labelled "binary") and then released.
 *   Nothing is delivered when the decoder produces zero bytes.
 * - Any other encoding, or an allocation failure, returns silently.
 *
 * PreMultiPartCallBack and PostMultiPartCallBack are accepted but not used
 * by this function.
 */
void mime_decode(char *partnum,
                 char *part_start, size_t length,
                 char *content_type, char *charset, char *encoding,
                 char *disposition,
                 char *name, char *filename,
                 void (*CallBack)
                  (char *cbname,
                   char *cbfilename,
                   char *cbpartnum,
                   char *cbdisp,
                   void *cbcontent,
                   char *cbtype,
                   char *cbcharset,
                   size_t cblength,
                   char *cbencoding,
                   void *cbuserdata),
                 void (*PreMultiPartCallBack)
                  (char *cbname,
                   char *cbfilename,
                   char *cbpartnum,
                   char *cbdisp,
                   void *cbcontent,
                   char *cbtype,
                   char *cbcharset,
                   size_t cblength,
                   char *cbencoding,
                   void *cbuserdata),
                 void (*PostMultiPartCallBack)
                  (char *cbname,
                   char *cbfilename,
                   char *cbpartnum,
                   char *cbdisp,
                   void *cbcontent,
                   char *cbtype,
                   char *cbcharset,
                   size_t cblength,
                   char *cbencoding,
                   void *cbuserdata),
                  void *userdata,
                  int dont_decode
)
{
        char *outbuf;
        size_t outlen = 0;
        int is_base64;
        int is_qp;

        /* Identity "encodings" are normalized to the empty string */
        if (   (strcasecmp(encoding, "7bit") == 0)
            || (strcasecmp(encoding, "8bit") == 0)
            || (strcasecmp(encoding, "binary") == 0) ) {
                encoding[0] = '\0';
        }

        /* Nothing to decode (or decoding not wanted): pass the raw part on */
        if ((encoding[0] == '\0') || (dont_decode)) {
                if (CallBack != NULL) {
                        CallBack(name, filename, fixed_partnum(partnum),
                                disposition, part_start,
                                content_type, charset, length, encoding, userdata);
                }
                return;
        }

        /* Only two real encodings are understood; anything else is dropped */
        is_base64 = (strcasecmp(encoding, "base64") == 0);
        is_qp = (strcasecmp(encoding, "quoted-printable") == 0);
        if ((!is_base64) && (!is_qp)) {
                return;
        }

        /*
         * The output buffer is the input size plus some slack, on the
         * assumption that decoded data is never significantly larger than
         * its encoded form.
         */
        outbuf = malloc(length + 32768);
        if (outbuf == NULL) {
                return;
        }

        if (is_base64) {
                outlen = CtdlDecodeBase64(outbuf, part_start, length);
        }
        else {
                outlen = CtdlDecodeQuotedPrintable(outbuf, part_start, length);
        }

        if ((outlen > 0) && (CallBack != NULL)) {
                CallBack(name, filename, fixed_partnum(partnum),
                        disposition, outbuf,
                        content_type, charset, outlen, "binary", userdata);
        }

        free(outbuf);
}
177
178 /*
179  * Break out the components of a multipart message
180  * (This function expects to be fed HEADERS + CONTENT)
181  * Note: NULL can be supplied as content_end; in this case, the message is
182  * considered to have ended when the parser encounters a 0x00 byte.
183  */
184 void the_mime_parser(char *partnum,
185                      char *content_start, char *content_end,
186                      void (*CallBack)
187                       (char *cbname,
188                        char *cbfilename,
189                        char *cbpartnum,
190                        char *cbdisp,
191                        void *cbcontent,
192                        char *cbtype,
193                        char *cbcharset,
194                        size_t cblength,
195                        char *cbencoding,
196                        void *cbuserdata),
197                      void (*PreMultiPartCallBack)
198                       (char *cbname,
199                        char *cbfilename,
200                        char *cbpartnum,
201                        char *cbdisp,
202                        void *cbcontent,
203                        char *cbtype,
204                        char *cbcharset,
205                        size_t cblength,
206                        char *cbencoding,
207                        void *cbuserdata),
208                      void (*PostMultiPartCallBack)
209                       (char *cbname,
210                        char *cbfilename,
211                        char *cbpartnum,
212                        char *cbdisp,
213                        void *cbcontent,
214                        char *cbtype,
215                        char *cbcharset,
216                        size_t cblength,
217                        char *cbencoding,
218                        void *cbuserdata),
219                       void *userdata,
220                       int dont_decode
221 )
222 {
223
224         char *ptr;
225         char *srch = NULL;
226         char *part_start, *part_end = NULL;
227         char buf[SIZ];
228         char *header;
229         char *boundary;
230         char *startary;
231         size_t startary_len = 0;
232         char *endary;
233         char *next_boundary;
234         char *content_type;
235         char *charset;
236         size_t content_length;
237         char *encoding;
238         char *disposition;
239         char *name = NULL;
240         char *content_type_name;
241         char *content_disposition_name;
242         char *filename;
243         int is_multipart;
244         int part_seq = 0;
245         int i;
246         size_t length;
247         char nested_partnum[256];
248         int crlf_in_use = 0;
249         char *evaluate_crlf_ptr = NULL;
250         int buflen = 0;
251         int headerlen = 0;
252
253         ptr = content_start;
254         content_length = 0;
255
256         boundary = malloc(SIZ);
257         memset(boundary, 0, SIZ);
258
259         startary = malloc(SIZ);
260         memset(startary, 0, SIZ);
261
262         endary = malloc(SIZ);
263         memset(endary, 0, SIZ);
264
265         header = malloc(SIZ);
266         memset(header, 0, SIZ);
267
268         content_type = malloc(SIZ);
269         memset(content_type, 0, SIZ);
270
271         charset = malloc(SIZ);
272         memset(charset, 0, SIZ);
273
274         encoding = malloc(SIZ);
275         memset(encoding, 0, SIZ);
276
277         content_type_name = malloc(SIZ);
278         memset(content_type_name, 0, SIZ);
279
280         content_disposition_name = malloc(SIZ);
281         memset(content_disposition_name, 0, SIZ);
282
283         filename = malloc(SIZ);
284         memset(filename, 0, SIZ);
285
286         disposition = malloc(SIZ);
287         memset(disposition, 0, SIZ);
288
289         /* If the caller didn't supply an endpointer, generate one by measure */
290         if (content_end == NULL) {
291                 content_end = &content_start[strlen(content_start)];
292         }
293
294         /* Learn interesting things from the headers */
295         strcpy(header, "");
296         headerlen = 0;
297         do {
298                 ptr = memreadlinelen(ptr, buf, SIZ, &buflen);
299                 if (ptr >= content_end) {
300                         goto end_parser;
301                 }
302
303                 for (i = 0; i < buflen; ++i) {
304                         if (isspace(buf[i])) {
305                                 buf[i] = ' ';
306                         }
307                 }
308
309                 if (!isspace(buf[0])) {
310                         if (!strncasecmp(header, "Content-type:", 13)) {
311                                 strcpy(content_type, &header[13]);
312                                 striplt(content_type);
313                                 extract_key(content_type_name, content_type, "name");
314                                 extract_key(charset, content_type, "charset");
315                                 extract_key(boundary, header, "boundary");
316                                 /* Deal with weird headers */
317                                 if (strchr(content_type, ' '))
318                                         *(strchr(content_type, ' ')) = '\0';
319                                 if (strchr(content_type, ';'))
320                                         *(strchr(content_type, ';')) = '\0';
321                         }
322                         if (!strncasecmp(header, "Content-Disposition:", 20)) {
323                                 strcpy(disposition, &header[20]);
324                                 striplt(disposition);
325                                 extract_key(content_disposition_name, disposition, "name");
326                                 extract_key(filename, disposition, "filename");
327                         }
328                         if (!strncasecmp(header, "Content-length: ", 15)) {
329                                 char clbuf[10];
330                                 safestrncpy(clbuf, &header[15], sizeof clbuf);
331                                 striplt(clbuf);
332                                 content_length = (size_t) atol(clbuf);
333                         }
334                         if (!strncasecmp(header, "Content-transfer-encoding: ", 26)) {
335                                 strcpy(encoding, &header[26]);
336                                 striplt(encoding);
337                         }
338                         strcpy(header, "");
339                         headerlen = 0;
340                 }
341                 if ((headerlen + buflen + 2) < SIZ) {
342                         memcpy(&header[headerlen], buf, buflen);
343                         headerlen += buflen;
344                         header[headerlen] = '\0';
345                 }
346         } while ((!IsEmptyStr(buf)) && (*ptr != 0));
347
348         if (strchr(disposition, ';'))
349                 *(strchr(disposition, ';')) = '\0';
350         striplt(disposition);
351         if (strchr(content_type, ';'))
352                 *(strchr(content_type, ';')) = '\0';
353         striplt(content_type);
354
355         if (!IsEmptyStr(boundary)) {
356                 is_multipart = 1;
357         } else {
358                 is_multipart = 0;
359         }
360
361         /* If this is a multipart message, then recursively process it */
362         part_start = NULL;
363         if (is_multipart) {
364
365                 /* Tell the client about this message's multipartedness */
366                 if (PreMultiPartCallBack != NULL) {
367                         PreMultiPartCallBack("", "", partnum, "",
368                                 NULL, content_type, charset,
369                                 0, encoding, userdata);
370                 }
371
372                 /* Figure out where the boundaries are */
373                 snprintf(startary, SIZ, "--%s", boundary);
374                 snprintf(endary, SIZ, "--%s--", boundary);
375                 startary_len = strlen(startary);
376
377                 part_start = NULL;
378                 do {
379                         next_boundary = NULL;
380                         for (srch=ptr; srch<content_end; ++srch) {
381                                 if (!memcmp(srch, startary, startary_len)) {
382                                         next_boundary = srch;
383                                         srch = content_end;
384                                 }
385                         }
386
387                         if ( (part_start != NULL) && (next_boundary != NULL) ) {
388                                 part_end = next_boundary;
389                                 --part_end;             /* omit the trailing LF */
390                                 if (crlf_in_use) {
391                                         --part_end;     /* omit the trailing CR */
392                                 }
393
394                                 if (!IsEmptyStr(partnum)) {
395                                         snprintf(nested_partnum,
396                                                  sizeof nested_partnum,
397                                                  "%s.%d", partnum,
398                                                  ++part_seq);
399                                 }
400                                 else {
401                                         snprintf(nested_partnum,
402                                                  sizeof nested_partnum,
403                                                  "%d", ++part_seq);
404                                 }
405                                 the_mime_parser(nested_partnum,
406                                             part_start, part_end,
407                                                 CallBack,
408                                                 PreMultiPartCallBack,
409                                                 PostMultiPartCallBack,
410                                                 userdata,
411                                                 dont_decode);
412                         }
413
414                         if (next_boundary != NULL) {
415                                 /* If we pass out of scope, don't attempt to
416                                  * read past the end boundary. */
417                                 if (!strcmp(next_boundary, endary)) {
418                                         ptr = content_end;
419                                 }
420                                 else {
421                                         /* Set up for the next part. */
422                                         part_start = strstr(next_boundary, "\n");
423                                         
424                                         /* Determine whether newlines are LF or CRLF */
425                                         evaluate_crlf_ptr = part_start;
426                                         --evaluate_crlf_ptr;
427                                         if (!memcmp(evaluate_crlf_ptr, "\r\n", 2)) {
428                                                 crlf_in_use = 1;
429                                         }
430                                         else {
431                                                 crlf_in_use = 0;
432                                         }
433
434                                         /* Advance past the LF ... now we're in the next part */
435                                         ++part_start;
436                                         ptr = part_start;
437                                 }
438                         }
439                         else {
440                                 /* Invalid end of multipart.  Bail out! */
441                                 ptr = content_end;
442                         }
443                 } while ( (ptr < content_end) && (next_boundary != NULL) );
444
445                 if (PostMultiPartCallBack != NULL) {
446                         PostMultiPartCallBack("", "", partnum, "", NULL,
447                                 content_type, charset, 0, encoding, userdata);
448                 }
449                 goto end_parser;
450         }
451
452         /* If it's not a multipart message, then do something with it */
453         if (!is_multipart) {
454                 part_start = ptr;
455                 length = 0;
456                 while (ptr < content_end) {
457                         ++ptr;
458                         ++length;
459                 }
460                 part_end = content_end;
461
462                 /******
463                  * I thought there was an off-by-one error here, but there isn't.
464                  * This probably means that there's an off-by-one error somewhere
465                  * else ... or maybe only in certain messages?
466                 --part_end;
467                 --length;
468                 ******/
469                 
470                 /* Truncate if the header told us to */
471                 if ( (content_length > 0) && (length > content_length) ) {
472                         length = content_length;
473                 }
474
475                 /* Sometimes the "name" field is tacked on to Content-type,
476                  * and sometimes it's tacked on to Content-disposition.  Use
477                  * whichever one we have.
478                  */
479                 if (strlen(content_disposition_name) > strlen(content_type_name)) {
480                         name = content_disposition_name;
481                 }
482                 else {
483                         name = content_type_name;
484                 }
485         
486                 /* lprintf(CTDL_DEBUG, "mime_decode part=%s, len=%d, type=%s, charset=%s, encoding=%s\n",
487                         partnum, length, content_type, charset, encoding); */
488
489                 /* Ok, we've got a non-multipart part here, so do something with it.
490                  */
491                 mime_decode(partnum,
492                         part_start, length,
493                         content_type, charset, encoding, disposition,
494                         name, filename,
495                         CallBack, NULL, NULL,
496                         userdata, dont_decode
497                 );
498
499                 /*
500                  * Now if it's an encapsulated message/rfc822 then we have to recurse into it
501                  */
502                 if (!strcasecmp(content_type, "message/rfc822")) {
503
504                         if (PreMultiPartCallBack != NULL) {
505                                 PreMultiPartCallBack("", "", partnum, "",
506                                         NULL, content_type, charset,
507                                         0, encoding, userdata);
508                         }
509                         if (CallBack != NULL) {
510                                 if (strlen(partnum) > 0) {
511                                         snprintf(nested_partnum,
512                                                  sizeof nested_partnum,
513                                                  "%s.%d", partnum,
514                                                  ++part_seq);
515                                 }
516                                 else {
517                                         snprintf(nested_partnum,
518                                                  sizeof nested_partnum,
519                                                  "%d", ++part_seq);
520                                 }
521                                 the_mime_parser(nested_partnum,
522                                         part_start, part_end,
523                                         CallBack,
524                                         PreMultiPartCallBack,
525                                         PostMultiPartCallBack,
526                                         userdata,
527                                         dont_decode
528                                 );
529                         }
530                         if (PostMultiPartCallBack != NULL) {
531                                 PostMultiPartCallBack("", "", partnum, "", NULL,
532                                         content_type, charset, 0, encoding, userdata);
533                         }
534
535
536                 }
537
538         }
539
540 end_parser:     /* free the buffers!  end the oppression!! */
541         free(boundary);
542         free(startary);
543         free(endary);   
544         free(header);
545         free(content_type);
546         free(charset);
547         free(encoding);
548         free(content_type_name);
549         free(content_disposition_name);
550         free(filename);
551         free(disposition);
552 }
553
554
555
/*
 * Public entry point for the MIME parser.
 *
 * Hands the complete message (HEADERS + CONTENT) to the_mime_parser() with
 * an empty top-level part number; every other argument is forwarded
 * untouched.  content_end may be NULL, in which case the message is taken
 * to end at the first 0x00 byte.
 */
void mime_parser(char *content_start,
                 char *content_end,

                 void (*CallBack)
                  (char *cbname,
                   char *cbfilename,
                   char *cbpartnum,
                   char *cbdisp,
                   void *cbcontent,
                   char *cbtype,
                   char *cbcharset,
                   size_t cblength,
                   char *cbencoding,
                   void *cbuserdata),

                 void (*PreMultiPartCallBack)
                  (char *cbname,
                   char *cbfilename,
                   char *cbpartnum,
                   char *cbdisp,
                   void *cbcontent,
                   char *cbtype,
                   char *cbcharset,
                   size_t cblength,
                   char *cbencoding,
                   void *cbuserdata),

                 void (*PostMultiPartCallBack)
                  (char *cbname,
                   char *cbfilename,
                   char *cbpartnum,
                   char *cbdisp,
                   void *cbcontent,
                   char *cbtype,
                   char *cbcharset,
                   size_t cblength,
                   char *cbencoding,
                   void *cbuserdata),

                  void *userdata,
                  int dont_decode
)
{
        /* The outermost entity has no part number of its own */
        the_mime_parser("",
                        content_start,
                        content_end,
                        CallBack,
                        PreMultiPartCallBack,
                        PostMultiPartCallBack,
                        userdata,
                        dont_decode);
}