webcit_before_automake is now the trunk
[citadel.git] / webcit / mime_parser.c
1 /*
2  * $Id$
3  */
4 /**
5  * \defgroup MIME This is the MIME parser for Citadel.
6  *
7  * Copyright (c) 1998-2005 by Art Cancro
8  * This code is distributed under the terms of the GNU General Public License.
9  * \ingroup WebcitHttpServer
10  */
11 /*@{*/
12 #include "webcit.h"
13 #include "webserver.h"
14 #include "mime_parser.h"
15
/**
 * \brief Extract the value of a "key=value" parameter from a MIME header.
 *
 * The whole of \a source is first copied into \a target.  The copy is then
 * searched (case-insensitively) for the first occurrence of \a key that is
 * immediately followed by '='.  Everything after the '=' becomes the result,
 * with one optional leading double quote removed and the text cut off at the
 * next double quote, if there is one.  When the key is not found the result
 * is the empty string.
 *
 * Two quirks are worth knowing about:
 *  - the search is a plain substring match, so a key of "name" also matches
 *    the tail of "filename=";
 *  - an unquoted value is not terminated at ';', it runs to the end of the
 *    header text.
 *
 * \param target output buffer; must be able to hold strlen(source)+1 bytes
 * \param source header text to search (not modified)
 * \param key    parameter name to look for, without the trailing '='
 */
void extract_key(char *target, char *source, char *key)
{
	size_t key_len = strlen(key);
	size_t target_len;
	size_t pos;
	char *value;
	char *closing_quote;

	strcpy(target, source);
	target_len = strlen(target);

	for (pos = 0; pos < target_len; ++pos) {
		if (strncasecmp(&target[pos], key, key_len) != 0) {
			continue;
		}
		if (target[pos + key_len] != '=') {
			continue;
		}

		value = &target[pos + key_len + 1];
		if (*value == '"') {
			++value;
		}
		closing_quote = strchr(value, '"');
		if (closing_quote != NULL) {
			*closing_quote = '\0';
		}

		/*
		 * The value lives inside target itself, so the regions
		 * overlap: memmove() is required here, strcpy() is not
		 * defined for overlapping buffers.
		 */
		memmove(target, value, strlen(value) + 1);
		return;
	}

	target[0] = '\0';
}
41
42
/**
 * \brief Supply a default part number for non-multipart messages.
 *
 * Callback functions expect a part number even when the message has only a
 * single part, so a missing (NULL) or empty part number is replaced by "1".
 * \param supplied_partnum part number as determined by the parser, may be NULL
 * \return \a supplied_partnum itself when it is non-empty, otherwise the
 *         constant string "1"
 */
char *fixed_partnum(char *supplied_partnum) {
	int have_partnum = (supplied_partnum != NULL)
			&& (strlen(supplied_partnum) != 0);

	return have_partnum ? supplied_partnum : "1";
}
54
55
56
57 /**
58  * \brief Convert "quoted-printable" to binary.  Returns number of bytes decoded.
59  * \param decoded the buffer with the decoded output
60  * \param encoded the encoded string to decode
61  * \param sourcelen length of the decoded buffer
62  */
63 int CtdlDecodeQuotedPrintable(char *decoded, char *encoded, int sourcelen) {
64         char buf[SIZ];
65         int buf_length = 0;
66         int soft_line_break = 0;
67         unsigned int ch;
68         int decoded_length = 0;
69         int i;
70
71         decoded[0] = 0;
72         decoded_length = 0;
73         buf[0] = 0;
74         buf_length = 0;
75
76         for (i = 0; i < sourcelen; ++i) {
77
78                 buf[buf_length++] = encoded[i];
79
80                 if ( (encoded[i] == '\n')
81                    || (encoded[i] == 0)
82                    || (i == (sourcelen-1)) ) {
83                         buf[buf_length++] = 0;
84
85                         /*** begin -- process one line ***/
86
87                         if (buf[strlen(buf)-1] == '\n') {
88                                 buf[strlen(buf)-1] = 0;
89                         }
90                         if (buf[strlen(buf)-1] == '\r') {
91                                 buf[strlen(buf)-1] = 0;
92                         }
93                         while (isspace(buf[strlen(buf)-1])) {
94                                 buf[strlen(buf)-1] = 0;
95                         }
96                         soft_line_break = 0;
97
98                         while (strlen(buf) > 0) {
99                                 if (!strcmp(buf, "=")) {
100                                         soft_line_break = 1;
101                                         strcpy(buf, "");
102                                 } else if ((strlen(buf)>=3) && (buf[0]=='=')) {
103                                         sscanf(&buf[1], "%02x", &ch);
104                                         decoded[decoded_length++] = ch;
105                                         strcpy(buf, &buf[3]);
106                                 } else {
107                                         decoded[decoded_length++] = buf[0];
108                                         strcpy(buf, &buf[1]);
109                                 }
110                         }
111                         if (soft_line_break == 0) {
112                                 decoded[decoded_length++] = '\r';
113                                 decoded[decoded_length++] = '\n';
114                         }
115                         buf_length = 0;
116                         /*** end -- process one line ***/
117                 }
118         }
119
120         decoded[decoded_length++] = 0;
121         return(decoded_length);
122 }
123
/**
 * \brief Decode one message part and hand it to the callback.
 *
 * Given a message or message-part body and a length, handle any necessary
 * decoding and pass the result up the stack:
 *  - "7bit", "8bit" and "binary" are not real encodings; \a encoding is
 *    overwritten with "" for them;
 *  - when the part is not encoded, or \a dont_decode is set, the raw body is
 *    passed to \a CallBack unchanged together with \a length and \a encoding;
 *  - "base64" and "quoted-printable" bodies are decoded into a temporary
 *    buffer which is passed to \a CallBack with the encoding reported as
 *    "binary"; the buffer is freed as soon as the callback returns;
 *  - any other encoding is silently ignored (no callback).
 *
 * \param partnum      part number; NULL or "" is reported to the callback as "1"
 * \param part_start   first byte of the (possibly encoded) body
 * \param length       number of body bytes at \a part_start
 * \param content_type MIME type, passed through to the callback
 * \param charset      character set, passed through to the callback
 * \param encoding     content transfer encoding; must be a writable,
 *                     non-NULL string (it may be reset to "")
 * \param disposition  content disposition, passed through to the callback
 * \param name         part name, passed through to the callback
 * \param filename     file name, passed through to the callback
 * \param CallBack     receives the part; may be NULL, in which case nothing
 *                     is delivered
 * \param PreMultiPartCallBack  not used by this function
 * \param PostMultiPartCallBack not used by this function
 * \param userdata     opaque pointer handed to the callback
 * \param dont_decode  nonzero to deliver the body without decoding it
 */
void mime_decode(char *partnum,
		 char *part_start, size_t length,
		 char *content_type, char *charset, char *encoding,
		 char *disposition,
		 char *name, char *filename,
		 void (*CallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),
		 void (*PreMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),
		 void (*PostMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),
		  void *userdata,
		  int dont_decode
)
{
	const size_t slack = 2048;
	const size_t size_limit = (size_t) -1;
	char *decoded;
	size_t decoded_size;
	size_t bytes_decoded = 0;
	int is_base64;
	int is_quoted_printable;

	/* Some encodings aren't really encodings */
	if ( (!strcasecmp(encoding, "7bit"))
	   || (!strcasecmp(encoding, "8bit"))
	   || (!strcasecmp(encoding, "binary")) ) {
		strcpy(encoding, "");
	}

	/* If this part is not encoded, send as-is */
	if ( (strlen(encoding) == 0) || (dont_decode)) {
		if (CallBack != NULL) {
			CallBack(name, filename, fixed_partnum(partnum),
				disposition, part_start,
				content_type, charset, length, encoding, userdata);
		}
		return;
	}

	is_base64 = !strcasecmp(encoding, "base64");
	is_quoted_printable = !strcasecmp(encoding, "quoted-printable");
	if (!is_base64 && !is_quoted_printable) {
		return;
	}

	/*
	 * Allocate a buffer for the decoded data.  base64 output is never
	 * larger than its input, so input size plus some slack is enough.
	 * The quoted-printable decoder rewrites every line ending as CRLF,
	 * so LF-only input can nearly double in size; give it twice the
	 * input size.  Refuse sizes that would wrap around size_t.
	 */
	if (is_quoted_printable) {
		if (length > ((size_limit - slack) / 2)) {
			return;
		}
		decoded_size = (length * 2) + slack;
	}
	else {
		if (length > (size_limit - slack)) {
			return;
		}
		decoded_size = length + slack;
	}

	decoded = malloc(decoded_size);
	if (decoded == NULL) {
		return;
	}

	if (is_base64) {
		bytes_decoded = CtdlDecodeBase64(decoded, part_start, length);
	}
	else {
		bytes_decoded = CtdlDecodeQuotedPrintable(decoded,
							part_start, length);
	}

	if ((bytes_decoded > 0) && (CallBack != NULL)) {
		CallBack(name, filename, fixed_partnum(partnum),
			disposition, decoded,
			content_type, charset, bytes_decoded, "binary", userdata);
	}

	free(decoded);
}
238
239 /**
240  * \brief Break out the components of a multipart message
241  * (This function expects to be fed HEADERS + CONTENT)
242  * Note: NULL can be supplied as content_end; in this case, the message is
243  * considered to have ended when the parser encounters a 0x00 byte.
244  * \param partnum todo
245  * \param content_start todo ?????
246  * \param content_end todo
247  * \param CallBack todo
248  * \param PreMultiPartCallBack
249  * \param PostMultiPartCallBack
250  * \param userdata todo
251  * \param dont_decode todo
252  */
253 void the_mime_parser(char *partnum,
254                      char *content_start, char *content_end,
255                      void (*CallBack)
256                       (char *cbname,
257                        char *cbfilename,
258                        char *cbpartnum,
259                        char *cbdisp,
260                        void *cbcontent,
261                        char *cbtype,
262                        char *cbcharset,
263                        size_t cblength,
264                        char *cbencoding,
265                        void *cbuserdata),
266                      void (*PreMultiPartCallBack)
267                       (char *cbname,
268                        char *cbfilename,
269                        char *cbpartnum,
270                        char *cbdisp,
271                        void *cbcontent,
272                        char *cbtype,
273                        char *cbcharset,
274                        size_t cblength,
275                        char *cbencoding,
276                        void *cbuserdata),
277                      void (*PostMultiPartCallBack)
278                       (char *cbname,
279                        char *cbfilename,
280                        char *cbpartnum,
281                        char *cbdisp,
282                        void *cbcontent,
283                        char *cbtype,
284                        char *cbcharset,
285                        size_t cblength,
286                        char *cbencoding,
287                        void *cbuserdata),
288                       void *userdata,
289                       int dont_decode
290 )
291 {
292
293         char *ptr;
294         char *srch = NULL;
295         char *part_start, *part_end = NULL;
296         char buf[SIZ];
297         char *header;
298         char *boundary;
299         char *startary;
300         size_t startary_len = 0;
301         char *endary;
302         char *next_boundary;
303         char *content_type;
304         char *charset;
305         size_t content_length;
306         char *encoding;
307         char *disposition;
308         char *name = NULL;
309         char *content_type_name;
310         char *content_disposition_name;
311         char *filename;
312         int is_multipart;
313         int part_seq = 0;
314         int i;
315         size_t length;
316         char nested_partnum[SIZ];
317
318         ptr = content_start;
319         content_length = 0;
320
321         boundary = malloc(SIZ);
322         memset(boundary, 0, SIZ);
323
324         startary = malloc(SIZ);
325         memset(startary, 0, SIZ);
326
327         endary = malloc(SIZ);
328         memset(endary, 0, SIZ);
329
330         header = malloc(SIZ);
331         memset(header, 0, SIZ);
332
333         content_type = malloc(SIZ);
334         memset(content_type, 0, SIZ);
335
336         charset = malloc(SIZ);
337         memset(charset, 0, SIZ);
338
339         encoding = malloc(SIZ);
340         memset(encoding, 0, SIZ);
341
342         content_type_name = malloc(SIZ);
343         memset(content_type_name, 0, SIZ);
344
345         content_disposition_name = malloc(SIZ);
346         memset(content_disposition_name, 0, SIZ);
347
348         filename = malloc(SIZ);
349         memset(filename, 0, SIZ);
350
351         disposition = malloc(SIZ);
352         memset(disposition, 0, SIZ);
353
354         /** If the caller didn't supply an endpointer, generate one by measure */
355         if (content_end == NULL) {
356                 content_end = &content_start[strlen(content_start)];
357         }
358
359         /** Learn interesting things from the headers */
360         strcpy(header, "");
361         do {
362                 ptr = memreadline(ptr, buf, SIZ);
363                 if (ptr >= content_end) {
364                         goto end_parser;
365                 }
366
367                 for (i = 0; i < strlen(buf); ++i) {
368                         if (isspace(buf[i])) {
369                                 buf[i] = ' ';
370                         }
371                 }
372
373                 if (!isspace(buf[0])) {
374                         if (!strncasecmp(header, "Content-type: ", 14)) {
375                                 strcpy(content_type, &header[14]);
376                                 extract_key(content_type_name, content_type, "name");
377                                 extract_key(charset, content_type, "charset");
378                                 /** Deal with weird headers */
379                                 if (strchr(content_type, ' '))
380                                         *(strchr(content_type, ' ')) = '\0';
381                                 if (strchr(content_type, ';'))
382                                         *(strchr(content_type, ';')) = '\0';
383                         }
384                         if (!strncasecmp(header, "Content-Disposition: ", 21)) {
385                                 strcpy(disposition, &header[21]);
386                                 extract_key(content_disposition_name, disposition, "name");
387                                 extract_key(filename, disposition, "filename");
388                         }
389                         if (!strncasecmp(header, "Content-length: ", 16)) {
390                                 content_length = (size_t) atol(&header[16]);
391                         }
392                         if (!strncasecmp(header,
393                                       "Content-transfer-encoding: ", 27))
394                                 strcpy(encoding, &header[27]);
395                         if (strlen(boundary) == 0)
396                                 extract_key(boundary, header, "boundary");
397                         strcpy(header, "");
398                 }
399                 if ((strlen(header) + strlen(buf) + 2) < SIZ)
400                         strcat(header, buf);
401         } while ((strlen(buf) > 0) && (*ptr != 0));
402
403         if (strchr(disposition, ';'))
404                 *(strchr(disposition, ';')) = '\0';
405         striplt(disposition);
406         if (strchr(content_type, ';'))
407                 *(strchr(content_type, ';')) = '\0';
408         striplt(content_type);
409
410         if (strlen(boundary) > 0) {
411                 is_multipart = 1;
412         } else {
413                 is_multipart = 0;
414         }
415
416         /** If this is a multipart message, then recursively process it */
417         part_start = NULL;
418         if (is_multipart) {
419
420                 /** Tell the client about this message's multipartedness */
421                 if (PreMultiPartCallBack != NULL) {
422                         PreMultiPartCallBack("", "", partnum, "",
423                                 NULL, content_type, charset,
424                                 0, encoding, userdata);
425                 }
426
427                 /** Figure out where the boundaries are */
428                 snprintf(startary, SIZ, "--%s", boundary);
429                 snprintf(endary, SIZ, "--%s--", boundary);
430                 startary_len = strlen(startary);
431
432                 part_start = NULL;
433                 do {
434                         next_boundary = NULL;
435                         for (srch=ptr; srch<content_end; ++srch) {
436                                 if (!memcmp(srch, startary, startary_len)) {
437                                         next_boundary = srch;
438                                         srch = content_end;
439                                 }
440                         }
441
442                         if ( (part_start != NULL) && (next_boundary != NULL) ) {
443                                 part_end = next_boundary;
444                                 --part_end;
445
446                                 if (strlen(partnum) > 0) {
447                                         snprintf(nested_partnum,
448                                                  sizeof nested_partnum,
449                                                  "%s.%d", partnum,
450                                                  ++part_seq);
451                                 }
452                                 else {
453                                         snprintf(nested_partnum,
454                                                  sizeof nested_partnum,
455                                                  "%d", ++part_seq);
456                                 }
457                                 the_mime_parser(nested_partnum,
458                                             part_start, part_end,
459                                                 CallBack,
460                                                 PreMultiPartCallBack,
461                                                 PostMultiPartCallBack,
462                                                 userdata,
463                                                 dont_decode);
464                         }
465
466                         if (next_boundary != NULL) {
467                                 /**
468                                  * If we pass out of scope, don't attempt to
469                                  * read past the end boundary. */
470                                 if (!strcmp(next_boundary, endary)) {
471                                         ptr = content_end;
472                                 }
473                                 else {
474                                         /** Set up for the next part. */
475                                         part_start = strstr(next_boundary, "\n");
476                                         ++part_start;
477                                         ptr = part_start;
478                                 }
479                         }
480                         else {
481                                 /** Invalid end of multipart.  Bail out! */
482                                 ptr = content_end;
483                         }
484                 } while ( (ptr < content_end) && (next_boundary != NULL) );
485
486                 if (PostMultiPartCallBack != NULL) {
487                         PostMultiPartCallBack("", "", partnum, "", NULL,
488                                 content_type, charset, 0, encoding, userdata);
489                 }
490                 goto end_parser;
491         }
492
493         /** If it's not a multipart message, then do something with it */
494         if (!is_multipart) {
495                 part_start = ptr;
496                 length = 0;
497                 while (ptr < content_end) {
498                         ++ptr;
499                         ++length;
500                 }
501                 part_end = content_end;
502                 /** fix an off-by-one error */
503                 --part_end;
504                 --length;
505                 
506                 /** Truncate if the header told us to */
507                 if ( (content_length > 0) && (length > content_length) ) {
508                         length = content_length;
509                 }
510
511                 /**
512                  * Sometimes the "name" field is tacked on to Content-type,
513                  * and sometimes it's tacked on to Content-disposition.  Use
514                  * whichever one we have.
515                  */
516                 if (strlen(content_disposition_name) > strlen(content_type_name)) {
517                         name = content_disposition_name;
518                 }
519                 else {
520                         name = content_type_name;
521                 }
522         
523                 /*
524                 lprintf(9, "mime_decode part=%s, len=%d, type=%s, charset=%s, encoding=%s\n",
525                         partnum, length, content_type, charset, encoding);
526                 */
527
528                 /**
529                  * Ok, we've got a non-multipart part here, so do something with it.
530                  */
531                 mime_decode(partnum,
532                         part_start, length,
533                         content_type, charset, encoding, disposition,
534                         name, filename,
535                         CallBack, NULL, NULL,
536                         userdata, dont_decode
537                 );
538
539                 /**
540                  * Now if it's an encapsulated message/rfc822 then we have to recurse into it
541                  */
542                 if (!strcasecmp(content_type, "message/rfc822")) {
543
544                         if (PreMultiPartCallBack != NULL) {
545                                 PreMultiPartCallBack("", "", partnum, "",
546                                         NULL, content_type, charset,
547                                         0, encoding, userdata);
548                         }
549                         if (CallBack != NULL) {
550                                 if (strlen(partnum) > 0) {
551                                         snprintf(nested_partnum,
552                                                  sizeof nested_partnum,
553                                                  "%s.%d", partnum,
554                                                  ++part_seq);
555                                 }
556                                 else {
557                                         snprintf(nested_partnum,
558                                                  sizeof nested_partnum,
559                                                  "%d", ++part_seq);
560                                 }
561                                 the_mime_parser(nested_partnum,
562                                         part_start, part_end,
563                                         CallBack,
564                                         PreMultiPartCallBack,
565                                         PostMultiPartCallBack,
566                                         userdata,
567                                         dont_decode
568                                 );
569                         }
570                         if (PostMultiPartCallBack != NULL) {
571                                 PostMultiPartCallBack("", "", partnum, "", NULL,
572                                         content_type, charset, 0, encoding, userdata);
573                         }
574
575
576                 }
577
578         }
579
580 end_parser:     /** free the buffers!  end the oppression!! */
581         free(boundary);
582         free(startary);
583         free(endary);   
584         free(header);
585         free(content_type);
586         free(charset);
587         free(encoding);
588         free(content_type_name);
589         free(content_disposition_name);
590         free(filename);
591         free(disposition);
592 }
593
594
595
/**
 * \brief Entry point for the MIME parser.
 *
 * Starts the_mime_parser() on a complete message (HEADERS + CONTENT) with an
 * empty top-level part number; every other argument is forwarded unchanged.
 * Note: NULL can be supplied as content_end; in this case, the message is
 * considered to have ended when the parser encounters a 0x00 byte.
 * \param content_start first byte of the message headers
 * \param content_end   end of the message, or NULL
 * \param CallBack      forwarded to the_mime_parser()
 * \param PreMultiPartCallBack  forwarded to the_mime_parser()
 * \param PostMultiPartCallBack forwarded to the_mime_parser()
 * \param userdata      opaque pointer forwarded to the_mime_parser()
 * \param dont_decode   decode flag forwarded to the_mime_parser()
 */
void mime_parser(char *content_start,
		char *content_end,

		 void (*CallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),

		 void (*PreMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),

		 void (*PostMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),

		  void *userdata,
		  int dont_decode
)
{
	/* The outermost entity carries no part number of its own */
	char *toplevel_partnum = "";

	the_mime_parser(toplevel_partnum,
			content_start, content_end,
			CallBack,
			PreMultiPartCallBack,
			PostMultiPartCallBack,
			userdata,
			dont_decode);
}
659
660
661
662 /*@}*/