* add xdgmime from freedesktop.org
[citadel.git] / libcitadel / lib / mime_parser.c
1 /*
2  * $Id$
3  *
4  * This is the MIME parser for Citadel.
5  *
6  * Copyright (c) 1998-2007 by the citadel.org development team.
7  * This code is distributed under the GNU General Public License v3.
8  *
9  */
10
11 #include <stdlib.h>
12 #include <unistd.h>
13 #include <stdio.h>
14 #include <signal.h>
15 #include <sys/types.h>
16 #include <ctype.h>
17 #include <string.h>
18 #include <sys/stat.h>
19 #include <errno.h>
20
21 #include "xdgmime/xdgmime.h"
22 #include "libcitadel.h"
23
/*
 * Pull the value of a "key=value" parameter out of a header line.
 *
 * target  receives the extracted value (NUL-terminated).  The caller must
 *         supply a buffer large enough for everything that follows "key="
 *         in source; no length is passed in, so none can be enforced here.
 * source  the header text to search.
 * key     the parameter name, without the trailing '='.
 *
 * The search for "key=" is done with bmstrcasestr().  If nothing is found,
 * target is set to the empty string.  Otherwise the value ends at the first
 * semicolon outside double quotes, or at the closing double quote.  The
 * opening double quote, if any, is removed.
 *
 * NOTE(review): the match is not anchored to a parameter boundary, so asking
 * for "name" will also match inside "filename=".  Left unchanged here because
 * callers may depend on it -- confirm before tightening.
 */
void extract_key(char *target, char *source, char *key)
{
	char looking_for[256];
	char *ptr;
	int in_quotes = 0;

	snprintf(looking_for, sizeof looking_for, "%s=", key);

	ptr = bmstrcasestr(source, looking_for);
	if (ptr == NULL) {
		target[0] = '\0';
		return;
	}
	strcpy(target, ptr + strlen(looking_for));

	ptr = target;
	while (*ptr != '\0') {

		if (*ptr == '\"') {
			if (in_quotes) {
				/* Closing quote: the value ends here. */
				*ptr = '\0';
				return;
			}

			/*
			 * Opening quote: drop it by shifting the remainder
			 * (including the terminator) one byte to the left.
			 * The regions overlap, so memmove() is required.
			 * Do not advance: the byte now at ptr has not been
			 * examined yet and may itself be the closing quote.
			 */
			in_quotes = 1;
			memmove(ptr, ptr + 1, strlen(ptr + 1) + 1);
			continue;
		}

		/* A semicolon ends the value, unless we're inside double quotes */
		if (!in_quotes && (*ptr == ';')) {
			*ptr = '\0';
			return;
		}

		++ptr;
	}
}
58
59
/*
 * Callbacks always want a part number, but non-multipart messages have none.
 * Substitute "1" when the supplied part number is NULL or empty; otherwise
 * hand back the caller's own string untouched.
 */
char *fixed_partnum(char *supplied_partnum) {
	int missing = (supplied_partnum == NULL) || (supplied_partnum[0] == '\0');

	return missing ? "1" : supplied_partnum;
}
69
70
71
/*
 * Value of a single hexadecimal digit (either case), or -1 if the
 * character is not a hexadecimal digit.
 */
static int qp_hex_value(char c)
{
	if ((c >= '0') && (c <= '9')) return c - '0';
	if ((c >= 'A') && (c <= 'F')) return c - 'A' + 10;
	if ((c >= 'a') && (c <= 'f')) return c - 'a' + 10;
	return -1;
}

/*
 * Convert "quoted-printable" to binary, according to RFC2045 section 6.7.
 *
 * decoded    output buffer; must hold at least sourcelen + 1 bytes (the
 *            output is never longer than the input, plus a terminating NUL).
 * encoded    input data; only the first sourcelen bytes are ever examined.
 * sourcelen  number of input bytes.
 *
 * Returns the number of bytes decoded (not counting the terminating NUL
 * that is always appended).
 *
 * Soft line breaks ("=\r\n" and "=\n") are removed.  "=XX" with two hex
 * digits becomes the corresponding byte.  An '=' that is followed by
 * anything else, or that is cut off by the end of the input, is copied
 * through literally rather than being guessed at.
 */
int CtdlDecodeQuotedPrintable(char *decoded, char *encoded, int sourcelen) {
	int decoded_length = 0;
	int pos = 0;

	while (pos < sourcelen)
	{
		int remaining = sourcelen - pos;

		/* Ordinary byte: copy it through. */
		if (encoded[pos] != '=')
		{
			decoded[decoded_length++] = encoded[pos];
			pos += 1;
			continue;
		}

		/* Soft line break, CRLF flavour */
		if ((remaining >= 3) && (encoded[pos + 1] == '\r') && (encoded[pos + 2] == '\n'))
		{
			pos += 3;
			continue;
		}

		/* Soft line break, bare LF flavour */
		if ((remaining >= 2) && (encoded[pos + 1] == '\n'))
		{
			pos += 2;
			continue;
		}

		/* Hex escape: needs two hex digits inside the input */
		if (remaining >= 3)
		{
			int hi = qp_hex_value(encoded[pos + 1]);
			int lo = qp_hex_value(encoded[pos + 2]);

			if ((hi >= 0) && (lo >= 0))
			{
				decoded[decoded_length++] = (char) ((hi << 4) | lo);
				pos += 3;
				continue;
			}
		}

		/* Malformed or truncated escape: keep the '=' as data. */
		decoded[decoded_length++] = '=';
		pos += 1;
	}
	decoded[decoded_length] = 0;
	return(decoded_length);
}
107
108
/*
 * Deliver one message part to CallBack, decoding its transfer encoding
 * first when that is both necessary and permitted.
 *
 * - "7bit", "8bit" and "binary" are treated as "no encoding": the caller's
 *   encoding buffer is overwritten with the empty string.
 * - With no encoding, or when dont_decode is nonzero, the part is passed to
 *   CallBack exactly as supplied (part_start / length / encoding).
 * - "base64" and "quoted-printable" are decoded into a temporary buffer; if
 *   decoding produced at least one byte, CallBack receives that buffer with
 *   the encoding reported as "binary".  The buffer is freed on return, so
 *   the callback must not keep the pointer.
 * - Any other encoding, or a failed allocation, results in a silent return.
 *
 * PreMultiPartCallBack and PostMultiPartCallBack are accepted but not used
 * by this function.
 */
void mime_decode(char *partnum,
		 char *part_start, size_t length,
		 char *content_type, char *charset, char *encoding,
		 char *disposition,
		 char *name, char *filename,
		 void (*CallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),
		 void (*PreMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),
		 void (*PostMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),
		  void *userdata,
		  int dont_decode
)
{
	char *outbuf;
	size_t outlen = 0;
	int is_base64;
	int is_qp;

	/* Identity encodings are normalised to "" in the caller's buffer */
	if (   (strcasecmp(encoding, "7bit") == 0)
	    || (strcasecmp(encoding, "8bit") == 0)
	    || (strcasecmp(encoding, "binary") == 0) ) {
		encoding[0] = '\0';
	}

	/* Nothing to decode (or decoding not wanted): pass the part through */
	if ( (encoding[0] == '\0') || dont_decode ) {
		if (CallBack != NULL) {
			CallBack(name, filename, fixed_partnum(partnum),
				 disposition, part_start,
				 content_type, charset, length, encoding, userdata);
		}
		return;
	}

	/* Only two real encodings are understood; anything else is dropped */
	is_base64 = (strcasecmp(encoding, "base64") == 0);
	is_qp = (!is_base64) && (strcasecmp(encoding, "quoted-printable") == 0);
	if ( (!is_base64) && (!is_qp) ) {
		return;
	}

	/*
	 * The output buffer is the input size plus some slack, on the
	 * assumption that decoding never grows the data significantly.
	 */
	outbuf = malloc(length + 32768);
	if (outbuf == NULL) {
		return;
	}

	if (is_base64) {
		outlen = CtdlDecodeBase64(outbuf, part_start, length);
	}
	else {
		outlen = CtdlDecodeQuotedPrintable(outbuf, part_start, length);
	}

	if ( (CallBack != NULL) && (outlen > 0) ) {
		CallBack(name, filename, fixed_partnum(partnum),
			 disposition, outbuf,
			 content_type, charset, outlen, "binary", userdata);
	}

	free(outbuf);
}
209
210 /*
211  * Break out the components of a multipart message
212  * (This function expects to be fed HEADERS + CONTENT)
213  * Note: NULL can be supplied as content_end; in this case, the message is
214  * considered to have ended when the parser encounters a 0x00 byte.
215  */
216 void the_mime_parser(char *partnum,
217                      char *content_start, char *content_end,
218                      void (*CallBack)
219                       (char *cbname,
220                        char *cbfilename,
221                        char *cbpartnum,
222                        char *cbdisp,
223                        void *cbcontent,
224                        char *cbtype,
225                        char *cbcharset,
226                        size_t cblength,
227                        char *cbencoding,
228                        void *cbuserdata),
229                      void (*PreMultiPartCallBack)
230                       (char *cbname,
231                        char *cbfilename,
232                        char *cbpartnum,
233                        char *cbdisp,
234                        void *cbcontent,
235                        char *cbtype,
236                        char *cbcharset,
237                        size_t cblength,
238                        char *cbencoding,
239                        void *cbuserdata),
240                      void (*PostMultiPartCallBack)
241                       (char *cbname,
242                        char *cbfilename,
243                        char *cbpartnum,
244                        char *cbdisp,
245                        void *cbcontent,
246                        char *cbtype,
247                        char *cbcharset,
248                        size_t cblength,
249                        char *cbencoding,
250                        void *cbuserdata),
251                       void *userdata,
252                       int dont_decode
253 )
254 {
255
256         char *ptr;
257         char *srch = NULL;
258         char *part_start, *part_end = NULL;
259         char buf[SIZ];
260         char *header;
261         char *boundary;
262         char *startary;
263         size_t startary_len = 0;
264         char *endary;
265         char *next_boundary;
266         char *content_type;
267         char *charset;
268         size_t content_length;
269         char *encoding;
270         char *disposition;
271         char *name = NULL;
272         char *content_type_name;
273         char *content_disposition_name;
274         char *filename;
275         int is_multipart;
276         int part_seq = 0;
277         int i;
278         size_t length;
279         char nested_partnum[256];
280         int crlf_in_use = 0;
281         char *evaluate_crlf_ptr = NULL;
282         int buflen = 0;
283         int headerlen = 0;
284
285         ptr = content_start;
286         content_length = 0;
287
288         boundary = malloc(SIZ);
289         memset(boundary, 0, SIZ);
290
291         startary = malloc(SIZ);
292         memset(startary, 0, SIZ);
293
294         endary = malloc(SIZ);
295         memset(endary, 0, SIZ);
296
297         header = malloc(SIZ);
298         memset(header, 0, SIZ);
299
300         content_type = malloc(SIZ);
301         memset(content_type, 0, SIZ);
302
303         charset = malloc(SIZ);
304         memset(charset, 0, SIZ);
305
306         encoding = malloc(SIZ);
307         memset(encoding, 0, SIZ);
308
309         content_type_name = malloc(SIZ);
310         memset(content_type_name, 0, SIZ);
311
312         content_disposition_name = malloc(SIZ);
313         memset(content_disposition_name, 0, SIZ);
314
315         filename = malloc(SIZ);
316         memset(filename, 0, SIZ);
317
318         disposition = malloc(SIZ);
319         memset(disposition, 0, SIZ);
320
321         /* If the caller didn't supply an endpointer, generate one by measure */
322         if (content_end == NULL) {
323                 content_end = &content_start[strlen(content_start)];
324         }
325
326         /* Learn interesting things from the headers */
327         strcpy(header, "");
328         headerlen = 0;
329         do {
330                 ptr = memreadlinelen(ptr, buf, SIZ, &buflen);
331                 if (ptr >= content_end) {
332                         goto end_parser;
333                 }
334
335                 for (i = 0; i < buflen; ++i) {
336                         if (isspace(buf[i])) {
337                                 buf[i] = ' ';
338                         }
339                 }
340
341                 if (!isspace(buf[0])) {
342                         if (!strncasecmp(header, "Content-type:", 13)) {
343                                 strcpy(content_type, &header[13]);
344                                 striplt(content_type);
345                                 extract_key(content_type_name, content_type, "name");
346                                 extract_key(charset, content_type, "charset");
347                                 extract_key(boundary, header, "boundary");
348                                 /* Deal with weird headers */
349                                 if (strchr(content_type, ' '))
350                                         *(strchr(content_type, ' ')) = '\0';
351                                 if (strchr(content_type, ';'))
352                                         *(strchr(content_type, ';')) = '\0';
353                         }
354                         if (!strncasecmp(header, "Content-Disposition:", 20)) {
355                                 strcpy(disposition, &header[20]);
356                                 striplt(disposition);
357                                 extract_key(content_disposition_name, disposition, "name");
358                                 extract_key(filename, disposition, "filename");
359                         }
360                         if (!strncasecmp(header, "Content-length: ", 15)) {
361                                 char clbuf[10];
362                                 safestrncpy(clbuf, &header[15], sizeof clbuf);
363                                 striplt(clbuf);
364                                 content_length = (size_t) atol(clbuf);
365                         }
366                         if (!strncasecmp(header, "Content-transfer-encoding: ", 26)) {
367                                 strcpy(encoding, &header[26]);
368                                 striplt(encoding);
369                         }
370                         strcpy(header, "");
371                         headerlen = 0;
372                 }
373                 if ((headerlen + buflen + 2) < SIZ) {
374                         memcpy(&header[headerlen], buf, buflen);
375                         headerlen += buflen;
376                         header[headerlen] = '\0';
377                 }
378         } while ((!IsEmptyStr(buf)) && (*ptr != 0));
379
380         if (strchr(disposition, ';'))
381                 *(strchr(disposition, ';')) = '\0';
382         striplt(disposition);
383         if (strchr(content_type, ';'))
384                 *(strchr(content_type, ';')) = '\0';
385         striplt(content_type);
386
387         if (!IsEmptyStr(boundary)) {
388                 is_multipart = 1;
389         } else {
390                 is_multipart = 0;
391         }
392
393         /* If this is a multipart message, then recursively process it */
394         part_start = NULL;
395         if (is_multipart) {
396
397                 /* Tell the client about this message's multipartedness */
398                 if (PreMultiPartCallBack != NULL) {
399                         PreMultiPartCallBack("", "", partnum, "",
400                                 NULL, content_type, charset,
401                                 0, encoding, userdata);
402                 }
403
404                 /* Figure out where the boundaries are */
405                 snprintf(startary, SIZ, "--%s", boundary);
406                 snprintf(endary, SIZ, "--%s--", boundary);
407                 startary_len = strlen(startary);
408
409                 part_start = NULL;
410                 do {
411                         next_boundary = NULL;
412                         for (srch=ptr; srch<content_end; ++srch) {
413                                 if (!memcmp(srch, startary, startary_len)) {
414                                         next_boundary = srch;
415                                         srch = content_end;
416                                 }
417                         }
418
419                         if ( (part_start != NULL) && (next_boundary != NULL) ) {
420                                 part_end = next_boundary;
421                                 --part_end;             /* omit the trailing LF */
422                                 if (crlf_in_use) {
423                                         --part_end;     /* omit the trailing CR */
424                                 }
425
426                                 if (!IsEmptyStr(partnum)) {
427                                         snprintf(nested_partnum,
428                                                  sizeof nested_partnum,
429                                                  "%s.%d", partnum,
430                                                  ++part_seq);
431                                 }
432                                 else {
433                                         snprintf(nested_partnum,
434                                                  sizeof nested_partnum,
435                                                  "%d", ++part_seq);
436                                 }
437                                 the_mime_parser(nested_partnum,
438                                             part_start, part_end,
439                                                 CallBack,
440                                                 PreMultiPartCallBack,
441                                                 PostMultiPartCallBack,
442                                                 userdata,
443                                                 dont_decode);
444                         }
445
446                         if (next_boundary != NULL) {
447                                 /* If we pass out of scope, don't attempt to
448                                  * read past the end boundary. */
449                                 if (!strcmp(next_boundary, endary)) {
450                                         ptr = content_end;
451                                 }
452                                 else {
453                                         /* Set up for the next part. */
454                                         part_start = strstr(next_boundary, "\n");
455                                         
456                                         /* Determine whether newlines are LF or CRLF */
457                                         evaluate_crlf_ptr = part_start;
458                                         --evaluate_crlf_ptr;
459                                         if (!memcmp(evaluate_crlf_ptr, "\r\n", 2)) {
460                                                 crlf_in_use = 1;
461                                         }
462                                         else {
463                                                 crlf_in_use = 0;
464                                         }
465
466                                         /* Advance past the LF ... now we're in the next part */
467                                         ++part_start;
468                                         ptr = part_start;
469                                 }
470                         }
471                         else {
472                                 /* Invalid end of multipart.  Bail out! */
473                                 ptr = content_end;
474                         }
475                 } while ( (ptr < content_end) && (next_boundary != NULL) );
476
477                 if (PostMultiPartCallBack != NULL) {
478                         PostMultiPartCallBack("", "", partnum, "", NULL,
479                                 content_type, charset, 0, encoding, userdata);
480                 }
481                 goto end_parser;
482         }
483
484         /* If it's not a multipart message, then do something with it */
485         if (!is_multipart) {
486                 part_start = ptr;
487                 length = 0;
488                 while (ptr < content_end) {
489                         ++ptr;
490                         ++length;
491                 }
492                 part_end = content_end;
493
494                 /******
495                  * I thought there was an off-by-one error here, but there isn't.
496                  * This probably means that there's an off-by-one error somewhere
497                  * else ... or maybe only in certain messages?
498                 --part_end;
499                 --length;
500                 ******/
501                 
502                 /* Truncate if the header told us to */
503                 if ( (content_length > 0) && (length > content_length) ) {
504                         length = content_length;
505                 }
506
507                 /* Sometimes the "name" field is tacked on to Content-type,
508                  * and sometimes it's tacked on to Content-disposition.  Use
509                  * whichever one we have.
510                  */
511                 if (strlen(content_disposition_name) > strlen(content_type_name)) {
512                         name = content_disposition_name;
513                 }
514                 else {
515                         name = content_type_name;
516                 }
517         
518                 /* Ok, we've got a non-multipart part here, so do something with it.
519                  */
520                 mime_decode(partnum,
521                         part_start, length,
522                         content_type, charset, encoding, disposition,
523                         name, filename,
524                         CallBack, NULL, NULL,
525                         userdata, dont_decode
526                 );
527
528                 /*
529                  * Now if it's an encapsulated message/rfc822 then we have to recurse into it
530                  */
531                 if (!strcasecmp(content_type, "message/rfc822")) {
532
533                         if (PreMultiPartCallBack != NULL) {
534                                 PreMultiPartCallBack("", "", partnum, "",
535                                         NULL, content_type, charset,
536                                         0, encoding, userdata);
537                         }
538                         if (CallBack != NULL) {
539                                 if (strlen(partnum) > 0) {
540                                         snprintf(nested_partnum,
541                                                  sizeof nested_partnum,
542                                                  "%s.%d", partnum,
543                                                  ++part_seq);
544                                 }
545                                 else {
546                                         snprintf(nested_partnum,
547                                                  sizeof nested_partnum,
548                                                  "%d", ++part_seq);
549                                 }
550                                 the_mime_parser(nested_partnum,
551                                         part_start, part_end,
552                                         CallBack,
553                                         PreMultiPartCallBack,
554                                         PostMultiPartCallBack,
555                                         userdata,
556                                         dont_decode
557                                 );
558                         }
559                         if (PostMultiPartCallBack != NULL) {
560                                 PostMultiPartCallBack("", "", partnum, "", NULL,
561                                         content_type, charset, 0, encoding, userdata);
562                         }
563
564
565                 }
566
567         }
568
569 end_parser:     /* free the buffers!  end the oppression!! */
570         free(boundary);
571         free(startary);
572         free(endary);   
573         free(header);
574         free(content_type);
575         free(charset);
576         free(encoding);
577         free(content_type_name);
578         free(content_disposition_name);
579         free(filename);
580         free(disposition);
581 }
582
583
584
/*
 * Public entry point for the MIME parser.
 * (This function expects to be fed HEADERS + CONTENT)
 *
 * Starts the_mime_parser() at the top level, i.e. with an empty part
 * number; every other argument is forwarded unchanged.
 *
 * Note: NULL can be supplied as content_end; in this case, the message is
 * considered to have ended when the parser encounters a 0x00 byte.
 */
void mime_parser(char *content_start,
		 char *content_end,

		 void (*CallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),

		 void (*PreMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),

		 void (*PostMultiPartCallBack)
		  (char *cbname,
		   char *cbfilename,
		   char *cbpartnum,
		   char *cbdisp,
		   void *cbcontent,
		   char *cbtype,
		   char *cbcharset,
		   size_t cblength,
		   char *cbencoding,
		   void *cbuserdata),

		  void *userdata,
		  int dont_decode
)
{
	/* The outermost entity has no part number of its own */
	the_mime_parser("",
			content_start,
			content_end,
			CallBack,
			PreMultiPartCallBack,
			PostMultiPartCallBack,
			userdata,
			dont_decode);
}
641
642
643
644
645
646
/*
 * One content-sniffing rule: if PatternLen bytes at PatternOffset in the
 * data equal Pattern, the data is taken to be of type MimeString.
 */
typedef struct _MimeGuess {
	const char *Pattern;		/* signature bytes to look for */
	size_t PatternLen;		/* number of bytes in Pattern */
	long PatternOffset;		/* where in the data Pattern is expected */
	const char *MimeString;		/* MIME type for a match */
} MimeGuess;

/*
 * Built-in signature table.  The final entry, with PatternLen 0 and empty
 * strings, marks the end of the table.
 */
MimeGuess MyMimes [] = {
	{ .Pattern = "GIF",      .PatternLen = 3, .PatternOffset = 0, .MimeString = "image/gif"  },
	{ .Pattern = "\xff\xd8", .PatternLen = 2, .PatternOffset = 0, .MimeString = "image/jpeg" },
	{ .Pattern = "\x89PNG",  .PatternLen = 4, .PatternOffset = 0, .MimeString = "image/png"  },
	/* last... */
	{ .Pattern = "",         .PatternLen = 0, .PatternOffset = 0, .MimeString = ""           }
};
680
681
/*
 * Guess a MIME type from the first dlen bytes of data.
 *
 * The work is delegated entirely to xdgmime; the built-in MyMimes
 * signature table is not consulted here.
 */
const char *GuessMimeType(char *data, size_t dlen)
{
	const char *guessed = xdg_mime_get_mime_type_for_data(data, dlen);

	return guessed;
}
703
704
/*
 * Guess a MIME type from a file name.
 *
 * The work is delegated entirely to xdgmime.  The len argument is kept for
 * interface compatibility but is not used.
 */
const char* GuessMimeByFilename(const char *what, size_t len)
{
	const char *guessed = xdg_mime_get_mime_type_from_file_name(what);

	return guessed;
}