utf8ify_rfc822_string() is in libcitadel now
[citadel.git] / libcitadel / lib / html_to_ascii.c
1 /*
2  * Functions which handle translation between HTML and plain text
3  * Copyright (c) 2000-2018 by the citadel.org team
4  *
5 // This program is open source software.  Use, duplication, or disclosure
6 // is subject to the terms of the GNU General Public License, version 3.
7  */
8
9 #include <stdlib.h>
10 #include <unistd.h>
11 #include <stdio.h>
12 #include <signal.h>
13 #include <sys/types.h>
14 #include <ctype.h>
15 #include <string.h>
16 #include <sys/stat.h>
17 #include <errno.h>
18 #include <limits.h>
19
20 #if TIME_WITH_SYS_TIME
21 # include <sys/time.h>
22 # include <time.h>
23 #else
24 # if HAVE_SYS_TIME_H
25 #  include <sys/time.h>
26 # else
27 #  include <time.h>
28 # endif
29 #endif
30
31 #include "libcitadel.h"
32  
33
/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message
 * msglen        = number of bytes of inputmsg to convert; zero (or less)
 *                 means "up to the terminating NUL".  Conversion always
 *                 stops at a NUL byte, whichever comes first.
 * screenwidth   = desired output screenwidth; lines are wrapped at
 *                 screenwidth-2 columns (values below 3 wrap at 1 column)
 *
 * Returns a NUL-terminated string allocated with malloc()/realloc() which
 * the caller must free().  The result has leading and trailing whitespace
 * removed and, unless it is empty, ends in a single newline.  Returns NULL
 * if inputmsg is NULL or if memory cannot be allocated.
 *
 * Tabs, carriage returns and newlines in the input are turned into spaces.
 * Block-level tags become line breaks, text inside STYLE elements is
 * discarded, BLOCKQUOTE nesting is shown with '>' marks at the start of
 * each following line, and a fixed set of character references is replaced
 * by plain-text equivalents.  All other tags are dropped.
 */

/* Fixed limits used by the converter */
enum {
	H2A_TAG_MAX = 1024,	/* bytes kept of a tag's text, terminator included */
	H2A_QUOTE_MAX = 126	/* deepest BLOCKQUOTE level that still adds a '>' */
};

/* Growable byte buffer whose contents are always NUL-terminated */
struct h2a_buf {
	char *data;		/* contents, NULL until the first allocation */
	size_t len;		/* bytes in use, terminator not counted */
	size_t cap;		/* bytes allocated */
	int failed;		/* nonzero once an allocation has failed */
};

/* Conversion state shared by the helper functions */
struct h2a_state {
	struct h2a_buf out;	/* finished output */
	struct h2a_buf line;	/* text not yet committed to an output line */
	size_t raw_from;	/* bytes of 'line' before this offset came out
				 * of a character reference and must not be
				 * decoded a second time */
	int width;		/* wrap column */
	int rule_len;		/* number of dashes drawn for HR */
	int blockquote;		/* BLOCKQUOTE nesting level */
	int styletag;		/* STYLE nesting level */
};

/* One named (or fixed numeric) character reference and its replacement */
struct h2a_entity {
	const char *name;	/* reference text, '&' and ';' included */
	const char *text;	/* plain text replacement, never longer than name */
	int exact;		/* nonzero: compare case-sensitively */
};

/* Checked in order; the case-sensitive capital forms precede their
 * case-insensitive lowercase counterparts.
 */
static const struct h2a_entity h2a_entities[] = {
	{ "&nbsp;",   " ",    0 },
	{ "&ensp;",   " ",    0 },
	{ "&emsp;",   " ",    0 },
	{ "&thinsp;", " ",    0 },
	{ "&lt;",     "<",    0 },
	{ "&gt;",     ">",    0 },
	{ "&amp;",    "&",    0 },
	{ "&quot;",   "\"",   0 },
	{ "&lsquo;",  "`",    0 },
	{ "&rsquo;",  "'",    0 },
	{ "&copy;",   "(c)",  0 },
	{ "&bull;",   " * ",  0 },
	{ "&hellip;", "...",  0 },
	{ "&trade;",  "(tm)", 0 },
	{ "&reg;",    "(r)",  0 },
	{ "&frac14;", "1/4",  0 },
	{ "&frac12;", "1/2",  0 },
	{ "&frac34;", "3/4",  0 },
	{ "&ndash;",  "--",   0 },
	{ "&mdash;",  "---",  0 },
	{ "&Ccedil;", "C",    1 },
	{ "&ccedil;", "c",    0 },
	{ "&Egrave;", "E",    1 },
	{ "&egrave;", "e",    0 },
	{ "&Ecirc;",  "E",    1 },
	{ "&ecirc;",  "e",    0 },
	{ "&Eacute;", "E",    1 },
	{ "&eacute;", "e",    0 },
	{ "&Agrave;", "A",    1 },
	{ "&agrave;", "a",    0 },
	{ "&ldquo;",  "\"",   0 },
	{ "&rdquo;",  "\"",   0 },
	{ "&acute;",  "'",    0 },
	{ "&#8217;",  "'",    0 },
	{ "&#8211;",  "-",    0 },
	{ "&#8212;",  "---",  0 },
	{ "&#8216;",  "`",    0 },
	{ "&#8220;",  "\"",   0 },
	{ "&#8221;",  "\"",   0 },
	{ "&#8230;",  "...",  0 }
};

/* Make room for 'extra' more bytes plus the terminator.  Returns 0 on
 * success; on failure marks the buffer as failed and returns -1, leaving
 * the existing contents untouched.
 */
static int h2a_reserve(struct h2a_buf *b, size_t extra) {
	size_t need;
	size_t newcap;
	char *tmp;

	if (b->failed) return -1;
	if (extra >= (size_t)-1 - b->len) {	/* len + extra + 1 would wrap */
		b->failed = 1;
		return -1;
	}
	need = b->len + extra + 1;
	if (need <= b->cap) return 0;

	newcap = (b->cap > 0) ? b->cap : 256;
	while (newcap < need) {
		if (newcap > (size_t)-1 / 2) {
			newcap = need;
			break;
		}
		newcap *= 2;
	}

	/* Keep the old pointer until realloc() is known to have succeeded */
	tmp = realloc(b->data, newcap);
	if (tmp == NULL) {
		b->failed = 1;
		return -1;
	}
	b->data = tmp;
	b->cap = newcap;
	b->data[b->len] = '\0';
	return 0;
}

/* Append n bytes from src (which must not point into b) */
static void h2a_append(struct h2a_buf *b, const char *src, size_t n) {
	if (h2a_reserve(b, n) != 0) return;
	memcpy(b->data + b->len, src, n);
	b->len += n;
	b->data[b->len] = '\0';
}

/* Append n copies of the byte ch */
static void h2a_fill(struct h2a_buf *b, int ch, size_t n) {
	if (h2a_reserve(b, n) != 0) return;
	memset(b->data + b->len, ch, n);
	b->len += n;
	b->data[b->len] = '\0';
}

/* Append a line break followed by one '>' per BLOCKQUOTE level */
static void h2a_newline(struct h2a_buf *b, int depth) {
	h2a_append(b, "\n", 1);
	if (depth > H2A_QUOTE_MAX) depth = H2A_QUOTE_MAX;
	if (depth > 0) h2a_fill(b, '>', (size_t)depth);
}

/* Remove the first n bytes (n <= length) of the pending line */
static void h2a_drop_front(struct h2a_state *st, size_t n) {
	struct h2a_buf *line = &st->line;

	/* The regions overlap, so this has to be memmove() */
	memmove(line->data, line->data + n, line->len - n + 1);
	line->len -= n;
	st->raw_from = (st->raw_from > n) ? (st->raw_from - n) : 0;
}

/* Compare n bytes; ASCII case is ignored unless 'exact' is nonzero */
static int h2a_same(const char *a, const char *b, size_t n, int exact) {
	size_t i;

	for (i = 0; i < n; ++i) {
		unsigned char x = (unsigned char)a[i];
		unsigned char y = (unsigned char)b[i];

		if (x == y) continue;
		if (exact || (tolower(x) != tolower(y))) return 0;
	}
	return 1;
}

/* Nonzero if the tag name equals 'name', ignoring case */
static int h2a_tag_is(const char *tag, const char *name) {
	size_t len = strlen(tag);

	return (strlen(name) == len) && h2a_same(tag, name, len, 0);
}

/* Nonzero if the tag name equals any entry of a NULL-terminated list */
static int h2a_tag_in(const char *tag, const char *const *names) {
	for (; *names != NULL; ++names) {
		if (h2a_tag_is(tag, *names)) return 1;
	}
	return 0;
}

/* Called right after a ';' was added to the pending line: if the line now
 * ends in a known character reference, replace it by its plain text.
 */
static void h2a_decode_tail(struct h2a_state *st) {
	struct h2a_buf *line = &st->line;
	size_t len = line->len;
	size_t e;
	size_t digits;
	size_t start;
	size_t d;
	int value;

	if (line->failed || (len == 0)) return;

	for (e = 0; e < (sizeof h2a_entities / sizeof h2a_entities[0]); ++e) {
		const struct h2a_entity *ent = &h2a_entities[e];
		size_t k = strlen(ent->name);

		if (len < k) continue;
		start = len - k;
		/* Text that is itself the result of a reference ("&amp;"
		 * gives '&') must not start another reference.
		 */
		if (start < st->raw_from) continue;
		if (!h2a_same(line->data + start, ent->name, k, ent->exact)) continue;

		line->len = start;
		line->data[start] = '\0';
		h2a_append(line, ent->text, strlen(ent->text));
		st->raw_from = line->len;
		return;
	}

	/* Decimal references with two to four digits: "&#NN;" to "&#NNNN;" */
	digits = 0;
	while ((digits < 5) && (len >= digits + 2)
	       && isdigit((unsigned char)line->data[len - 2 - digits])) {
		++digits;
	}
	if ((digits < 2) || (digits > 4) || (len < digits + 3)) return;
	start = len - digits - 3;
	if (start < st->raw_from) return;
	if ((line->data[start] != '&') || (line->data[start + 1] != '#')) return;

	value = 0;
	for (d = 0; d < digits; ++d) {
		value = (value * 10) + (line->data[start + 2 + d] - '0');
	}

	line->len = start;
	line->data[start] = '\0';
	if ((value >= 1) && (value <= 255)) {
		char byte = (char)(unsigned char)value;
		h2a_append(line, &byte, 1);
	}
	else if (value > 255) {
		/* No single-byte form exists; show a placeholder */
		h2a_append(line, "?", 1);
	}
	/* A value of zero produces no output at all */
	st->raw_from = line->len;
}

/* Act on a complete tag name (attributes already removed) */
static void h2a_apply_tag(struct h2a_state *st, const char *tag) {
	static const char *const blank_line_tags[] = {
		"P", "/DIV", "/UL", "H1", "H2", "H3", "H4", NULL
	};
	static const char *const line_break_tags[] = {
		"/H1", "/H2", "/H3", "/H4", "BR", "TR", "/TABLE", NULL
	};
	struct h2a_buf *line = &st->line;

	if (h2a_tag_is(tag, "STYLE")) {
		if (st->styletag < INT_MAX) ++st->styletag;
		return;
	}
	if (h2a_tag_is(tag, "/STYLE")) {
		/* A stray close tag must not push the level below zero,
		 * or every following character would be discarded.
		 */
		if (st->styletag > 0) --st->styletag;
		return;
	}

	/* Nothing inside a STYLE element produces output */
	if (st->styletag > 0) return;

	if (h2a_tag_in(tag, blank_line_tags)) {
		h2a_newline(line, st->blockquote);
		h2a_newline(line, st->blockquote);
	}
	else if (h2a_tag_in(tag, line_break_tags)) {
		h2a_newline(line, st->blockquote);
	}
	else if (h2a_tag_is(tag, "LI")) {
		h2a_newline(line, st->blockquote);
		h2a_append(line, " * ", 3);
	}
	else if (h2a_tag_is(tag, "HR")) {
		h2a_newline(line, st->blockquote);
		h2a_append(line, " ", 1);
		h2a_fill(line, '-', (size_t)st->rule_len);
		h2a_newline(line, st->blockquote);
	}
	else if (h2a_tag_is(tag, "BLOCKQUOTE")) {
		if (st->blockquote < INT_MAX) ++st->blockquote;
		h2a_newline(line, st->blockquote);
	}
	else if (h2a_tag_is(tag, "/BLOCKQUOTE")) {
		h2a_append(line, "\n", 1);
		if (st->blockquote > 0) --st->blockquote;
		h2a_newline(line, st->blockquote);
	}
}

/* Move finished lines from the pending line to the output: first every
 * line ended by a hard break that lies inside the wrap column, then at
 * most one soft break if the remainder is still too long.
 */
static void h2a_wrap_step(struct h2a_state *st) {
	struct h2a_buf *line = &st->line;
	size_t width = (size_t)st->width;
	size_t cut;
	size_t skip;
	size_t i;

	if (line->failed || st->out.failed) return;

	/* Output any lines terminated with hard line breaks */
	for (;;) {
		const char *brk = memchr(line->data, '\n', line->len);
		size_t n;

		if (brk == NULL) break;
		n = (size_t)(brk - line->data);
		if (n >= width) break;
		h2a_append(&st->out, line->data, n + 1);
		h2a_drop_front(st, n + 1);
	}

	/* Add a soft line break */
	if (line->len > width) {
		cut = width;		/* no space found: split mid-word */
		skip = 0;
		for (i = width; i > 0; --i) {
			if (line->data[i - 1] == ' ') {
				cut = i - 1;	/* break at the last space ... */
				skip = 1;	/* ... and swallow that space */
				break;
			}
		}
		h2a_append(&st->out, line->data, cut);
		h2a_newline(&st->out, st->blockquote);
		h2a_drop_front(st, cut + skip);
	}
}

char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth) {
	struct h2a_state st = { { NULL, 0, 0, 0 }, { NULL, 0, 0, 0 }, 0, 0, 0, 0, 0 };
	char tag[H2A_TAG_MAX];
	size_t taglen = 0;
	size_t pos;
	size_t lead;
	int nest = 0;			/* Bracket nesting level */
	char *space;

	if (inputmsg == NULL) return NULL;

	tag[0] = '\0';
	st.width = (screenwidth > 2) ? (screenwidth - 2) : 1;
	st.rule_len = (screenwidth > 2) ? (screenwidth - 2) : 0;

	/* Allocate both buffers now so that their data pointers are valid
	 * for the rest of the function.
	 */
	if ((h2a_reserve(&st.out, 0) != 0) || (h2a_reserve(&st.line, 0) != 0)) {
		free(st.out.data);
		free(st.line.data);
		return NULL;
	}

	/* Stop at msglen bytes (when given) or at the NUL, whichever is first */
	for (pos = 0; (msglen <= 0) || (pos < (size_t)msglen); ++pos) {
		char ch = inputmsg[pos];

		if (ch == '\0') break;

		/* Fold in all the spacing */
		if ((ch == '\n') || (ch == '\r') || (ch == '\t')) ch = ' ';

		if (ch == '<') {
			if (nest < INT_MAX) ++nest;
			taglen = 0;
			tag[0] = '\0';
		}
		else if (ch == '>') {		/* We have a tag. */
			if (nest > 0) --nest;

			/* Unqualify the tag (truncate at first space) */
			space = strchr(tag, ' ');
			if (space != NULL) *space = '\0';

			h2a_apply_tag(&st, tag);

			/* Forget the tag so that a later '>' outside of any
			 * tag cannot trigger it a second time.
			 */
			taglen = 0;
			tag[0] = '\0';
		}
		else if (nest > 0) {
			/* Collect the tag text; anything beyond the buffer
			 * size is dropped.
			 */
			if (taglen < (sizeof tag - 1)) {
				tag[taglen++] = ch;
				tag[taglen] = '\0';
			}
		}
		else if (st.styletag == 0) {
			h2a_append(&st.line, &ch, 1);
			/* Convert &; tags to the forbidden characters */
			if (ch == ';') h2a_decode_tail(&st);
		}

		h2a_wrap_step(&st);
		if (st.out.failed || st.line.failed) break;
	}

	/* One more pass for the text added by the last character, then
	 * whatever is left goes out unchanged.
	 */
	h2a_wrap_step(&st);
	if (!st.line.failed) {
		h2a_append(&st.out, st.line.data, st.line.len);
	}
	if (st.out.failed || st.line.failed) {
		free(st.out.data);
		free(st.line.data);
		return NULL;
	}
	free(st.line.data);

	/* Strip leading/trailing whitespace */
	lead = 0;
	while ((lead < st.out.len) && isspace((unsigned char)st.out.data[lead])) {
		++lead;
	}
	if (lead > 0) {
		memmove(st.out.data, st.out.data + lead, st.out.len - lead + 1);
		st.out.len -= lead;
	}
	while ((st.out.len > 0) && isspace((unsigned char)st.out.data[st.out.len - 1])) {
		st.out.data[--st.out.len] = '\0';
	}

	/* Non-empty output always ends in exactly one newline */
	if (st.out.len > 0) {
		h2a_append(&st.out, "\n", 1);
		if (st.out.failed) {
			free(st.out.data);
			return NULL;
		}
	}

	return st.out.data;
}