striplt() is now string_trim()
[citadel.git] / libcitadel / lib / html_to_ascii.c
1 // Functions which handle translation between HTML and plain text
2 // Copyright (c) 2000-2022 by the citadel.org team
3 //
4 // This program is open source software.  Use, duplication, or disclosure
5 // is subject to the terms of the GNU General Public License, version 3.
6
7 #include <stdlib.h>
8 #include <unistd.h>
9 #include <stdio.h>
10 #include <signal.h>
11 #include <sys/types.h>
12 #include <ctype.h>
13 #include <string.h>
14 #include <sys/stat.h>
15 #include <errno.h>
16 #include <limits.h>
17
18 #if TIME_WITH_SYS_TIME
19 # include <sys/time.h>
20 # include <time.h>
21 #else
22 # if HAVE_SYS_TIME_H
23 #  include <sys/time.h>
24 # else
25 #  include <time.h>
26 # endif
27 #endif
28
29 #include "libcitadel.h"
30  
31
// Capacity of the work buffer holding output text that has not been flushed yet.
// Uses the project-wide SIZ when a header has defined it.
#ifdef SIZ
enum { H2A_LINE_CAP = SIZ };
#else
enum { H2A_LINE_CAP = 4096 };
#endif

enum {
	H2A_TAG_CAP = 1024,		// longest tag text that is remembered
	H2A_NL_CAP = 128,		// newline string: "\n" + optional ANSI codes + one '>' per quote level
	H2A_LINE_SLACK = 256		// part of the work buffer kept free for tag-generated text
};

// Everything the converter carries from one input character to the next.
struct h2a_state {
	char line[H2A_LINE_CAP];	// pending output, not yet copied to "out"
	char tag[H2A_TAG_CAP];		// text of the tag currently being collected
	char nl[H2A_NL_CAP];		// what a line break looks like at the current quote level
	size_t width;			// wrap column (screenwidth - 2, clamped to the work buffer)
	size_t hr_len;			// number of dashes drawn for <HR>
	size_t style_start;		// length of "line" when the outermost <STYLE> opened
	int nest;			// bracket nesting level
	int blockquote;			// BLOCKQUOTE nesting level
	int styletag;			// STYLE nesting level
	int ansi;			// nonzero: emit ANSI escape codes
	char *out;			// result buffer (heap)
	size_t out_cap;			// bytes allocated for "out"
	size_t out_len;			// bytes used in "out", not counting the terminator
};

// A tag whose only effect is to append fixed text to the pending line.
struct h2a_tagrule {
	const char *name;		// tag name, matched case-insensitively
	int newlines;			// how many line breaks to append
	const char *text;		// literal text appended after the line breaks, or NULL
	const char *ansi_text;		// escape sequence appended only in ANSI mode, or NULL
};

static const struct h2a_tagrule h2a_tagrules[] = {
	{ "P",       2, NULL,  NULL },
	{ "/DIV",    2, NULL,  NULL },
	{ "LI",      1, " * ", NULL },
	{ "/UL",     2, NULL,  NULL },
	{ "H1",      2, NULL,  NULL },
	{ "H2",      2, NULL,  NULL },
	{ "H3",      2, NULL,  NULL },
	{ "H4",      2, NULL,  NULL },
	{ "/H1",     1, NULL,  NULL },
	{ "/H2",     1, NULL,  NULL },
	{ "/H3",     1, NULL,  NULL },
	{ "/H4",     1, NULL,  NULL },
	{ "B",       0, NULL,  "\033[1m" },
	{ "STRONG",  0, NULL,  "\033[1m" },
	{ "/B",      0, NULL,  "\033[22m" },
	{ "/STRONG", 0, NULL,  "\033[22m" },
	{ "I",       0, NULL,  "\033[3m" },
	{ "EM",      0, NULL,  "\033[3m" },
	{ "/I",      0, NULL,  "\033[23m" },
	{ "/EM",     0, NULL,  "\033[23m" },
	{ "U",       0, NULL,  "\033[4m" },
	{ "/U",      0, NULL,  "\033[24m" },
	{ "BR",      1, NULL,  NULL },
	{ "TR",      1, NULL,  NULL },
	{ "/TABLE",  1, NULL,  NULL }
};

// A character entity and its plain-text stand-in.
// The stand-in is never longer than the entity, so substitution can be done in place.
struct h2a_entity {
	const char *name;		// entity as written, including '&' and ';'
	int exact_case;			// nonzero: match case-sensitively
	const char *text;		// replacement text
};

// Order matters: the first match wins, so each case-sensitive upper-case form
// is listed before its case-insensitive lower-case sibling.
static const struct h2a_entity h2a_entities[] = {
	{ "&nbsp;",   0, " " },
	{ "&ensp;",   0, " " },
	{ "&emsp;",   0, " " },
	{ "&thinsp;", 0, " " },
	{ "&lt;",     0, "<" },
	{ "&gt;",     0, ">" },
	{ "&amp;",    0, "&" },
	{ "&quot;",   0, "\"" },
	{ "&lsquo;",  0, "`" },
	{ "&rsquo;",  0, "'" },
	{ "&copy;",   0, "(c)" },
	{ "&bull;",   0, " * " },
	{ "&hellip;", 0, "..." },
	{ "&trade;",  0, "(tm)" },
	{ "&reg;",    0, "(r)" },
	{ "&frac14;", 0, "1/4" },
	{ "&frac12;", 0, "1/2" },
	{ "&frac34;", 0, "3/4" },
	{ "&ndash;",  0, "--" },
	{ "&mdash;",  0, "---" },
	{ "&Ccedil;", 1, "C" },
	{ "&ccedil;", 0, "c" },
	{ "&Egrave;", 1, "E" },
	{ "&egrave;", 0, "e" },
	{ "&Ecirc;",  1, "E" },
	{ "&ecirc;",  0, "e" },
	{ "&Eacute;", 1, "E" },
	{ "&eacute;", 0, "e" },
	{ "&Agrave;", 1, "A" },
	{ "&agrave;", 0, "a" },
	{ "&ldquo;",  0, "\"" },
	{ "&rdquo;",  0, "\"" },
	{ "&acute;",  0, "'" },
	{ "&#8217;",  0, "'" },
	{ "&#8211;",  0, "-" }
};

// Append "s" to the string in "buf" (capacity "cap"); whatever does not fit is dropped.
static void h2a_append(char *buf, size_t cap, const char *s) {
	size_t used = strlen(buf);
	size_t add = strlen(s);

	if (used + 1 >= cap) {
		return;
	}
	if (add > cap - 1 - used) {
		add = cap - 1 - used;
	}
	memcpy(buf + used, s, add);
	buf[used + add] = '\0';
}

// Append one character to the string in "buf" (capacity "cap"); dropped if the buffer is full.
static void h2a_append_char(char *buf, size_t cap, char ch) {
	size_t used = strlen(buf);

	if (used + 1 >= cap) {
		return;
	}
	buf[used] = ch;
	buf[used + 1] = '\0';
}

// Copy "n" bytes to the end of the result buffer, growing it first when needed.
// Returns 0 on success, -1 if memory ran out (the existing buffer is left intact).
static int h2a_emit(struct h2a_state *st, const char *src, size_t n) {
	if (n >= st->out_cap - st->out_len) {
		size_t want = st->out_len + n + 1 + H2A_LINE_CAP;
		char *grown = realloc(st->out, want);

		if (grown == NULL) {
			return -1;
		}
		st->out = grown;
		st->out_cap = want;
	}
	memcpy(st->out + st->out_len, src, n);
	st->out_len += n;
	st->out[st->out_len] = '\0';
	return 0;
}

// Remove the first "n" bytes of the pending line ("n" must not exceed its length).
static void h2a_consume(struct h2a_state *st, size_t n) {
	// Source and destination overlap, so this has to be memmove(), not strcpy().
	memmove(st->line, st->line + n, strlen(st->line + n) + 1);
	st->style_start = (st->style_start > n) ? (st->style_start - n) : 0;
}

// Rebuild the newline string for the current BLOCKQUOTE depth.
static void h2a_rebuild_newline(struct h2a_state *st, int with_ansi) {
	int level;

	st->nl[0] = '\0';
	h2a_append(st->nl, sizeof st->nl, "\n");
	if (with_ansi) {
		h2a_append(st->nl, sizeof st->nl, "\033[2m\033[3m");
	}
	for (level = 0; (level < st->blockquote) && (level < H2A_NL_CAP); ++level) {
		h2a_append(st->nl, sizeof st->nl, ">");
	}
}

// Act on the tag that has just been closed by '>'.
static void h2a_apply_tag(struct h2a_state *st) {
	char *cut;
	size_t len, k;
	int j;

	// Unqualify the tag (truncate at first space)
	cut = strchr(st->tag, ' ');
	if (cut != NULL) {
		*cut = '\0';
	}

	// Accept the self-closing spelling, e.g. <br/>
	len = strlen(st->tag);
	if ((len > 1) && (st->tag[len - 1] == '/')) {
		st->tag[len - 1] = '\0';
	}

	for (k = 0; k < (sizeof h2a_tagrules / sizeof h2a_tagrules[0]); ++k) {
		const struct h2a_tagrule *rule = &h2a_tagrules[k];

		if (strcasecmp(st->tag, rule->name) != 0) {
			continue;
		}
		for (j = 0; j < rule->newlines; ++j) {
			h2a_append(st->line, sizeof st->line, st->nl);
		}
		if (rule->text != NULL) {
			h2a_append(st->line, sizeof st->line, rule->text);
		}
		if ((rule->ansi_text != NULL) && (st->ansi)) {
			h2a_append(st->line, sizeof st->line, rule->ansi_text);
		}
		return;
	}

	if (!strcasecmp(st->tag, "HR")) {
		size_t used, drawn;

		h2a_append(st->line, sizeof st->line, st->nl);
		h2a_append(st->line, sizeof st->line, " ");
		used = strlen(st->line);
		for (drawn = 0; (drawn < st->hr_len) && (used + 1 < sizeof st->line); ++drawn) {
			st->line[used++] = '-';
		}
		st->line[used] = '\0';
		h2a_append(st->line, sizeof st->line, st->nl);
	}

	else if (!strcasecmp(st->tag, "BLOCKQUOTE")) {
		++st->blockquote;
		// Only the outermost quote level switches the terminal to dim + italic
		h2a_rebuild_newline(st, (st->blockquote == 1) && (st->ansi));
		h2a_append(st->line, sizeof st->line, st->nl);
	}

	else if (!strcasecmp(st->tag, "/BLOCKQUOTE")) {
		h2a_append(st->line, sizeof st->line, "\n");
		// An unmatched closing tag must not push the level below zero
		if (st->blockquote > 0) {
			--st->blockquote;
			if ((st->blockquote == 0) && (st->ansi)) {
				h2a_append(st->line, sizeof st->line, "\033[22m\033[23m");
			}
		}
		h2a_rebuild_newline(st, 0);
		h2a_append(st->line, sizeof st->line, st->nl);
	}

	else if (!strcasecmp(st->tag, "STYLE")) {
		++st->styletag;
		if (st->styletag == 1) {
			st->style_start = strlen(st->line);
		}
	}

	else if (!strcasecmp(st->tag, "/STYLE")) {
		// An unmatched closing tag must not leave text suppressed forever
		if (st->styletag > 0) {
			--st->styletag;
			if ((st->styletag == 0) && (st->style_start < strlen(st->line))) {
				// Throw away whatever was appended while inside the style sheet
				st->line[st->style_start] = '\0';
			}
		}
	}
}

// Overwrite the "oldlen"-byte entity at "at" with "text" and close the gap.
// "text" must not be longer than "oldlen".
static void h2a_substitute(char *at, size_t oldlen, const char *text) {
	size_t newlen = strlen(text);

	memcpy(at, text, newlen);
	memmove(at + newlen, at + oldlen, strlen(at + oldlen) + 1);
}

// Convert &; tags in the pending line to the characters they stand for.
static void h2a_decode_entities(char *line) {
	size_t i, k;

	for (i = 0; line[i] != '\0'; ++i) {
		int replaced = 0;

		if (line[i] != '&') {
			continue;
		}

		// Character entity references
		for (k = 0; k < (sizeof h2a_entities / sizeof h2a_entities[0]); ++k) {
			const struct h2a_entity *ent = &h2a_entities[k];
			size_t namelen = strlen(ent->name);
			int differs;

			if (ent->exact_case) {
				differs = strncmp(&line[i], ent->name, namelen);
			}
			else {
				differs = strncasecmp(&line[i], ent->name, namelen);
			}
			if (differs == 0) {
				h2a_substitute(&line[i], namelen, ent->text);
				replaced = 1;
				break;
			}
		}
		if (replaced) {
			continue;
		}

		// Decimal equivalents with two, three or four digits: &#NN; &#NNN; &#NNNN;
		if (line[i + 1] == '#') {
			size_t digits = 0;
			unsigned int value = 0;

			while ((digits < 4) && isdigit((unsigned char) line[i + 2 + digits])) {
				value = (value * 10) + (unsigned int) (line[i + 2 + digits] - '0');
				++digits;
			}
			if ((digits >= 2) && (line[i + 2 + digits] == ';')) {
				char rep[2];

				// Values that do not fit in one byte, and NUL (which would
				// cut the line short), are shown as a question mark.
				if ((value >= 1) && (value <= 255)) {
					rep[0] = (char) (unsigned char) value;
				}
				else {
					rep[0] = '?';
				}
				rep[1] = '\0';
				h2a_substitute(&line[i], digits + 3, rep);
			}
		}
	}
}

// Move finished lines from the pending line to the result buffer until what
// remains fits in the wrap width.  Returns 0 on success, -1 if memory ran out.
static int h2a_drain(struct h2a_state *st) {
	for (;;) {
		size_t len = strlen(st->line);
		size_t span = (len < st->width) ? len : st->width;
		const char *eol = memchr(st->line, '\n', span);
		size_t i, cut, skip;

		// Output any lines terminated with hard line breaks
		if (eol != NULL) {
			size_t n = (size_t) (eol - st->line) + 1;

			if (h2a_emit(st, st->line, n) != 0) {
				return -1;
			}
			h2a_consume(st, n);
			continue;
		}

		if (len <= st->width) {
			return 0;
		}

		// Add a soft line break: at the last space inside the wrap width
		// (the space itself is dropped), or mid-word when there is none.
		cut = st->width;
		skip = st->width;
		for (i = 0; i < st->width; ++i) {
			if (st->line[i] == ' ') {
				cut = i;
				skip = i + 1;
			}
		}
		if (h2a_emit(st, st->line, cut) != 0) {
			return -1;
		}
		if (h2a_emit(st, st->nl, strlen(st->nl)) != 0) {
			return -1;
		}
		h2a_consume(st, skip);
	}
}

// Convert HTML to plain text.
//
// inputmsg     = pointer to raw HTML message
// msglen       = stop reading after this many bytes (0 = read up to the terminating NUL)
// screenwidth  = desired output screenwidth; text is wrapped at screenwidth-2 columns.
//                Widths below 3, or too large for the work buffer, wrap at the
//                work buffer limit instead.
// ansi         = if nonzero, assume output is to a terminal that supports ANSI escape codes
//
// Returns a newly allocated NUL-terminated string which the caller must free(),
// or NULL if inputmsg is NULL or memory could not be allocated.  Leading and
// trailing whitespace is removed and non-empty output ends with a single newline.
char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int ansi) {
	struct h2a_state st;
	const int max_width = H2A_LINE_CAP - H2A_LINE_SLACK;
	size_t inlen, pos, lead;

	if (inputmsg == NULL) {
		return NULL;
	}

	// Work out how many bytes to read without looking past msglen
	if (msglen == 0) {
		inlen = strlen(inputmsg);
	}
	else if (msglen < 0) {
		inlen = 0;
	}
	else {
		const char *nul = memchr(inputmsg, '\0', (size_t) msglen);
		inlen = (nul != NULL) ? (size_t) (nul - inputmsg) : (size_t) msglen;
	}

	st.line[0] = '\0';
	st.tag[0] = '\0';
	st.nest = 0;
	st.blockquote = 0;
	st.styletag = 0;
	st.style_start = 0;
	st.ansi = ansi;
	h2a_rebuild_newline(&st, 0);

	// The pending line lives in a fixed buffer, so the wrap width has to fit inside it
	if ((screenwidth < 3) || ((screenwidth - 2) > max_width)) {
		st.width = (size_t) max_width;
	}
	else {
		st.width = (size_t) (screenwidth - 2);
	}
	st.hr_len = (screenwidth < 3) ? 0 : st.width;

	st.out_cap = inlen + H2A_LINE_CAP;
	st.out_len = 0;
	st.out = malloc(st.out_cap);
	if (st.out == NULL) {
		return NULL;
	}
	st.out[0] = '\0';

	for (pos = 0; pos < inlen; ++pos) {
		char ch = inputmsg[pos];

		// Fold in all the spacing
		if ((ch == '\n') || (ch == '\r') || (ch == '\t')) {
			ch = ' ';
		}

		if (ch == '<') {
			++st.nest;
			st.tag[0] = '\0';
		}
		else if ((ch == '>') && (st.nest > 0)) {	// We have a tag.
			--st.nest;
			h2a_apply_tag(&st);
		}
		else if (st.nest > 0) {
			h2a_append_char(st.tag, sizeof st.tag, ch);
		}
		else if (st.styletag == 0) {
			// Ordinary text, including a '>' that does not close any tag
			h2a_append_char(st.line, sizeof st.line, ch);
		}

		h2a_decode_entities(st.line);

		if (h2a_drain(&st) != 0) {
			free(st.out);
			return NULL;
		}
	}

	if (h2a_emit(&st, st.line, strlen(st.line)) != 0) {
		free(st.out);
		return NULL;
	}

	// Strip leading/trailing whitespace.  This is done by hand, with the
	// length already known, rather than with string_trim(), to avoid
	// repeated strlen() calls.
	lead = 0;
	while ((lead < st.out_len) && isspace((unsigned char) st.out[lead])) {
		++lead;
	}
	if (lead > 0) {
		memmove(st.out, st.out + lead, st.out_len - lead + 1);
		st.out_len -= lead;
	}
	while ((st.out_len > 0) && isspace((unsigned char) st.out[st.out_len - 1])) {
		st.out[--st.out_len] = '\0';
	}

	// Non-empty output always ends with exactly one newline
	if (st.out_len > 0) {
		if (h2a_emit(&st, "\n", 1) != 0) {
			free(st.out);
			return NULL;
		}
	}

	return st.out;
}