3c1012f00bf601a9aa9ff0efa85d8a4679891a8d
[citadel.git] / libcitadel / lib / html_to_ascii.c
1 // Functions which handle translation between HTML and plain text
2 // Copyright (c) 2000-2022 by the citadel.org team
3 //
4 // This program is open source software.  Use, duplication, or disclosure
5 // is subject to the terms of the GNU General Public License, version 3.
6
7 #include <stdlib.h>
8 #include <unistd.h>
9 #include <stdio.h>
10 #include <signal.h>
11 #include <sys/types.h>
12 #include <ctype.h>
13 #include <string.h>
14 #include <sys/stat.h>
15 #include <errno.h>
16 #include <limits.h>
17
18 #if TIME_WITH_SYS_TIME
19 # include <sys/time.h>
20 # include <time.h>
21 #else
22 # if HAVE_SYS_TIME_H
23 #  include <sys/time.h>
24 # else
25 #  include <time.h>
26 # endif
27 #endif
28
29 #include "libcitadel.h"
30  
31
// Convert HTML to plain text.
//
// inputmsg     = pointer to raw HTML message (NUL-terminated)
// msglen       = convert at most this many bytes; 0 means "the whole string"
// screenwidth  = desired output screenwidth; lines are wrapped at screenwidth-2
//                columns.  Values of 2 or less disable wrapping.
// ansi         = if nonzero, assume output is to a terminal that supports ANSI escape codes
//
// Returns a NUL-terminated buffer obtained from malloc() which the caller must
// free(), or NULL if inputmsg is NULL or memory could not be allocated.
// Leading and trailing whitespace is stripped, and non-empty output always
// ends with exactly one newline.
//
// The input is consumed one character at a time.  Text outside of tags is
// collected in a pending line; after every character the pending line has its
// character entities decoded and is then flushed to the result wherever a
// hard newline or a wrap point allows.  Because the whole pending line is
// rescanned each time, text such as "&amp;lt;" ends up decoded twice.

// SIZ is expected to be provided by libcitadel.h; this fallback only matters
// when the block is compiled on its own.
#ifndef SIZ
#define SIZ 4096
#endif

// In no-wrap mode, keep this many trailing bytes pending when spilling, so an
// entity that is still being assembled is not cut in half.
enum { H2A_TAIL_KEEP = 16 };

// Growable result buffer.  data is always NUL-terminated at data[len].
struct h2a_out {
	char *data;
	size_t cap;
	size_t len;
};

// Parser state carried from one input character to the next.
struct h2a_state {
	char line[SIZ];		// pending output not yet flushed to the result
	char tag[1024];		// text of the tag currently being read
	char nl[128];		// current line break: "\n" plus any blockquote prefix
	int nest;		// Bracket nesting level
	int blockquote;		// BLOCKQUOTE nesting level
	int styletag;		// STYLE tag nesting level
	size_t style_start;	// length of line[] when the outermost STYLE opened
	int wrap;		// wrap column, or 0 when wrapping is disabled
	int ansi;		// nonzero: emit ANSI escape sequences
};

// A tag whose whole effect is "emit some line breaks and/or fixed text".
struct h2a_tagrule {
	const char *name;	// tag name, compared case-insensitively
	int breaks;		// number of line breaks to emit
	const char *text;	// literal text emitted after the breaks, or NULL
	const char *sgr;	// ANSI sequence emitted only in ansi mode, or NULL
};

static const struct h2a_tagrule h2a_tagrules[] = {
	{ "P",		2, NULL,	NULL },
	{ "/DIV",	2, NULL,	NULL },
	{ "LI",		1, " * ",	NULL },
	{ "/UL",	2, NULL,	NULL },
	{ "H1",		2, NULL,	NULL },
	{ "H2",		2, NULL,	NULL },
	{ "H3",		2, NULL,	NULL },
	{ "H4",		2, NULL,	NULL },
	{ "/H1",	1, NULL,	NULL },
	{ "/H2",	1, NULL,	NULL },
	{ "/H3",	1, NULL,	NULL },
	{ "/H4",	1, NULL,	NULL },
	{ "BR",		1, NULL,	NULL },
	{ "TR",		1, NULL,	NULL },
	{ "/TABLE",	1, NULL,	NULL },
	{ "B",		0, NULL,	"\033[1m" },
	{ "STRONG",	0, NULL,	"\033[1m" },
	{ "/B",		0, NULL,	"\033[22m" },
	{ "/STRONG",	0, NULL,	"\033[22m" },
	{ "I",		0, NULL,	"\033[3m" },
	{ "EM",		0, NULL,	"\033[3m" },
	{ "/I",		0, NULL,	"\033[23m" },
	{ "/EM",	0, NULL,	"\033[23m" },
	{ "U",		0, NULL,	"\033[4m" },
	{ "/U",		0, NULL,	"\033[24m" },
};

// A character entity and its plain-text replacement.  Every replacement is
// no longer than the entity it replaces, so decoding never grows the buffer.
struct h2a_entity {
	const char *name;	// entity as it appears in the text, including '&' and ';'
	const char *text;	// replacement text
	int exact;		// nonzero: name must match case-sensitively
};

// Order matters: the first match wins, so the case-sensitive capital forms
// must come before their case-insensitive lowercase counterparts.
static const struct h2a_entity h2a_entities[] = {
	{ "&nbsp;",	" ",	0 },
	{ "&ensp;",	" ",	0 },
	{ "&emsp;",	" ",	0 },
	{ "&thinsp;",	" ",	0 },
	{ "&lt;",	"<",	0 },
	{ "&gt;",	">",	0 },
	{ "&amp;",	"&",	0 },
	{ "&quot;",	"\"",	0 },
	{ "&lsquo;",	"`",	0 },
	{ "&rsquo;",	"'",	0 },
	{ "&copy;",	"(c)",	0 },
	{ "&bull;",	" * ",	0 },
	{ "&hellip;",	"...",	0 },
	{ "&trade;",	"(tm)",	0 },
	{ "&reg;",	"(r)",	0 },
	{ "&frac14;",	"1/4",	0 },
	{ "&frac12;",	"1/2",	0 },
	{ "&frac34;",	"3/4",	0 },
	{ "&ndash;",	"--",	0 },
	{ "&mdash;",	"---",	0 },
	{ "&Ccedil;",	"C",	1 },
	{ "&ccedil;",	"c",	0 },
	{ "&Egrave;",	"E",	1 },
	{ "&egrave;",	"e",	0 },
	{ "&Ecirc;",	"E",	1 },
	{ "&ecirc;",	"e",	0 },
	{ "&Eacute;",	"E",	1 },
	{ "&eacute;",	"e",	0 },
	{ "&Agrave;",	"A",	1 },
	{ "&agrave;",	"a",	0 },
	{ "&ldquo;",	"\"",	0 },
	{ "&rdquo;",	"\"",	0 },
	{ "&acute;",	"'",	0 },
	{ "&#8217;",	"'",	0 },
	{ "&#8211;",	"-",	0 },
};

// Append src to the NUL-terminated string in dst, copying only as much as
// fits in dstsize bytes.  dst is always left NUL-terminated.
static void h2a_append(char *dst, size_t dstsize, const char *src) {
	size_t used = strlen(dst);
	size_t n = strlen(src);

	if (used + 1 >= dstsize) {
		return;
	}
	if (n > dstsize - used - 1) {
		n = dstsize - used - 1;
	}
	memcpy(dst + used, src, n);
	dst[used + n] = '\0';
}

// Append count copies of ch to dst, stopping early if dst (dstsize bytes) fills up.
static void h2a_append_fill(char *dst, size_t dstsize, char ch, size_t count) {
	size_t used = strlen(dst);

	while ((count > 0) && (used + 1 < dstsize)) {
		dst[used++] = ch;
		--count;
	}
	dst[used] = '\0';
}

// Append n bytes of src to the result, growing it as needed.  Two spare bytes
// are always kept so that the final newline and terminator fit.
// Returns 0 on success, -1 if memory could not be obtained (out is unchanged).
static int h2a_emit(struct h2a_out *out, const char *src, size_t n) {
	if (out->len + n + 2 > out->cap) {
		size_t want = out->len + n + 2 + SIZ;
		char *grown = realloc(out->data, want);
		if (grown == NULL) {
			return -1;
		}
		out->data = grown;
		out->cap = want;
	}
	memcpy(out->data + out->len, src, n);
	out->len += n;
	out->data[out->len] = '\0';
	return 0;
}

// Delete the first n bytes of the NUL-terminated string s in place.
static void h2a_drop_front(char *s, size_t n) {
	memmove(s, s + n, strlen(s + n) + 1);
}

// Rebuild the line-break string: a newline, optionally the ANSI "dim italic"
// sequence, then one '>' per blockquote level.
static void h2a_set_line_break(struct h2a_state *st, int with_ansi) {
	strcpy(st->nl, "\n");
	if (with_ansi) {
		h2a_append(st->nl, sizeof st->nl, "\033[2m\033[3m");
	}
	if (st->blockquote > 0) {
		h2a_append_fill(st->nl, sizeof st->nl, '>', (size_t)st->blockquote);
	}
}

// Act on the complete tag held in st->tag.
static void h2a_handle_tag(struct h2a_state *st) {
	char *tag = st->tag;
	char *sp;
	size_t tlen, r;
	int k;

	// Unqualify the tag (truncate at first space)
	sp = strchr(tag, ' ');
	if (sp != NULL) {
		*sp = '\0';
	}

	// Accept the self-closing spelling without a space, e.g. <br/>
	tlen = strlen(tag);
	if ((tlen > 1) && (tag[tlen - 1] == '/')) {
		tag[tlen - 1] = '\0';
	}

	for (r = 0; r < sizeof h2a_tagrules / sizeof h2a_tagrules[0]; ++r) {
		const struct h2a_tagrule *rule = &h2a_tagrules[r];
		if (strcasecmp(tag, rule->name) != 0) {
			continue;
		}
		for (k = 0; k < rule->breaks; ++k) {
			h2a_append(st->line, sizeof st->line, st->nl);
		}
		if (rule->text != NULL) {
			h2a_append(st->line, sizeof st->line, rule->text);
		}
		if ((rule->sgr != NULL) && (st->ansi)) {
			h2a_append(st->line, sizeof st->line, rule->sgr);
		}
		return;
	}

	if (!strcasecmp(tag, "HR")) {
		h2a_append(st->line, sizeof st->line, st->nl);
		h2a_append(st->line, sizeof st->line, " ");
		h2a_append_fill(st->line, sizeof st->line, '-', (size_t)st->wrap);
		h2a_append(st->line, sizeof st->line, st->nl);
	}

	else if (!strcasecmp(tag, "BLOCKQUOTE")) {
		if (st->blockquote < INT_MAX) {
			++st->blockquote;
		}
		// Only the outermost level carries the ANSI sequence in its line break
		h2a_set_line_break(st, (st->blockquote == 1) && (st->ansi));
		h2a_append(st->line, sizeof st->line, st->nl);
	}

	else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
		h2a_append(st->line, sizeof st->line, "\n");
		if (st->blockquote > 0) {	// ignore unbalanced closing tags
			--st->blockquote;
			if ((st->blockquote == 0) && (st->ansi)) {
				h2a_append(st->line, sizeof st->line, "\033[22m\033[23m");
			}
		}
		h2a_set_line_break(st, 0);
		h2a_append(st->line, sizeof st->line, st->nl);
	}

	else if (!strcasecmp(tag, "STYLE")) {
		if (st->styletag < INT_MAX) {
			++st->styletag;
		}
		if (st->styletag == 1) {
			st->style_start = strlen(st->line);
		}
	}

	else if (!strcasecmp(tag, "/STYLE")) {
		if (st->styletag > 0) {		// ignore unbalanced closing tags
			--st->styletag;
			// Discard anything that was queued while inside the style sheet
			if ((st->styletag == 0) && (st->style_start < strlen(st->line))) {
				st->line[st->style_start] = '\0';
			}
		}
	}
}

// If amp points at "&#" followed by two, three or four decimal digits and a
// semicolon, store the replacement byte in *out and return the length of the
// entity.  Otherwise return 0.  Values that do not fit in one nonzero byte
// are replaced by '?'.
static size_t h2a_numeric_entity(const char *amp, char *out) {
	size_t digits = 0;
	int value = 0;

	if (amp[1] != '#') {
		return 0;
	}
	while ((digits < 4) && isdigit((unsigned char)amp[2 + digits])) {
		value = (value * 10) + (amp[2 + digits] - '0');
		++digits;
	}
	if ((digits < 2) || (amp[2 + digits] != ';')) {
		return 0;
	}
	*out = ((value >= 1) && (value <= 255)) ? (char)value : '?';
	return digits + 3;		// '&' '#' digits ';'
}

// Convert &; tags in s to the characters they stand for, in place.
static void h2a_decode_entities(char *s) {
	char *amp;

	for (amp = strchr(s, '&'); amp != NULL; amp = strchr(amp + 1, '&')) {
		const char *rep = NULL;
		size_t consumed = 0;
		char numeric[2];
		size_t e;

		for (e = 0; (e < sizeof h2a_entities / sizeof h2a_entities[0]) && (rep == NULL); ++e) {
			const struct h2a_entity *ent = &h2a_entities[e];
			size_t elen = strlen(ent->name);
			int differs = ent->exact
				? strncmp(amp, ent->name, elen)
				: strncasecmp(amp, ent->name, elen);
			if (differs == 0) {
				rep = ent->text;
				consumed = elen;
			}
		}

		if (rep == NULL) {
			consumed = h2a_numeric_entity(amp, &numeric[0]);
			if (consumed > 0) {
				numeric[1] = '\0';
				rep = numeric;
			}
		}

		if (rep != NULL) {
			size_t rlen = strlen(rep);
			memcpy(amp, rep, rlen);
			// Regions overlap, so this must be memmove
			memmove(amp + rlen, amp + consumed, strlen(amp + consumed) + 1);
		}
	}
}

// Move as much of the pending line as possible to the result: first every
// line that ends in a hard newline before the wrap column, then, while the
// pending line is still longer than the wrap column, one soft-wrapped line at
// a time.  Returns 0 on success, -1 on allocation failure.
static int h2a_drain(struct h2a_out *out, struct h2a_state *st) {
	char *line = st->line;

	for (;;) {
		char *brk;
		size_t len, cut;
		int i, rb;

		// Output any lines terminated with hard line breaks
		while (	((brk = strchr(line, '\n')) != NULL)
			&& ((st->wrap < 1) || ((brk - line) < st->wrap))
		) {
			size_t n = (size_t)(brk - line) + 1;
			if (h2a_emit(out, line, n) != 0) {
				return -1;
			}
			h2a_drop_front(line, n);
		}

		len = strlen(line);

		// Wrapping disabled: just keep the pending line from filling up
		if (st->wrap < 1) {
			if ((len >= sizeof st->line / 2) && (len > H2A_TAIL_KEEP)) {
				if (h2a_emit(out, line, len - H2A_TAIL_KEEP) != 0) {
					return -1;
				}
				h2a_drop_front(line, len - H2A_TAIL_KEEP);
			}
			return 0;
		}

		if (len <= (size_t)st->wrap) {
			return 0;
		}

		// Add a soft line break at the last space before the wrap column,
		// or exactly at the wrap column if the line contains no space.
		rb = (-1);
		for (i = 0; i < st->wrap; ++i) {
			if (line[i] == ' ') {
				rb = i;
			}
		}
		cut = (rb >= 0) ? (size_t)rb : (size_t)st->wrap;
		if (h2a_emit(out, line, cut) != 0) {
			return -1;
		}
		if (h2a_emit(out, st->nl, strlen(st->nl)) != 0) {
			return -1;
		}
		// When breaking at a space, the space itself is dropped
		h2a_drop_front(line, (rb >= 0) ? cut + 1 : cut);
	}
}

char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int ansi) {
	struct h2a_state st;
	struct h2a_out out;
	size_t input_len, limit, pos, lead;
	int prev_space = 0;

	if (inputmsg == NULL) {
		return NULL;
	}

	input_len = strlen(inputmsg);
	if (msglen == 0) {
		limit = input_len;
	}
	else if (msglen < 0) {
		limit = 0;
	}
	else {
		limit = ((size_t)msglen < input_len) ? (size_t)msglen : input_len;
	}

	out.cap = input_len + SIZ;
	out.len = 0;
	out.data = malloc(out.cap);
	if (out.data == NULL) {
		return NULL;
	}
	out.data[0] = '\0';

	st.line[0] = '\0';
	st.tag[0] = '\0';
	strcpy(st.nl, "\n");
	st.nest = 0;
	st.blockquote = 0;
	st.styletag = 0;
	st.style_start = 0;
	st.ansi = ansi;

	// Keep the wrap column small enough that a full pending line plus
	// whatever a single tag appends still fits in st.line.
	st.wrap = (screenwidth > 2) ? (screenwidth - 2) : 0;
	if (st.wrap > (int)(sizeof st.line / 2)) {
		st.wrap = (int)(sizeof st.line / 2);
	}

	for (pos = 0; pos < limit; ++pos) {
		char ch = inputmsg[pos];

		// Fold in all the spacing
		if ((ch == 10) || (ch == 13) || (ch == 9)) {
			ch = 32;
		}
		if ((ch == 32) && prev_space) {
			continue;
		}
		prev_space = (ch == 32);

		if (ch == '<') {
			++st.nest;
			st.tag[0] = '\0';
		}

		else if ((ch == '>') && (st.nest > 0)) {	// We have a tag.
			--st.nest;
			h2a_handle_tag(&st);
			st.tag[0] = '\0';	// never act on the same tag twice
		}

		else if (st.nest > 0) {
			h2a_append_fill(st.tag, sizeof st.tag, ch, 1);
		}

		// Plain text, including a '>' that does not close any tag
		else if (st.styletag == 0) {
			h2a_append_fill(st.line, sizeof st.line, ch, 1);
		}

		h2a_decode_entities(st.line);

		if (h2a_drain(&out, &st) != 0) {
			free(out.data);
			return NULL;
		}
	}

	if (h2a_emit(&out, st.line, strlen(st.line)) != 0) {
		free(out.data);
		return NULL;
	}

	// Strip leading/trailing whitespace.
	lead = 0;
	while ((lead < out.len) && isspace((unsigned char)out.data[lead])) {
		++lead;
	}
	if (lead > 0) {
		memmove(out.data, out.data + lead, out.len - lead + 1);
		out.len -= lead;
	}
	while ((out.len > 0) && isspace((unsigned char)out.data[out.len - 1])) {
		out.data[--out.len] = '\0';
	}

	// Make sure the final line ends with a newline character.
	// (h2a_emit always leaves room for this byte and the terminator.)
	if ((out.len > 0) && (out.data[out.len - 1] != '\n')) {
		out.data[out.len++] = '\n';
		out.data[out.len] = '\0';
	}

	return out.data;
}