Added a new parameter to html_to_ascii() to let it know when it's rendering to a...
[citadel.git] / libcitadel / lib / html_to_ascii.c
1 // Functions which handle translation between HTML and plain text
2 // Copyright (c) 2000-2022 by the citadel.org team
3 //
4 // This program is open source software.  Use, duplication, or disclosure
5 // is subject to the terms of the GNU General Public License, version 3.
6
7 #include <stdlib.h>
8 #include <unistd.h>
9 #include <stdio.h>
10 #include <signal.h>
11 #include <sys/types.h>
12 #include <ctype.h>
13 #include <string.h>
14 #include <sys/stat.h>
15 #include <errno.h>
16 #include <limits.h>
17
18 #if TIME_WITH_SYS_TIME
19 # include <sys/time.h>
20 # include <time.h>
21 #else
22 # if HAVE_SYS_TIME_H
23 #  include <sys/time.h>
24 # else
25 #  include <time.h>
26 # endif
27 #endif
28
29 #include "libcitadel.h"
30  
31
// SIZ (the working buffer size) is used below without a visible definition, so it
// presumably comes from libcitadel.h; this fallback only applies when it is absent.
#ifndef SIZ
#define SIZ 4096
#endif

// Widest line that will be built before a wrap is forced.  Staying well below
// SIZ leaves room in the pending-text buffer for whatever a single tag can add.
#define H2A_MAX_WIDTH		(SIZ / 2)

// ANSI SGR sequences emitted around quoted text when the caller asks for ANSI:
// faint + italic on entering the outermost BLOCKQUOTE, both off on leaving it.
#define H2A_ANSI_QUOTE_ON	"\033[2m\033[3m"
#define H2A_ANSI_QUOTE_OFF	"\033[22m\033[23m"

// One character entity and the plain text that stands in for it.
struct h2a_entity {
	const char *name;	// full entity text, including '&' and ';'
	const char *text;	// replacement; always shorter than name
	int exact_case;		// nonzero if name must match case-sensitively
};

// Checked in order, first match wins.  The capitalised accented letters are
// listed before their case-insensitive lowercase twins so they keep their case.
static const struct h2a_entity h2a_entities[] = {
	{ "&nbsp;",	" ",	0 },
	{ "&ensp;",	" ",	0 },
	{ "&emsp;",	" ",	0 },
	{ "&thinsp;",	" ",	0 },
	{ "&lt;",	"<",	0 },
	{ "&gt;",	">",	0 },
	{ "&amp;",	"&",	0 },
	{ "&quot;",	"\"",	0 },
	{ "&lsquo;",	"`",	0 },
	{ "&rsquo;",	"'",	0 },
	{ "&copy;",	"(c)",	0 },
	{ "&bull;",	" * ",	0 },
	{ "&hellip;",	"...",	0 },
	{ "&trade;",	"(tm)",	0 },
	{ "&reg;",	"(r)",	0 },
	{ "&frac14;",	"1/4",	0 },
	{ "&frac12;",	"1/2",	0 },
	{ "&frac34;",	"3/4",	0 },
	{ "&ndash;",	"--",	0 },
	{ "&mdash;",	"---",	0 },
	{ "&Ccedil;",	"C",	1 },
	{ "&ccedil;",	"c",	0 },
	{ "&Egrave;",	"E",	1 },
	{ "&egrave;",	"e",	0 },
	{ "&Ecirc;",	"E",	1 },
	{ "&ecirc;",	"e",	0 },
	{ "&Eacute;",	"E",	1 },
	{ "&eacute;",	"e",	0 },
	{ "&Agrave;",	"A",	1 },
	{ "&agrave;",	"a",	0 },
	{ "&ldquo;",	"\"",	0 },
	{ "&rdquo;",	"\"",	0 },
	{ "&acute;",	"'",	0 },
	{ "&#8217;",	"'",	0 },
	{ "&#8211;",	"-",	0 },
};

// Everything the conversion carries from one input byte to the next.
struct h2a_state {
	char outbuf[SIZ];	// converted text not yet moved into the result
	char nl[128];		// line break text: "\n" plus any quote markers
	char *out;		// result being built (heap allocated)
	size_t out_size;	// bytes allocated for out
	size_t out_len;		// bytes used in out, not counting the terminator
	int width;		// usable columns per line
	int rule_len;		// number of dashes drawn for HR
	int ansi;		// nonzero if ANSI escape codes may be emitted
	int blockquote;		// BLOCKQUOTE nesting level
	int styletag;		// STYLE nesting level
	int amp_pos;		// offset in outbuf of a not yet decoded '&', or -1
};

// Append text to the pending buffer.  Returns 1 on success, or 0 (leaving the
// buffer untouched) if the text does not fit.
static int h2a_append(struct h2a_state *st, const char *text) {
	size_t used = strlen(st->outbuf);
	size_t add = strlen(text);

	if (add >= (sizeof st->outbuf - used)) {
		return 0;
	}
	memcpy(st->outbuf + used, text, add + 1);
	return 1;
}

// Rebuild the line break text for the current BLOCKQUOTE depth.  The number of
// '>' markers is capped so that arbitrarily deep nesting cannot overrun nl.
static void h2a_set_newline(struct h2a_state *st, int with_ansi) {
	size_t len = 0;
	int depth;

	st->nl[len++] = '\n';
	if (with_ansi) {
		memcpy(st->nl + len, H2A_ANSI_QUOTE_ON, strlen(H2A_ANSI_QUOTE_ON));
		len += strlen(H2A_ANSI_QUOTE_ON);
	}
	for (depth = 0; (depth < st->blockquote) && (len < (sizeof st->nl - 1)); ++depth) {
		st->nl[len++] = '>';
	}
	st->nl[len] = '\0';
}

// Decode the entity that starts at ent and runs to the end of the string.
// Text that is not a known entity is left as it is.
static void h2a_decode_entity(char *ent) {
	size_t i;
	const char *p;
	int digits = 0;
	int value = 0;

	for (i = 0; i < (sizeof h2a_entities / sizeof h2a_entities[0]); ++i) {
		const struct h2a_entity *e = &h2a_entities[i];
		int differs = e->exact_case ? strcmp(ent, e->name) : strcasecmp(ent, e->name);

		if (differs == 0) {
			strcpy(ent, e->text);	// separate storage, so no overlap
			return;
		}
	}

	// Decimal character references with two to four digits
	if (ent[1] != '#') {
		return;
	}
	for (p = ent + 2; (digits < 5) && isdigit((unsigned char) *p); ++p) {
		value = (value * 10) + (*p - '0');
		++digits;
	}
	if ((digits < 2) || (digits > 4) || (p[0] != ';') || (p[1] != '\0')) {
		return;
	}

	// Only values that fit in one byte can be represented.  Zero would
	// terminate the buffer and larger values would be truncated to an
	// unrelated byte, so both become a placeholder.
	ent[0] = ((value >= 1) && (value <= 255)) ? (char) (unsigned char) value : '?';
	ent[1] = '\0';
}

// Append one character of body text, decoding an entity if this completes one.
static void h2a_add_text(struct h2a_state *st, char ch) {
	size_t at = strlen(st->outbuf);

	if ((at + 1) >= sizeof st->outbuf) {
		return;			// no room; drop rather than overflow
	}
	st->outbuf[at] = ch;
	st->outbuf[at + 1] = '\0';

	if (ch == '&') {
		st->amp_pos = (int) at;
	}
	else if (ch == ';') {
		// Only an '&' that came from the input can start an entity, so
		// the output of an earlier decode is never decoded again.
		if (st->amp_pos >= 0) {
			h2a_decode_entity(st->outbuf + st->amp_pos);
		}
		st->amp_pos = (-1);
	}
}

// Act on one complete tag.  tag holds the text between '<' and '>' and is
// modified in place.
static void h2a_handle_tag(struct h2a_state *st, char *tag) {
	static const struct {
		const char *name;
		int breaks;		// number of line breaks the tag produces
	} simple[] = {
		{ "P", 2 },	{ "/DIV", 2 },	{ "/UL", 2 },
		{ "H1", 2 },	{ "H2", 2 },	{ "H3", 2 },	{ "H4", 2 },
		{ "/H1", 1 },	{ "/H2", 1 },	{ "/H3", 1 },	{ "/H4", 1 },
		{ "BR", 1 },	{ "TR", 1 },	{ "/TABLE", 1 },
	};
	char *space;
	size_t len, i, used, room, dashes;

	// Unqualify the tag (truncate at first space)
	space = strchr(tag, ' ');
	if (space != NULL) {
		*space = '\0';
	}

	// Accept the self-closing spelling, e.g. <br/> or <hr/>
	len = strlen(tag);
	if ((len > 1) && (tag[len - 1] == '/')) {
		tag[len - 1] = '\0';
	}

	if (!strcasecmp(tag, "STYLE")) {
		++st->styletag;
		return;
	}
	if (!strcasecmp(tag, "/STYLE")) {
		if (st->styletag > 0) {		// ignore an unmatched close
			--st->styletag;
		}
		return;
	}
	if (st->styletag > 0) {
		return;				// nothing inside STYLE is rendered
	}

	for (i = 0; i < (sizeof simple / sizeof simple[0]); ++i) {
		if (!strcasecmp(tag, simple[i].name)) {
			h2a_append(st, st->nl);
			if (simple[i].breaks > 1) {
				h2a_append(st, st->nl);
			}
			return;
		}
	}

	if (!strcasecmp(tag, "LI")) {
		h2a_append(st, st->nl);
		h2a_append(st, " * ");
	}

	else if (!strcasecmp(tag, "HR")) {
		h2a_append(st, st->nl);
		h2a_append(st, " ");
		used = strlen(st->outbuf);
		room = sizeof st->outbuf - 1 - used;
		dashes = ((size_t) st->rule_len < room) ? (size_t) st->rule_len : room;
		memset(st->outbuf + used, '-', dashes);
		st->outbuf[used + dashes] = '\0';
		h2a_append(st, st->nl);
	}

	else if (!strcasecmp(tag, "BLOCKQUOTE")) {
		++st->blockquote;
		h2a_set_newline(st, (st->blockquote == 1) && st->ansi);
		h2a_append(st, st->nl);
	}

	else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
		h2a_append(st, "\n");
		if (st->blockquote > 0) {	// ignore an unmatched close
			--st->blockquote;
			if ((st->blockquote == 0) && st->ansi) {
				h2a_append(st, H2A_ANSI_QUOTE_OFF);
			}
		}
		h2a_set_newline(st, 0);
		h2a_append(st, st->nl);
	}
}

// Copy count bytes to the end of the result, growing it in SIZ steps.  Room is
// always kept for one extra character plus the terminator.  Returns 0 on
// success or -1 if memory could not be obtained (the result is left intact).
static int h2a_emit(struct h2a_state *st, const char *src, size_t count) {
	if ((st->out_len + count + 2) > st->out_size) {
		size_t want = st->out_size;
		char *grown;

		while ((st->out_len + count + 2) > want) {
			want += SIZ;
		}
		grown = realloc(st->out, want);
		if (grown == NULL) {
			return (-1);
		}
		st->out = grown;
		st->out_size = want;
	}
	memcpy(st->out + st->out_len, src, count);
	st->out_len += count;
	st->out[st->out_len] = '\0';
	return 0;
}

// Drop the first count bytes of the pending buffer (count must not exceed its
// length) and keep the entity marker pointing at the same character.
static void h2a_consume(struct h2a_state *st, size_t count) {
	size_t len = strlen(st->outbuf);

	memmove(st->outbuf, st->outbuf + count, (len - count) + 1);
	st->amp_pos = (st->amp_pos >= (int) count) ? (st->amp_pos - (int) count) : (-1);
}

// Move finished lines from the pending buffer to the result: every line that
// ends with a hard break inside the usable width, then at most one soft break
// if the remainder is still too long.  Returns 0 on success, -1 if out of memory.
static int h2a_wrap(struct h2a_state *st) {
	size_t len, scan, count;
	char *eol;
	int i, rb;

	/* Output any lines terminated with hard line breaks */
	for (;;) {
		len = strlen(st->outbuf);
		scan = (len < (size_t) st->width) ? len : (size_t) st->width;
		eol = memchr(st->outbuf, '\n', scan);
		if (eol == NULL) {
			break;
		}
		count = (size_t) (eol - st->outbuf) + 1;
		if (h2a_emit(st, st->outbuf, count) != 0) {
			return (-1);
		}
		h2a_consume(st, count);
	}

	/* Add soft line breaks */
	if (strlen(st->outbuf) > (size_t) st->width) {
		rb = (-1);
		for (i = 0; i < st->width; ++i) {
			if (st->outbuf[i] == ' ') {
				rb = i;
			}
		}

		// Break at the last space that fits (dropping that space), or
		// split mid-word if the line holds no space at all.
		count = (rb >= 0) ? (size_t) rb : (size_t) st->width;
		if (h2a_emit(st, st->outbuf, count) != 0) {
			return (-1);
		}
		if (h2a_emit(st, st->nl, strlen(st->nl)) != 0) {
			return (-1);
		}
		h2a_consume(st, (rb >= 0) ? (count + 1) : count);
	}

	return 0;
}

// Convert HTML to plain text.
//
// inputmsg     = pointer to raw HTML message (NUL terminated)
// msglen       = stop reading after this many bytes; zero or less means the whole string
// screenwidth  = desired output screenwidth.  Lines are wrapped at screenwidth-2
//                columns.  A value below 3 leaves no usable columns and is treated
//                like any value above the internal limit: lines are then wrapped at
//                H2A_MAX_WIDTH columns.
// ansi         = if nonzero, assume output is to a terminal that supports ANSI escape codes
//
// Tags are dropped.  A small set of block level tags (P, /DIV, LI, /UL, H1-H4,
// HR, BR, TR, /TABLE, BLOCKQUOTE) produce line breaks, list bullets, rules or
// '>' quote markers.  Anything between STYLE and /STYLE is discarded.  Each tab,
// CR or LF in the input becomes a single space; runs of spaces are not collapsed.
// Known character entities and two to four digit decimal character references
// are replaced by plain text, and each is decoded only once.
//
// Returns a heap allocated, NUL terminated string with leading and trailing
// whitespace removed and, unless it is empty, a single trailing newline.  The
// caller releases it with free().  Returns NULL if inputmsg is NULL or memory
// could not be allocated.
char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int ansi) {
	struct h2a_state st;
	char tag[1024];
	size_t tag_len = 0;
	size_t input_len, pos, lead;
	int nest = 0;				// Bracket nesting level
	char ch;

	if (inputmsg == NULL) {
		return NULL;
	}

	// Never look further than msglen bytes or the terminator, whichever is first
	if (msglen <= 0) {
		input_len = strlen(inputmsg);
	}
	else {
		const char *nul = memchr(inputmsg, '\0', (size_t) msglen);
		input_len = (nul != NULL) ? (size_t) (nul - inputmsg) : (size_t) msglen;
	}

	st.outbuf[0] = '\0';
	strcpy(st.nl, "\n");
	st.out_len = 0;
	st.ansi = ansi;
	st.blockquote = 0;
	st.styletag = 0;
	st.amp_pos = (-1);
	tag[0] = '\0';

	if ((screenwidth > 2) && ((screenwidth - 2) <= H2A_MAX_WIDTH)) {
		st.width = screenwidth - 2;
	}
	else {
		st.width = H2A_MAX_WIDTH;
	}
	st.rule_len = (screenwidth > 2) ? st.width : 0;

	st.out_size = input_len + SIZ;
	st.out = malloc(st.out_size);
	if (st.out == NULL) {
		return NULL;
	}
	st.out[0] = '\0';

	// One pass per input byte, plus a final pass that only wraps
	for (pos = 0; pos <= input_len; ++pos) {

		if (pos < input_len) {
			ch = inputmsg[pos];
			if ((ch == '\n') || (ch == '\r') || (ch == '\t')) {
				ch = ' ';
			}

			if (ch == '<') {
				++nest;
				tag_len = 0;
				tag[0] = '\0';
			}

			else if ((ch == '>') && (nest > 0)) {	/* We have a tag. */
				--nest;
				h2a_handle_tag(&st, tag);
				tag_len = 0;		// never act on the same tag twice
				tag[0] = '\0';
			}

			else if (nest > 0) {
				if (tag_len < (sizeof tag - 1)) {
					tag[tag_len++] = ch;
					tag[tag_len] = '\0';
				}
			}

			// Body text, including a '>' that closes nothing
			else if (st.styletag == 0) {
				h2a_add_text(&st, ch);
			}
		}

		if (h2a_wrap(&st) != 0) {
			free(st.out);
			return NULL;
		}
	}

	if (h2a_emit(&st, st.outbuf, strlen(st.outbuf)) != 0) {
		free(st.out);
		return NULL;
	}

	/* Strip leading/trailing whitespace. */
	lead = 0;
	while ((lead < st.out_len) && isspace((unsigned char) st.out[lead])) {
		++lead;
	}
	if (lead > 0) {
		memmove(st.out, st.out + lead, (st.out_len - lead) + 1);
		st.out_len -= lead;
	}
	while ((st.out_len > 0) && isspace((unsigned char) st.out[st.out_len - 1])) {
		st.out[--st.out_len] = '\0';
	}

	// h2a_emit() always leaves room for this newline and the terminator
	if (st.out_len > 0) {
		st.out[st.out_len++] = '\n';
		st.out[st.out_len] = '\0';
	}

	return st.out;
}