1 // Functions which handle translation between HTML and plain text
2 // Copyright (c) 2000-2022 by the citadel.org team
4 // This program is open source software. Use, duplication, or disclosure
5 // is subject to the terms of the GNU General Public License, version 3.
11 #include <sys/types.h>
18 #if TIME_WITH_SYS_TIME
19 # include <sys/time.h>
23 # include <sys/time.h>
29 #include "libcitadel.h"
32 // Convert HTML to plain text.
34 // inputmsg = pointer to raw HTML message
35 // msglen = stop reading after this many bytes (0 = use strlen(inputmsg))
36 // screenwidth = desired output line width; text is wrapped at (screenwidth - 2) bytes
37 // ansi = if nonzero, assume output is to a terminal that supports ANSI escape codes
39 char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int ansi) {
47 size_t outptr_buffer_size;
48 size_t output_len = 0;
49 int i, j, ch, did_out, rb, scanch;
50 int nest = 0; // Bracket nesting level
51 int blockquote = 0; // BLOCKQUOTE nesting level
52 int styletag = 0; // STYLE tag nesting level
53 int styletag_start = 0;
54 int bytes_processed = 0;
62 if (msglen == 0) msglen = strlen(inputmsg);
64 outptr_buffer_size = strlen(inptr) + SIZ;
65 outptr = malloc(outptr_buffer_size);
66 if (outptr == NULL) return NULL;
71 // Fill the input buffer
72 inbuf_len = strlen(inbuf);
73 if ( (done_reading == 0) && (inbuf_len < (SIZ-128)) ) {
77 inbuf[inbuf_len++] = ch;
85 if (bytes_processed > msglen) {
92 if (!IsEmptyStr(inbuf)) {
94 // Fold in all the spacing
95 for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
96 if (inbuf[i]==10) inbuf[i]=32;
97 if (inbuf[i]==13) inbuf[i]=32;
98 if (inbuf[i]==9) inbuf[i]=32;
100 for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
101 while ((inbuf[i]==32)&&(inbuf[i+1]==32)) {
102 strcpy(&inbuf[i], &inbuf[i+1]);
106 for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
115 else if (ch == '>') { // We have a tag.
116 if (nest > 0) --nest;
118 // Unqualify the tag (truncate at first space)
119 if (strchr(tag, ' ') != NULL) {
120 strcpy(strchr(tag, ' '), "");
123 if (!strcasecmp(tag, "P")) {
128 if (!strcasecmp(tag, "/DIV")) {
133 if (!strcasecmp(tag, "LI")) {
135 strcat(outbuf, " * ");
138 else if (!strcasecmp(tag, "/UL")) {
143 else if (!strcasecmp(tag, "H1")) {
148 else if (!strcasecmp(tag, "H2")) {
153 else if (!strcasecmp(tag, "H3")) {
158 else if (!strcasecmp(tag, "H4")) {
163 else if (!strcasecmp(tag, "/H1")) {
167 else if (!strcasecmp(tag, "/H2")) {
171 else if (!strcasecmp(tag, "/H3")) {
175 else if (!strcasecmp(tag, "/H4")) {
179 else if (!strcasecmp(tag, "HR")) {
182 for (j=0; j<screenwidth-2; ++j)
188 (!strcasecmp(tag, "B"))
189 || (!strcasecmp(tag, "STRONG"))
192 strcat(outbuf, "\033[1m");
196 (!strcasecmp(tag, "/B"))
197 || (!strcasecmp(tag, "/STRONG"))
200 strcat(outbuf, "\033[22m");
205 (!strcasecmp(tag, "I"))
206 || (!strcasecmp(tag, "EM"))
209 strcat(outbuf, "\033[3m");
214 (!strcasecmp(tag, "/I"))
215 || (!strcasecmp(tag, "/EM"))
218 strcat(outbuf, "\033[23m");
222 else if (!strcasecmp(tag, "U")) {
224 strcat(outbuf, "\033[4m");
228 else if (!strcasecmp(tag, "/U")) {
230 strcat(outbuf, "\033[24m");
234 else if (!strcasecmp(tag, "BR")) {
238 else if (!strcasecmp(tag, "TR")) {
242 else if (!strcasecmp(tag, "/TABLE")) {
246 else if (!strcasecmp(tag, "BLOCKQUOTE")) {
249 if ( (blockquote == 1) && (ansi) ) {
250 strcat(nl, "\033[2m\033[3m");
252 for (j=0; j<blockquote; ++j) strcat(nl, ">");
256 else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
257 strcat(outbuf, "\n");
259 if ( (blockquote == 0) && (ansi) ) {
260 strcat(outbuf, "\033[22m\033[23m");
263 for (j=0; j<blockquote; ++j) strcat(nl, ">");
267 else if (!strcasecmp(tag, "STYLE")) {
270 styletag_start = strlen(outbuf);
274 else if (!strcasecmp(tag, "/STYLE")) {
277 outbuf[styletag_start] = 0;
283 else if ((nest > 0) && (strlen(tag)<(sizeof(tag)-1))) {
284 tag[strlen(tag)+1] = 0;
285 tag[strlen(tag)] = ch;
288 else if ((!nest) && (styletag == 0)) {
289 outbuf[strlen(outbuf)+1] = 0;
290 outbuf[strlen(outbuf)] = ch;
293 strcpy(inbuf, &inbuf[i]);
296 // Convert &; tags to the forbidden characters
297 if (!IsEmptyStr(outbuf)) for (i=0; !IsEmptyStr(&outbuf[i]); ++i) {
299 // Character entity references
300 if (!strncasecmp(&outbuf[i], " ", 6)) {
302 strcpy(&outbuf[i+1], &outbuf[i+6]);
305 if (!strncasecmp(&outbuf[i], " ", 6)) {
307 strcpy(&outbuf[i+1], &outbuf[i+6]);
310 if (!strncasecmp(&outbuf[i], " ", 6)) {
312 strcpy(&outbuf[i+1], &outbuf[i+6]);
315 if (!strncasecmp(&outbuf[i], " ", 8)) {
317 strcpy(&outbuf[i+1], &outbuf[i+8]);
320 else if (!strncasecmp(&outbuf[i], "<", 4)) {
322 strcpy(&outbuf[i+1], &outbuf[i+4]);
325 else if (!strncasecmp(&outbuf[i], ">", 4)) {
327 strcpy(&outbuf[i+1], &outbuf[i+4]);
330 else if (!strncasecmp(&outbuf[i], "&", 5)) {
331 strcpy(&outbuf[i+1], &outbuf[i+5]);
334 else if (!strncasecmp(&outbuf[i], """, 6)) {
336 strcpy(&outbuf[i+1], &outbuf[i+6]);
339 else if (!strncasecmp(&outbuf[i], "‘", 7)) {
341 strcpy(&outbuf[i+1], &outbuf[i+7]);
344 else if (!strncasecmp(&outbuf[i], "’", 7)) {
346 strcpy(&outbuf[i+1], &outbuf[i+7]);
349 else if (!strncasecmp(&outbuf[i], "©", 6)) {
353 strcpy(&outbuf[i+3], &outbuf[i+6]);
356 else if (!strncasecmp(&outbuf[i], "•", 6)) {
360 strcpy(&outbuf[i+3], &outbuf[i+6]);
363 else if (!strncasecmp(&outbuf[i], "…", 8)) {
367 strcpy(&outbuf[i+3], &outbuf[i+8]);
370 else if (!strncasecmp(&outbuf[i], "™", 7)) {
375 strcpy(&outbuf[i+4], &outbuf[i+7]);
378 else if (!strncasecmp(&outbuf[i], "®", 5)) {
382 strcpy(&outbuf[i+3], &outbuf[i+5]);
385 else if (!strncasecmp(&outbuf[i], "¼", 8)) {
389 strcpy(&outbuf[i+3], &outbuf[i+8]);
392 else if (!strncasecmp(&outbuf[i], "½", 8)) {
396 strcpy(&outbuf[i+3], &outbuf[i+8]);
399 else if (!strncasecmp(&outbuf[i], "¾", 8)) {
403 strcpy(&outbuf[i+3], &outbuf[i+8]);
406 else if (!strncasecmp(&outbuf[i], "–", 7)) {
409 strcpy(&outbuf[i+2], &outbuf[i+7]);
412 else if (!strncasecmp(&outbuf[i], "—", 7)) {
416 strcpy(&outbuf[i+3], &outbuf[i+7]);
419 else if (!strncmp(&outbuf[i], "Ç", 8)) {
421 strcpy(&outbuf[i+1], &outbuf[i+8]);
424 else if (!strncasecmp(&outbuf[i], "ç", 8)) {
426 strcpy(&outbuf[i+1], &outbuf[i+8]);
429 else if (!strncmp(&outbuf[i], "È", 8)) {
431 strcpy(&outbuf[i+1], &outbuf[i+8]);
434 else if (!strncasecmp(&outbuf[i], "è", 8)) {
436 strcpy(&outbuf[i+1], &outbuf[i+8]);
439 else if (!strncmp(&outbuf[i], "Ê", 7)) {
441 strcpy(&outbuf[i+1], &outbuf[i+7]);
444 else if (!strncasecmp(&outbuf[i], "ê", 7)) {
446 strcpy(&outbuf[i+1], &outbuf[i+7]);
449 else if (!strncmp(&outbuf[i], "É", 8)) {
451 strcpy(&outbuf[i+1], &outbuf[i+8]);
454 else if (!strncasecmp(&outbuf[i], "é", 8)) {
456 strcpy(&outbuf[i+1], &outbuf[i+8]);
459 else if (!strncmp(&outbuf[i], "À", 8)) {
461 strcpy(&outbuf[i+1], &outbuf[i+8]);
464 else if (!strncasecmp(&outbuf[i], "à", 8)) {
466 strcpy(&outbuf[i+1], &outbuf[i+8]);
469 else if (!strncasecmp(&outbuf[i], "“", 7)) {
471 strcpy(&outbuf[i+1], &outbuf[i+7]);
474 else if (!strncasecmp(&outbuf[i], "”", 7)) {
476 strcpy(&outbuf[i+1], &outbuf[i+7]);
479 else if (!strncasecmp(&outbuf[i], "´", 7)) {
481 strcpy(&outbuf[i+1], &outbuf[i+7]);
484 else if (!strncasecmp(&outbuf[i], "’", 7)) {
486 strcpy(&outbuf[i+1], &outbuf[i+7]);
489 else if (!strncasecmp(&outbuf[i], "–", 7)) {
491 strcpy(&outbuf[i+1], &outbuf[i+7]);
494 // two-digit decimal equivalents
495 else if (outbuf[i] == '&' &&
496 outbuf[i + 1] == '#' &&
497 isdigit(outbuf[i + 2]) &&
498 isdigit(outbuf[i + 3]) &&
499 (outbuf[i+4] == ';') )
502 sscanf(&outbuf[i+2], "%02d", &scanch);
504 strcpy(&outbuf[i+1], &outbuf[i+5]);
507 // three-digit decimal equivalents
508 else if (outbuf[i] == '&' &&
509 outbuf[i + 1] == '#' &&
510 isdigit(outbuf[i + 2]) &&
511 isdigit(outbuf[i + 3]) &&
512 isdigit(outbuf[i + 4]) &&
513 (outbuf[i + 5] == ';') )
516 sscanf(&outbuf[i+2], "%03d", &scanch);
518 strcpy(&outbuf[i+1], &outbuf[i+6]);
521 // four-digit decimal equivalents
522 else if (outbuf[i] == '&' &&
523 outbuf[i + 1] == '#' &&
524 isdigit(outbuf[i + 2]) &&
525 isdigit(outbuf[i + 3]) &&
526 isdigit(outbuf[i + 4]) &&
527 isdigit(outbuf[i + 5]) &&
528 (outbuf[i + 6] == ';') )
531 sscanf(&outbuf[i+2], "%04d", &scanch);
533 strcpy(&outbuf[i+1], &outbuf[i+7]);
538 // Make sure the output buffer is big enough
539 if ((output_len + strlen(outbuf) + SIZ) > outptr_buffer_size) {
540 outptr_buffer_size += SIZ;
541 outptr = realloc(outptr, outptr_buffer_size);
542 if (outptr == NULL) {
547 // Output any lines terminated with hard line breaks
550 if (strlen(outbuf) > 0) {
551 for (i = 0; i<strlen(outbuf); ++i) {
552 if ( (i<(screenwidth-2)) && (outbuf[i]=='\n')) {
554 strncpy(&outptr[output_len], outbuf, i+1);
557 strcpy(outbuf, &outbuf[i+1]);
565 // Add soft line breaks
566 if (strlen(outbuf) > (screenwidth - 2 )) {
568 for (i=0; i<(screenwidth-2); ++i) {
569 if (outbuf[i]==32) rb = i;
572 strncpy(&outptr[output_len], outbuf, rb);
574 strcpy(&outptr[output_len], nl);
575 output_len += strlen(nl);
576 strcpy(outbuf, &outbuf[rb+1]);
579 strncpy(&outptr[output_len], outbuf, screenwidth-2);
580 output_len += (screenwidth-2);
581 strcpy(&outptr[output_len], nl);
582 output_len += strlen(nl);
583 strcpy(outbuf, &outbuf[screenwidth-2]);
587 } while (done_reading == 0);
589 strcpy(&outptr[output_len], outbuf);
590 output_len += strlen(outbuf);
592 // Strip leading/trailing whitespace.
593 while ((output_len > 0) && (isspace(outptr[0]))) {
594 strcpy(outptr, &outptr[1]);
597 while ((output_len > 0) && (isspace(outptr[output_len-1]))) {
598 outptr[output_len-1] = 0;
602 // Make sure the final line ends with a newline character.
603 if ((output_len > 0) && (outptr[output_len-1] != '\n')) {
604 strcat(outptr, "\n");