2 * $Id: html.c 6014 2008-02-04 18:38:35Z ajc $
4 * Functions which handle translation between HTML and plain text
5 * Copyright (c) 2000-2005 by Art Cancro and others. This program is
6 * released under the terms of the GNU General Public License.
13 #include <sys/types.h>
20 #if TIME_WITH_SYS_TIME
21 # include <sys/time.h>
25 # include <sys/time.h>
31 #include "libcitadel.h"
35 * Convert HTML to plain text.
37 * inputmsg = pointer to raw HTML message; msglen = its length in bytes (0 = use strlen(inputmsg))
38 * screenwidth = desired output screenwidth
39 * do_citaformat = set to 1 to indent newlines with spaces
41 char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
49 size_t outptr_buffer_size;
50 size_t output_len = 0;
51 int i, j, ch, did_out, rb, scanch;
52 int nest = 0; /* Bracket nesting level */
53 int blockquote = 0; /* BLOCKQUOTE nesting level */
54 int styletag = 0; /* STYLE tag nesting level */
55 int styletag_start = 0;
56 int bytes_processed = 0;
63 if (msglen == 0) msglen = strlen(inputmsg);
65 outptr_buffer_size = strlen(inptr) + SIZ;
66 outptr = malloc(outptr_buffer_size);
67 if (outptr == NULL) return NULL;
72 /* Fill the input buffer */
73 inbuf_len = strlen(inbuf);
74 if ( (done_reading == 0) && (inbuf_len < (SIZ-128)) ) {
78 inbuf[inbuf_len++] = ch;
86 if (bytes_processed > msglen) {
93 if (!IsEmptyStr(inbuf)) {
96 /* Fold in all the spacing */
97 for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
98 if (inbuf[i]==10) inbuf[i]=32;
99 if (inbuf[i]==13) inbuf[i]=32;
100 if (inbuf[i]==9) inbuf[i]=32;
102 for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
103 while ((inbuf[i]==32)&&(inbuf[i+1]==32)) {
104 strcpy(&inbuf[i], &inbuf[i+1]);
108 for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
117 else if (ch == '>') { /* We have a tag. */
118 if (nest > 0) --nest;
120 /* Unqualify the tag (truncate at first space) */
121 if (strchr(tag, ' ') != NULL) {
122 strcpy(strchr(tag, ' '), "");
125 if (!strcasecmp(tag, "P")) {
130 if (!strcasecmp(tag, "/DIV")) {
135 if (!strcasecmp(tag, "LI")) {
137 strcat(outbuf, " * ");
140 else if (!strcasecmp(tag, "/UL")) {
145 else if (!strcasecmp(tag, "H1")) {
150 else if (!strcasecmp(tag, "H2")) {
155 else if (!strcasecmp(tag, "H3")) {
160 else if (!strcasecmp(tag, "H4")) {
165 else if (!strcasecmp(tag, "/H1")) {
169 else if (!strcasecmp(tag, "/H2")) {
173 else if (!strcasecmp(tag, "/H3")) {
177 else if (!strcasecmp(tag, "/H4")) {
181 else if (!strcasecmp(tag, "HR")) {
184 for (j=0; j<screenwidth-2; ++j)
189 else if (!strcasecmp(tag, "BR")) {
193 else if (!strcasecmp(tag, "TR")) {
197 else if (!strcasecmp(tag, "/TABLE")) {
201 else if (!strcasecmp(tag, "BLOCKQUOTE")) {
204 for (j=0; j<blockquote; ++j) strcat(nl, ">");
208 else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
209 strcat(outbuf, "\n");
212 for (j=0; j<blockquote; ++j) strcat(nl, ">");
216 else if (!strcasecmp(tag, "STYLE")) {
219 styletag_start = strlen(outbuf);
223 else if (!strcasecmp(tag, "/STYLE")) {
226 outbuf[styletag_start] = 0;
232 else if ((nest > 0) && (strlen(tag)<(sizeof(tag)-1))) {
233 tag[strlen(tag)+1] = 0;
234 tag[strlen(tag)] = ch;
238 outbuf[strlen(outbuf)+1] = 0;
239 outbuf[strlen(outbuf)] = ch;
242 strcpy(inbuf, &inbuf[i]);
245 /* Convert &; tags to the forbidden characters */
246 if (!IsEmptyStr(outbuf)) for (i=0; !IsEmptyStr(&outbuf[i]); ++i) {
248 /* Character entity references */
249 if (!strncasecmp(&outbuf[i], " ", 6)) {
251 strcpy(&outbuf[i+1], &outbuf[i+6]);
254 if (!strncasecmp(&outbuf[i], " ", 6)) {
256 strcpy(&outbuf[i+1], &outbuf[i+6]);
259 if (!strncasecmp(&outbuf[i], " ", 6)) {
261 strcpy(&outbuf[i+1], &outbuf[i+6]);
264 if (!strncasecmp(&outbuf[i], " ", 8)) {
266 strcpy(&outbuf[i+1], &outbuf[i+8]);
269 else if (!strncasecmp(&outbuf[i], "<", 4)) {
271 strcpy(&outbuf[i+1], &outbuf[i+4]);
274 else if (!strncasecmp(&outbuf[i], ">", 4)) {
276 strcpy(&outbuf[i+1], &outbuf[i+4]);
279 else if (!strncasecmp(&outbuf[i], "&", 5)) {
280 strcpy(&outbuf[i+1], &outbuf[i+5]);
283 else if (!strncasecmp(&outbuf[i], """, 6)) {
285 strcpy(&outbuf[i+1], &outbuf[i+6]);
288 else if (!strncasecmp(&outbuf[i], "‘", 7)) {
290 strcpy(&outbuf[i+1], &outbuf[i+7]);
293 else if (!strncasecmp(&outbuf[i], "’", 7)) {
295 strcpy(&outbuf[i+1], &outbuf[i+7]);
298 else if (!strncasecmp(&outbuf[i], "©", 6)) {
302 strcpy(&outbuf[i+3], &outbuf[i+6]);
305 else if (!strncasecmp(&outbuf[i], "•", 6)) {
309 strcpy(&outbuf[i+3], &outbuf[i+6]);
312 else if (!strncasecmp(&outbuf[i], "…", 8)) {
316 strcpy(&outbuf[i+3], &outbuf[i+8]);
319 else if (!strncasecmp(&outbuf[i], "™", 7)) {
324 strcpy(&outbuf[i+4], &outbuf[i+7]);
327 else if (!strncasecmp(&outbuf[i], "®", 5)) {
331 strcpy(&outbuf[i+3], &outbuf[i+5]);
334 else if (!strncasecmp(&outbuf[i], "¼", 8)) {
338 strcpy(&outbuf[i+3], &outbuf[i+8]);
341 else if (!strncasecmp(&outbuf[i], "½", 8)) {
345 strcpy(&outbuf[i+3], &outbuf[i+8]);
348 else if (!strncasecmp(&outbuf[i], "¾", 8)) {
352 strcpy(&outbuf[i+3], &outbuf[i+8]);
355 else if (!strncasecmp(&outbuf[i], "–", 7)) {
358 strcpy(&outbuf[i+2], &outbuf[i+7]);
361 else if (!strncasecmp(&outbuf[i], "—", 7)) {
365 strcpy(&outbuf[i+3], &outbuf[i+7]);
368 else if (!strncmp(&outbuf[i], "Ç", 8)) {
370 strcpy(&outbuf[i+1], &outbuf[i+8]);
373 else if (!strncasecmp(&outbuf[i], "ç", 8)) {
375 strcpy(&outbuf[i+1], &outbuf[i+8]);
378 else if (!strncmp(&outbuf[i], "È", 8)) {
380 strcpy(&outbuf[i+1], &outbuf[i+8]);
383 else if (!strncasecmp(&outbuf[i], "è", 8)) {
385 strcpy(&outbuf[i+1], &outbuf[i+8]);
388 else if (!strncmp(&outbuf[i], "Ê", 7)) {
390 strcpy(&outbuf[i+1], &outbuf[i+7]);
393 else if (!strncasecmp(&outbuf[i], "ê", 7)) {
395 strcpy(&outbuf[i+1], &outbuf[i+7]);
398 else if (!strncmp(&outbuf[i], "É", 8)) {
400 strcpy(&outbuf[i+1], &outbuf[i+8]);
403 else if (!strncasecmp(&outbuf[i], "é", 8)) {
405 strcpy(&outbuf[i+1], &outbuf[i+8]);
408 else if (!strncmp(&outbuf[i], "À", 8)) {
410 strcpy(&outbuf[i+1], &outbuf[i+8]);
413 else if (!strncasecmp(&outbuf[i], "à", 8)) {
415 strcpy(&outbuf[i+1], &outbuf[i+8]);
418 else if (!strncasecmp(&outbuf[i], "“", 7)) {
420 strcpy(&outbuf[i+1], &outbuf[i+7]);
423 else if (!strncasecmp(&outbuf[i], "”", 7)) {
425 strcpy(&outbuf[i+1], &outbuf[i+7]);
428 else if (!strncasecmp(&outbuf[i], "´", 7)) {
430 strcpy(&outbuf[i+1], &outbuf[i+7]);
433 /* two-digit decimal equivalents */
434 else if ((!strncmp(&outbuf[i], "&#", 2))
435 && (outbuf[i+4] == ';') ) {
437 sscanf(&outbuf[i+2], "%02d", &scanch);
439 strcpy(&outbuf[i+1], &outbuf[i+5]);
442 /* three-digit decimal equivalents */
443 else if ((!strncmp(&outbuf[i], "&#", 2))
444 && (outbuf[i+5] == ';') ) {
446 sscanf(&outbuf[i+2], "%03d", &scanch);
448 strcpy(&outbuf[i+1], &outbuf[i+6]);
453 /* Make sure the output buffer is big enough */
454 if ((output_len + strlen(outbuf) + SIZ) > outptr_buffer_size) {
455 outptr_buffer_size += SIZ;
456 outptr = realloc(outptr, outptr_buffer_size);
457 if (outptr == NULL) {
462 /* Output any lines terminated with hard line breaks */
465 if (strlen(outbuf) > 0) {
466 for (i = 0; i<strlen(outbuf); ++i) {
467 if ( (i<(screenwidth-2)) && (outbuf[i]=='\n')) {
469 strncpy(&outptr[output_len], outbuf, i+1);
473 strcpy(&outptr[output_len], " ");
477 strcpy(outbuf, &outbuf[i+1]);
485 /* Add soft line breaks */
486 if (strlen(outbuf) > (screenwidth - 2 )) {
488 for (i=0; i<(screenwidth-2); ++i) {
489 if (outbuf[i]==32) rb = i;
492 strncpy(&outptr[output_len], outbuf, rb);
494 strcpy(&outptr[output_len], nl);
495 output_len += strlen(nl);
497 strcpy(&outptr[output_len], " ");
500 strcpy(outbuf, &outbuf[rb+1]);
502 strncpy(&outptr[output_len], outbuf,
504 output_len += (screenwidth-2);
505 strcpy(&outptr[output_len], nl);
506 output_len += strlen(nl);
508 strcpy(&outptr[output_len], " ");
511 strcpy(outbuf, &outbuf[screenwidth-2]);
515 } while (done_reading == 0);
517 strcpy(&outptr[output_len], outbuf);
518 output_len += strlen(outbuf);
520 /* Strip leading/trailing whitespace. We can't do this with
521 * striplt() because it uses too many strlen()'s
523 while ((output_len > 0) && (isspace(outptr[0]))) {
524 strcpy(outptr, &outptr[1]);
527 while ((output_len > 0) && (isspace(outptr[output_len-1]))) {
528 outptr[output_len-1] = 0;
532 if (outptr[output_len-1] != '\n') {
533 strcat(outptr, "\n");