* don't double libcitadel header function for safestrncpy in webcit.h
[citadel.git] / webcit / html.c
1 /*
2  * $Id: html.c 6014 2008-02-04 18:38:35Z ajc $
3  *
4  * Functions which handle translation between HTML and plain text
5  * Copyright (c) 2000-2005 by Art Cancro and others.   This program is
6  * released under the terms of the GNU General Public License.
7  */
8
9 #include "webcit.h"
10  
11
/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message (must be NUL-terminated)
 * msglen        = number of leading bytes of inputmsg to convert; a value
 *                 <= 0, or one larger than the string, converts all of it
 * screenwidth   = desired output screenwidth; lines are wrapped at
 *                 screenwidth-2 columns.  Values <= 2 disable wrapping.
 * do_citaformat = set to 1 to indent newlines with spaces
 *
 * Returns a malloc()ed, NUL-terminated string with leading and trailing
 * whitespace removed and exactly one '\n' appended; the caller releases it
 * with free().  Returns NULL if inputmsg is NULL or the initial allocation
 * fails.  If memory runs out while a buffer is being enlarged the process
 * is abort()ed.
 */

/* Sizes of the converter's internal buffers. */
enum {
	H2A_TAG_MAX = 1024,	/* bytes kept of one tag, terminator included */
	H2A_NL_MAX = 128,	/* bytes in the "newline + quote marks" string */
	H2A_OUT_SLACK = 4096,	/* initial headroom in the result buffer */
	H2A_LINE_INIT = 256	/* initial size of the pending-text buffer */
};

/* Growable byte buffer; data[len] is always the NUL terminator. */
struct h2a_buf {
	char *data;
	size_t len;
	size_t cap;
};

/* Conversion state shared by the helpers below. */
struct h2a_state {
	struct h2a_buf line;	/* text produced but not yet moved to the result */
	char nl[H2A_NL_MAX];	/* "\n" followed by one '>' per BLOCKQUOTE level */
	int blockquote;		/* BLOCKQUOTE nesting level */
	int styletag;		/* STYLE tag nesting level */
	size_t style_start;	/* length of 'line' when the outer STYLE opened */
};

/* One named character entity and the text that replaces it. */
struct h2a_entity {
	const char *name;
	const char *text;
	int exact_case;		/* nonzero: the name is matched case-sensitively */
};

/*
 * Checked in this order; the exact-case upper-case names must precede their
 * case-insensitive lower-case counterparts.  Every replacement is shorter
 * than the name it replaces, so decoding never grows the buffer.
 */
static const struct h2a_entity h2a_entities[] = {
	{ "&nbsp;",   " ",    0 },
	{ "&ensp;",   " ",    0 },
	{ "&emsp;",   " ",    0 },
	{ "&thinsp;", " ",    0 },
	{ "&lt;",     "<",    0 },
	{ "&gt;",     ">",    0 },
	{ "&amp;",    "&",    0 },
	{ "&quot;",   "\"",   0 },
	{ "&lsquo;",  "`",    0 },
	{ "&rsquo;",  "'",    0 },
	{ "&copy;",   "(c)",  0 },
	{ "&bull;",   " * ",  0 },
	{ "&hellip;", "...",  0 },
	{ "&trade;",  "(tm)", 0 },
	{ "&reg;",    "(r)",  0 },
	{ "&frac14;", "1/4",  0 },
	{ "&frac12;", "1/2",  0 },
	{ "&frac34;", "3/4",  0 },
	{ "&ndash;",  "--",   0 },
	{ "&mdash;",  "---",  0 },
	{ "&Ccedil;", "C",    1 },
	{ "&ccedil;", "c",    0 },
	{ "&Egrave;", "E",    1 },
	{ "&egrave;", "e",    0 },
	{ "&Ecirc;",  "E",    1 },
	{ "&ecirc;",  "e",    0 },
	{ "&Eacute;", "E",    1 },
	{ "&eacute;", "e",    0 },
	{ "&Agrave;", "A",    1 },
	{ "&agrave;", "a",    0 },
	{ "&ldquo;",  "\"",   0 },
	{ "&rdquo;",  "\"",   0 },
	{ "&acute;",  "'",    0 }
};

/* Make room for 'extra' more bytes plus the terminator; abort()s on failure. */
static void h2a_reserve(struct h2a_buf *b, size_t extra)
{
	size_t need, newcap;
	char *grown;

	if (extra >= (size_t)-1 - b->len) {
		abort();	/* the requested size does not fit in a size_t */
	}
	need = b->len + extra + 1;
	if (need <= b->cap) {
		return;
	}
	newcap = (b->cap > 0) ? b->cap : 1;
	while (newcap < need) {
		newcap = (newcap > (size_t)-1 / 2) ? need : newcap * 2;
	}
	/* Keep the old pointer until the new block is known to be valid. */
	grown = realloc(b->data, newcap);
	if (grown == NULL) {
		abort();
	}
	b->data = grown;
	b->cap = newcap;
}

/* Append n bytes from src (which must not point into b) and re-terminate. */
static void h2a_append(struct h2a_buf *b, const char *src, size_t n)
{
	h2a_reserve(b, n);
	memcpy(b->data + b->len, src, n);
	b->len += n;
	b->data[b->len] = '\0';
}

/* Append a NUL-terminated string. */
static void h2a_append_str(struct h2a_buf *b, const char *s)
{
	h2a_append(b, s, strlen(s));
}

/* Discard the first n bytes (n <= b->len); the regions overlap, so memmove. */
static void h2a_drop_front(struct h2a_buf *b, size_t n)
{
	memmove(b->data, b->data + n, b->len - n + 1);
	b->len -= n;
}

/* Rebuild st->nl as "\n" plus one '>' per quote level, clipped to fit. */
static void h2a_set_quote_prefix(struct h2a_state *st)
{
	size_t pos = 0;
	int level;

	st->nl[pos++] = '\n';
	for (level = 0; (level < st->blockquote) && (pos < sizeof(st->nl) - 1); ++level) {
		st->nl[pos++] = '>';
	}
	st->nl[pos] = '\0';
}

/*
 * Act on one complete tag.  'tag' is the text between '<' and '>'; it is cut
 * at its first space, in place, to strip any attributes.  hr_width is the
 * number of dashes drawn for <HR>.
 */
static void h2a_apply_tag(struct h2a_state *st, char *tag, size_t hr_width)
{
	/* Tags whose whole effect is some line breaks plus an optional suffix */
	static const struct {
		const char *name;
		int breaks;
		const char *tail;
	} simple[] = {
		{ "P",      2, ""    },
		{ "/DIV",   2, ""    },
		{ "LI",     1, " * " },
		{ "/UL",    2, ""    },
		{ "H1",     2, ""    },
		{ "H2",     2, ""    },
		{ "H3",     2, ""    },
		{ "H4",     2, ""    },
		{ "/H1",    1, ""    },
		{ "/H2",    1, ""    },
		{ "/H3",    1, ""    },
		{ "/H4",    1, ""    },
		{ "BR",     1, ""    },
		{ "TR",     1, ""    },
		{ "/TABLE", 1, ""    }
	};
	struct h2a_buf *line = &st->line;
	char *space;
	size_t k;
	int n;

	/* Unqualify the tag (truncate at first space) */
	space = strchr(tag, ' ');
	if (space != NULL) {
		*space = '\0';
	}

	for (k = 0; k < sizeof(simple) / sizeof(simple[0]); ++k) {
		if (strcasecmp(tag, simple[k].name) == 0) {
			for (n = 0; n < simple[k].breaks; ++n) {
				h2a_append_str(line, st->nl);
			}
			h2a_append_str(line, simple[k].tail);
			return;
		}
	}

	if (strcasecmp(tag, "HR") == 0) {
		h2a_append_str(line, st->nl);
		h2a_append(line, " ", 1);
		for (k = 0; k < hr_width; ++k) {
			h2a_append(line, "-", 1);
		}
		h2a_append_str(line, st->nl);
	}
	else if (strcasecmp(tag, "BLOCKQUOTE") == 0) {
		++st->blockquote;
		h2a_set_quote_prefix(st);
		h2a_append_str(line, st->nl);
	}
	else if (strcasecmp(tag, "/BLOCKQUOTE") == 0) {
		h2a_append(line, "\n", 1);
		--st->blockquote;
		h2a_set_quote_prefix(st);
		h2a_append_str(line, st->nl);
	}
	else if (strcasecmp(tag, "STYLE") == 0) {
		++st->styletag;
		if (st->styletag == 1) {
			st->style_start = line->len;
		}
	}
	else if (strcasecmp(tag, "/STYLE") == 0) {
		--st->styletag;
		/*
		 * Throw away what was collected since the STYLE tag opened.
		 * The mark may lie beyond the current end if text has been
		 * moved to the result in the meantime; then nothing is cut.
		 */
		if ((st->styletag == 0) && (st->style_start < line->len)) {
			line->len = st->style_start;
			line->data[line->len] = '\0';
		}
	}
}

/*
 * Replace the character entities in the pending text.  Named entities come
 * from h2a_entities; "&#NN;" and "&#NNN;" become the byte with that decimal
 * value.  Only bytes inside the current text are examined.
 */
static void h2a_decode_entities(struct h2a_buf *line)
{
	size_t i, k;

	for (i = 0; i < line->len; ++i) {
		char *p = line->data + i;
		size_t left = line->len - i;	/* bytes from p to the terminator */
		size_t span = 0;
		int hit = 0;

		if (*p != '&') {
			continue;
		}

		for (k = 0; k < sizeof(h2a_entities) / sizeof(h2a_entities[0]); ++k) {
			const struct h2a_entity *e = &h2a_entities[k];
			size_t nlen = strlen(e->name);
			size_t tlen = strlen(e->text);
			int differs = e->exact_case ? strncmp(p, e->name, nlen)
						    : strncasecmp(p, e->name, nlen);

			if (differs == 0) {
				memcpy(p, e->text, tlen);
				memmove(p + tlen, p + nlen, left - nlen + 1);
				line->len -= (nlen - tlen);
				hit = 1;
				break;
			}
		}
		if (hit || (p[1] != '#')) {
			continue;
		}

		if ((left > 4) && (p[4] == ';')) {
			span = 5;	/* two-digit decimal equivalents */
		}
		else if ((left > 5) && (p[5] == ';')) {
			span = 6;	/* three-digit decimal equivalents */
		}
		if (span != 0) {
			int code = 0;

			if (sscanf(p + 2, (span == 5) ? "%02d" : "%03d", &code) != 1) {
				code = 0;
			}
			*p = (char) code;
			memmove(p + 1, p + span, left - span + 1);
			line->len -= (span - 1);
			if (*p == '\0') {
				/* A zero byte ends the pending text right here. */
				line->len = i;
				break;
			}
		}
	}
}

/*
 * Move finished lines from the pending text to the result: first everything
 * up to each hard line break found within the first wrap_col columns, then at
 * most one soft-wrapped line.  Does nothing when wrap_col is 0.
 */
static void h2a_emit_lines(struct h2a_buf *out, struct h2a_state *st,
			   size_t wrap_col, int do_citaformat)
{
	struct h2a_buf *line = &st->line;
	size_t i;
	int did_out;

	if (wrap_col == 0) {
		return;
	}

	/* Output any lines terminated with hard line breaks */
	do {
		did_out = 0;
		for (i = 0; i < line->len; ++i) {
			if ((i < wrap_col) && (line->data[i] == '\n')) {
				h2a_append(out, line->data, i + 1);
				if (do_citaformat) {
					h2a_append(out, " ", 1);
				}
				h2a_drop_front(line, i + 1);
				/*
				 * The loop increment makes this 1, so byte 0
				 * of the remainder is only looked at on the
				 * next round of the enclosing do/while.
				 */
				i = 0;
				did_out = 1;
			}
		}
	} while (did_out);

	/* Add soft line breaks */
	if (line->len > wrap_col) {
		size_t cut = wrap_col;	/* bytes to emit when there is no space */
		int at_space = 0;

		for (i = 0; i < wrap_col; ++i) {
			if (line->data[i] == ' ') {
				cut = i;	/* remember the last space in range */
				at_space = 1;
			}
		}
		h2a_append(out, line->data, cut);
		h2a_append_str(out, st->nl);
		if (do_citaformat) {
			h2a_append(out, " ", 1);
		}
		/* A space used as the break point is swallowed. */
		h2a_drop_front(line, at_space ? (cut + 1) : cut);
	}
}

char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
	struct h2a_state st;
	struct h2a_buf out;
	char tag[H2A_TAG_MAX] = "";
	size_t in_len, limit, wrap_col, pos, first, last;
	int nest = 0;		/* Bracket nesting level */

	if (inputmsg == NULL) return NULL;

	in_len = strlen(inputmsg);
	limit = ((msglen > 0) && ((size_t) msglen < in_len)) ? (size_t) msglen : in_len;
	wrap_col = (screenwidth > 2) ? (size_t) (screenwidth - 2) : 0;

	out.len = 0;
	out.cap = in_len + H2A_OUT_SLACK;
	out.data = malloc(out.cap);
	if (out.data == NULL) return NULL;
	out.data[0] = '\0';

	st.line.len = 0;
	st.line.cap = H2A_LINE_INIT;
	st.line.data = malloc(st.line.cap);
	if (st.line.data == NULL) {
		free(out.data);
		return NULL;
	}
	st.line.data[0] = '\0';
	st.blockquote = 0;
	st.styletag = 0;
	st.style_start = 0;
	h2a_set_quote_prefix(&st);

	/*
	 * One pass per input byte, plus a final pass without input so that
	 * whatever the last byte produced is decoded and wrapped once more.
	 */
	for (pos = 0; pos <= limit; ++pos) {

		if (pos < limit) {
			char ch = inputmsg[pos];

			/* Fold in all the spacing */
			if ((ch == '\n') || (ch == '\r') || (ch == '\t')) {
				ch = ' ';
			}

			if (ch == '<') {
				++nest;
				tag[0] = '\0';
			}
			else if ((ch == '>') && (nest > 0)) {	/* We have a tag. */
				--nest;
				h2a_apply_tag(&st, tag, wrap_col);
			}
			else if (nest > 0) {
				/* Inside a tag: collect its text, dropping any overflow */
				size_t tag_len = strlen(tag);

				if (tag_len < (sizeof(tag) - 1)) {
					tag[tag_len] = ch;
					tag[tag_len + 1] = '\0';
				}
			}
			else {
				/* Ordinary text, including a '>' that closes nothing */
				h2a_append(&st.line, &ch, 1);
			}
		}

		/* Convert &; tags to the forbidden characters */
		h2a_decode_entities(&st.line);

		h2a_emit_lines(&out, &st, wrap_col, do_citaformat);
	}

	h2a_append(&out, st.line.data, st.line.len);
	free(st.line.data);

	/* Strip leading/trailing whitespace with a single move. */
	first = 0;
	while ((first < out.len) && isspace((unsigned char) out.data[first])) {
		++first;
	}
	last = out.len;
	while ((last > first) && isspace((unsigned char) out.data[last - 1])) {
		--last;
	}
	memmove(out.data, out.data + first, last - first);
	out.len = last - first;
	out.data[out.len] = '\0';

	/* Nothing ends in a newline after the strip, so always add one. */
	h2a_append(&out, "\n", 1);

	return out.data;

}