e7d7fc5dd96a772a717a4c3ec8531d1d1f7eec06
[citadel.git] / libcitadel / lib / html_to_ascii.c
1 /*
2  * $Id: html.c 6014 2008-02-04 18:38:35Z ajc $
3  *
4  * Functions which handle translation between HTML and plain text
5  * Copyright (c) 2000-2005 by Art Cancro and others.   This program is
6  * released under the terms of the GNU General Public License.
7  */
8
9 #include <stdlib.h>
10 #include <unistd.h>
11 #include <stdio.h>
12 #include <signal.h>
13 #include <sys/types.h>
14 #include <ctype.h>
15 #include <string.h>
16 #include <sys/stat.h>
17 #include <errno.h>
18 #include <limits.h>
19
20 #if TIME_WITH_SYS_TIME
21 # include <sys/time.h>
22 # include <time.h>
23 #else
24 # if HAVE_SYS_TIME_H
25 #  include <sys/time.h>
26 # else
27 #  include <time.h>
28 # endif
29 #endif
30
31 #include "libcitadel.h"
32  
33
/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message (NUL-terminated unless msglen
 *                 is positive; NULL is treated as an empty message)
 * msglen        = number of bytes of inputmsg to convert; zero or negative
 *                 means "up to the terminating NUL".  Conversion always stops
 *                 at the first NUL byte.
 * screenwidth   = desired output screenwidth.  Text is wrapped at
 *                 (screenwidth - 2) columns; a screenwidth of 2 or less
 *                 disables wrapping altogether.
 * do_citaformat = set to nonzero to write one blank after every line break
 *                 that is committed to the output
 *
 * Returns a malloc()ed, NUL-terminated string which the caller must free().
 * Leading and trailing whitespace is removed and the text always ends with
 * exactly one newline.  Returns NULL if memory cannot be allocated.
 */

/* Fixed sizes used by the converter */
enum {
	HTA_LINE_MAX   = 4096,	/* pending (not yet wrapped) text */
	HTA_TAG_MAX    = 1024,	/* text collected between '<' and '>' */
	HTA_NL_MAX     = 128,	/* newline plus blockquote markers */
	HTA_ENTITY_MAX = 8	/* longest recognized entity, e.g. "&thinsp;" */
};

/* Working state shared by the helpers below */
struct hta_state {
	char *out;		/* result buffer (heap) */
	size_t out_len;		/* bytes used in out, excluding the NUL */
	size_t out_cap;		/* bytes allocated for out; always > out_len */
	int failed;		/* nonzero once an allocation has failed */

	char line[HTA_LINE_MAX];	/* text waiting to be wrapped */
	size_t line_len;	/* bytes used in line, excluding the NUL */
	size_t decoded;		/* line[0..decoded) came from an entity and
				 * must not be scanned for entities again */

	char nl[HTA_NL_MAX];	/* "\n" followed by one '>' per quote level */
	size_t nl_len;

	size_t wrap;		/* wrap column (screenwidth - 2), 0 = none */
	int cita;		/* nonzero: write a blank after line breaks */
	int blockquote;		/* BLOCKQUOTE nesting level */
	int styletag;		/* STYLE nesting level; text is dropped while > 0 */
};

/* One named character entity and its plain text replacement */
struct hta_entity {
	const char *name;	/* complete entity, including '&' and ';' */
	const char *text;	/* replacement; never longer than name */
	int exact_case;		/* nonzero: name must match case-sensitively */
};

/*
 * Checked in order; the capitalized accented letters are listed before
 * their lowercase forms because the latter match without regard to case.
 */
static const struct hta_entity hta_entities[] = {
	{ "&nbsp;",   " ",    0 },
	{ "&ensp;",   " ",    0 },
	{ "&emsp;",   " ",    0 },
	{ "&thinsp;", " ",    0 },
	{ "&lt;",     "<",    0 },
	{ "&gt;",     ">",    0 },
	{ "&amp;",    "&",    0 },
	{ "&quot;",   "\"",   0 },
	{ "&lsquo;",  "`",    0 },
	{ "&rsquo;",  "'",    0 },
	{ "&copy;",   "(c)",  0 },
	{ "&bull;",   " * ",  0 },
	{ "&hellip;", "...",  0 },
	{ "&trade;",  "(tm)", 0 },
	{ "&reg;",    "(r)",  0 },
	{ "&frac14;", "1/4",  0 },
	{ "&frac12;", "1/2",  0 },
	{ "&frac34;", "3/4",  0 },
	{ "&ndash;",  "--",   0 },
	{ "&mdash;",  "---",  0 },
	{ "&Ccedil;", "C",    1 },
	{ "&ccedil;", "c",    0 },
	{ "&Egrave;", "E",    1 },
	{ "&egrave;", "e",    0 },
	{ "&Ecirc;",  "E",    1 },
	{ "&ecirc;",  "e",    0 },
	{ "&Eacute;", "E",    1 },
	{ "&eacute;", "e",    0 },
	{ "&Agrave;", "A",    1 },
	{ "&agrave;", "a",    0 },
	{ "&ldquo;",  "\"",   0 },
	{ "&rdquo;",  "\"",   0 },
	{ "&acute;",  "'",    0 }
};

/* Append n bytes to the result, growing it as needed.  On allocation
 * failure the result is freed and st->failed is set. */
static void hta_out_write(struct hta_state *st, const char *data, size_t n)
{
	if (st->failed || (n == 0)) return;

	/* Room is needed for n bytes plus the terminating NUL */
	if (n >= st->out_cap - st->out_len) {
		size_t new_cap = st->out_cap;
		char *grown = NULL;

		while ((new_cap != 0) && (n >= new_cap - st->out_len)) {
			if (new_cap > ((size_t)-1) / 2) new_cap = 0;	/* would overflow */
			else new_cap *= 2;
		}
		if (new_cap != 0) grown = realloc(st->out, new_cap);
		if (grown == NULL) {
			free(st->out);
			st->out = NULL;
			st->failed = 1;
			return;
		}
		st->out = grown;
		st->out_cap = new_cap;
	}

	memcpy(st->out + st->out_len, data, n);
	st->out_len += n;
	st->out[st->out_len] = '\0';
}

/* Remove the first n bytes of the pending line */
static void hta_line_drop(struct hta_state *st, size_t n)
{
	if (n > st->line_len) n = st->line_len;
	memmove(st->line, st->line + n, st->line_len - n);	/* regions overlap */
	st->line_len -= n;
	st->line[st->line_len] = '\0';
	st->decoded = (st->decoded > n) ? (st->decoded - n) : 0;
}

/* Append n bytes to the pending line.  Nothing is stored while inside a
 * STYLE element.  If the line is full it is written out unwrapped first,
 * so the buffer can never be overrun. */
static void hta_line_append(struct hta_state *st, const char *s, size_t n)
{
	if (st->styletag > 0) return;

	if (n >= sizeof st->line - st->line_len) {
		hta_out_write(st, st->line, st->line_len);
		st->line_len = 0;
		st->decoded = 0;
		st->line[0] = '\0';
		if (n >= sizeof st->line) {
			hta_out_write(st, s, n);
			return;
		}
	}

	memcpy(st->line + st->line_len, s, n);
	st->line_len += n;
	st->line[st->line_len] = '\0';
}

/* Rebuild the line break string for the current blockquote level */
static void hta_build_newline(struct hta_state *st)
{
	size_t depth = (st->blockquote > 0) ? (size_t)st->blockquote : 0;

	/* Leave room for the leading newline and the terminating NUL */
	if (depth > HTA_NL_MAX - 2) depth = HTA_NL_MAX - 2;

	st->nl[0] = '\n';
	memset(st->nl + 1, '>', depth);
	st->nl_len = depth + 1;
	st->nl[st->nl_len] = '\0';
}

/* Act on one complete tag.  "tag" holds the text found between '<' and '>'
 * and is modified in place. */
static void hta_handle_tag(struct hta_state *st, char *tag)
{
	static const char *const double_break[] = {
		"P", "/DIV", "/UL", "H1", "H2", "H3", "H4"
	};
	static const char *const single_break[] = {
		"/H1", "/H2", "/H3", "/H4", "BR", "TR", "/TABLE"
	};
	char *blank;
	size_t len, k;

	/* Unqualify the tag (truncate at first space) */
	blank = strchr(tag, ' ');
	if (blank != NULL) *blank = '\0';

	/* Accept the self-closing form, e.g. "br/" */
	len = strlen(tag);
	if ((len > 1) && (tag[len - 1] == '/')) tag[len - 1] = '\0';

	for (k = 0; k < sizeof double_break / sizeof double_break[0]; ++k) {
		if (!strcasecmp(tag, double_break[k])) {
			hta_line_append(st, st->nl, st->nl_len);
			hta_line_append(st, st->nl, st->nl_len);
			return;
		}
	}

	for (k = 0; k < sizeof single_break / sizeof single_break[0]; ++k) {
		if (!strcasecmp(tag, single_break[k])) {
			hta_line_append(st, st->nl, st->nl_len);
			return;
		}
	}

	if (!strcasecmp(tag, "LI")) {
		hta_line_append(st, st->nl, st->nl_len);
		hta_line_append(st, " * ", 3);
	}

	else if (!strcasecmp(tag, "HR")) {
		/* A blank followed by a rule as wide as the wrap column */
		hta_line_append(st, st->nl, st->nl_len);
		hta_line_append(st, " ", 1);
		for (k = 0; k < st->wrap; ++k) {
			hta_line_append(st, "-", 1);
		}
		hta_line_append(st, st->nl, st->nl_len);
	}

	else if (!strcasecmp(tag, "BLOCKQUOTE")) {
		++st->blockquote;
		hta_build_newline(st);
		hta_line_append(st, st->nl, st->nl_len);
	}

	else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
		hta_line_append(st, "\n", 1);
		/* A stray closing tag must not cancel a later opening one */
		if (st->blockquote > 0) --st->blockquote;
		hta_build_newline(st);
		hta_line_append(st, st->nl, st->nl_len);
	}

	else if (!strcasecmp(tag, "STYLE")) {
		++st->styletag;
	}

	else if (!strcasecmp(tag, "/STYLE")) {
		if (st->styletag > 0) --st->styletag;
	}
}

/*
 * Called right after a ';' has been appended to the pending line: if the
 * line now ends in a recognized entity, replace it with plain text.
 * Text that was itself produced by an entity is never decoded again, so
 * "&amp;lt;" yields "&lt;" rather than "<".
 */
static void hta_decode_entity(struct hta_state *st)
{
	size_t start, elen, k;
	const char *ent;

	if ((st->line_len == 0) || (st->line[st->line_len - 1] != ';')) return;

	/* Walk back to the nearest '&', giving up at the decoded boundary
	 * or once the candidate is longer than any known entity */
	start = st->line_len;
	for (;;) {
		if (start <= st->decoded) return;
		if ((st->line_len - start) >= HTA_ENTITY_MAX) return;
		--start;
		if (st->line[start] == '&') break;
	}
	ent = st->line + start;
	elen = st->line_len - start;

	/* Character entity references */
	for (k = 0; k < sizeof hta_entities / sizeof hta_entities[0]; ++k) {
		const struct hta_entity *e = &hta_entities[k];
		size_t text_len;
		int differs;

		if (strlen(e->name) != elen) continue;
		differs = e->exact_case ? strncmp(ent, e->name, elen)
					: strncasecmp(ent, e->name, elen);
		if (differs) continue;

		text_len = strlen(e->text);
		memcpy(st->line + start, e->text, text_len);
		st->line_len = start + text_len;
		st->line[st->line_len] = '\0';
		st->decoded = st->line_len;
		return;
	}

	/* two- and three-digit decimal equivalents: "&#NN;" and "&#NNN;" */
	if (((elen == 5) || (elen == 6)) && (ent[1] == '#')) {
		int value = 0;
		unsigned char byte;

		for (k = 2; k < elen - 1; ++k) {
			if (!isdigit((unsigned char)ent[k])) return;
			value = (value * 10) + (ent[k] - '0');
		}
		/* Only values that fit in one nonzero byte are substituted */
		if ((value < 1) || (value > 255)) return;

		byte = (unsigned char)value;
		st->line[start] = (char)byte;
		st->line_len = start + 1;
		st->line[st->line_len] = '\0';
		st->decoded = st->line_len;
	}
}

/* Move finished text from the pending line to the result: first every line
 * ending in a hard break, then at most one soft break if the remainder is
 * still wider than the wrap column. */
static void hta_commit(struct hta_state *st)
{
	size_t i, cut;
	int did_out, found;

	if (st->wrap == 0) return;		/* wrapping disabled */

	/* Output any lines terminated with hard line breaks.  After a line
	 * has been written the scan resumes at the second byte of what is
	 * left; the outer loop then rescans from the start. */
	do {
		did_out = 0;
		for (i = 0; (i < st->line_len) && (i < st->wrap); ++i) {
			if (st->line[i] == '\n') {
				hta_out_write(st, st->line, i + 1);
				if (st->cita) hta_out_write(st, " ", 1);
				hta_line_drop(st, i + 1);
				i = 0;
				did_out = 1;
			}
		}
	} while (did_out);

	/* Add soft line breaks */
	if (st->line_len > st->wrap) {
		/* Break at the last blank before the wrap column, or in the
		 * middle of the word if there is none */
		cut = st->wrap;
		found = 0;
		for (i = 0; i < st->wrap; ++i) {
			if (st->line[i] == ' ') {
				cut = i;
				found = 1;
			}
		}
		hta_out_write(st, st->line, cut);
		hta_out_write(st, st->nl, st->nl_len);
		if (st->cita) hta_out_write(st, " ", 1);
		hta_line_drop(st, found ? (cut + 1) : cut);	/* swallow the blank */
	}
}

char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
	struct hta_state st;
	char tag[HTA_TAG_MAX];
	size_t tag_len = 0;
	size_t len = 0;
	size_t pos, lead;
	int nest = 0;		/* Bracket nesting level */
	char ch;

	/* Work out how many bytes to convert without reading past msglen */
	if (inputmsg != NULL) {
		if (msglen > 0) {
			const char *end = memchr(inputmsg, '\0', (size_t)msglen);
			len = (end != NULL) ? (size_t)(end - inputmsg) : (size_t)msglen;
		}
		else {
			len = strlen(inputmsg);
		}
	}

	st.out_len = 0;
	st.out_cap = len + HTA_LINE_MAX;
	st.failed = 0;
	st.line_len = 0;
	st.decoded = 0;
	st.line[0] = '\0';
	st.wrap = (screenwidth > 2) ? (size_t)(screenwidth - 2) : 0;
	st.cita = (do_citaformat != 0);
	st.blockquote = 0;
	st.styletag = 0;
	hta_build_newline(&st);

	st.out = malloc(st.out_cap);
	if (st.out == NULL) return NULL;
	st.out[0] = '\0';

	for (pos = 0; (pos < len) && !st.failed; ++pos) {
		ch = inputmsg[pos];

		/* Fold in all the spacing */
		if ((ch == '\n') || (ch == '\r') || (ch == '\t')) ch = ' ';

		if (ch == '<') {
			++nest;
			tag_len = 0;
		}

		else if ((ch == '>') && (nest > 0)) {	/* We have a tag. */
			--nest;
			tag[tag_len] = '\0';
			hta_handle_tag(&st, tag);
			tag_len = 0;
		}

		else if (nest > 0) {
			/* Inside a tag; anything beyond the buffer is dropped */
			if (tag_len < (sizeof tag - 1)) tag[tag_len++] = ch;
		}

		else if (st.styletag == 0) {
			/* Ordinary text, including a '>' that closes nothing */
			hta_line_append(&st, &ch, 1);
			if (ch == ';') hta_decode_entity(&st);
		}

		hta_commit(&st);
	}

	/* One more pass for whatever the last character left behind, then
	 * the remainder goes out as it is */
	hta_commit(&st);
	hta_out_write(&st, st.line, st.line_len);
	if (st.failed) return NULL;

	/* Strip leading/trailing whitespace */
	lead = 0;
	while ((lead < st.out_len) && isspace((unsigned char)st.out[lead])) {
		++lead;
	}
	if (lead > 0) {
		memmove(st.out, st.out + lead, st.out_len - lead);
		st.out_len -= lead;
	}
	while ((st.out_len > 0) && isspace((unsigned char)st.out[st.out_len - 1])) {
		--st.out_len;
	}
	st.out[st.out_len] = '\0';

	/* The result always ends with exactly one newline */
	hta_out_write(&st, "\n", 1);

	return st.out;		/* NULL if the last write could not allocate */
}