Version number to 7.35 in preparation for an upcoming release.
[citadel.git] / libcitadel / lib / html_to_ascii.c
1 /*
2  * $Id: html.c 6014 2008-02-04 18:38:35Z ajc $
3  *
4  * Functions which handle translation between HTML and plain text
5  * Copyright (c) 2000-2005 by Art Cancro and others.   This program is
6  * released under the terms of the GNU General Public License.
7  */
8
9 #include <stdlib.h>
10 #include <unistd.h>
11 #include <stdio.h>
12 #include <signal.h>
13 #include <sys/types.h>
14 #include <ctype.h>
15 #include <string.h>
16 #include <sys/stat.h>
17 #include <errno.h>
18 #include <limits.h>
19
20 #if TIME_WITH_SYS_TIME
21 # include <sys/time.h>
22 # include <time.h>
23 #else
24 # if HAVE_SYS_TIME_H
25 #  include <sys/time.h>
26 # else
27 #  include <time.h>
28 # endif
29 #endif
30
31 #include "libcitadel.h"
32  
33
/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message
 * msglen        = number of bytes of inputmsg to convert; when it is zero or
 *                 negative the length is measured with strlen().  Conversion
 *                 always stops early at a NUL byte.
 * screenwidth   = desired output screenwidth; lines are broken at
 *                 (screenwidth - 2) columns, with a minimum of one column
 * do_citaformat = set to 1 to indent newlines with spaces
 *
 * Returns a malloc()ed, NUL-terminated string which the caller must free().
 * The result has no leading or trailing whitespace other than one final
 * newline.  Returns NULL if inputmsg is NULL or memory cannot be allocated.
 *
 * CR, LF and TAB in the input each become one space; runs of spaces are kept
 * as they are.  A '>' that does not close a tag is ordinary text.  Everything
 * between <STYLE> and </STYLE> is discarded.
 */

/* Growable byte buffer; data is NUL-terminated once something was appended. */
struct h2a_buf {
	char *data;
	size_t len;		/* bytes in use, not counting the terminator */
	size_t cap;		/* bytes allocated */
};

/* One character entity reference and the text that replaces it. */
struct h2a_entity {
	const char *name;
	const char *text;
	int exact_case;		/* nonzero: name must match case-sensitively */
};

/*
 * Order matters: each case-sensitive capital form is listed ahead of the
 * case-insensitive lowercase form that would otherwise swallow it.
 */
static const struct h2a_entity h2a_entities[] = {
	{ "&nbsp;",   " ",    0 },
	{ "&ensp;",   " ",    0 },
	{ "&emsp;",   " ",    0 },
	{ "&thinsp;", " ",    0 },
	{ "&lt;",     "<",    0 },
	{ "&gt;",     ">",    0 },
	{ "&amp;",    "&",    0 },
	{ "&quot;",   "\"",   0 },
	{ "&lsquo;",  "`",    0 },
	{ "&rsquo;",  "'",    0 },
	{ "&copy;",   "(c)",  0 },
	{ "&bull;",   " * ",  0 },
	{ "&hellip;", "...",  0 },
	{ "&trade;",  "(tm)", 0 },
	{ "&reg;",    "(r)",  0 },
	{ "&frac14;", "1/4",  0 },
	{ "&frac12;", "1/2",  0 },
	{ "&frac34;", "3/4",  0 },
	{ "&ndash;",  "--",   0 },
	{ "&mdash;",  "---",  0 },
	{ "&Ccedil;", "C",    1 },
	{ "&ccedil;", "c",    0 },
	{ "&Egrave;", "E",    1 },
	{ "&egrave;", "e",    0 },
	{ "&Ecirc;",  "E",    1 },
	{ "&ecirc;",  "e",    0 },
	{ "&Eacute;", "E",    1 },
	{ "&eacute;", "e",    0 },
	{ "&Agrave;", "A",    1 },
	{ "&agrave;", "a",    0 },
	{ "&ldquo;",  "\"",   0 },
	{ "&rdquo;",  "\"",   0 },
	{ "&acute;",  "'",    0 }
};

/* A tag whose only effect is to emit a fixed number of line breaks. */
struct h2a_break_tag {
	const char *name;
	int breaks;
};

static const struct h2a_break_tag h2a_break_tags[] = {
	{ "P",      2 },
	{ "/DIV",   2 },
	{ "/UL",    2 },
	{ "H1",     2 },
	{ "H2",     2 },
	{ "H3",     2 },
	{ "H4",     2 },
	{ "/H1",    1 },
	{ "/H2",    1 },
	{ "/H3",    1 },
	{ "/H4",    1 },
	{ "BR",     1 },
	{ "TR",     1 },
	{ "/TABLE", 1 }
};

/* Everything the converter carries from one input character to the next. */
struct h2a_state {
	struct h2a_buf out;	/* finished lines */
	struct h2a_buf line;	/* text not yet broken into lines */
	size_t width;		/* wrap column */
	size_t amp_pos;		/* offset in line of an undecoded '&' */
	int amp_valid;		/* nonzero while amp_pos is meaningful */
	int do_citaformat;
	int blockquote;		/* BLOCKQUOTE nesting level, never negative */
	int styletag;		/* STYLE nesting level, never negative */
	int failed;		/* set once an allocation has failed */
	size_t nl_len;
	char nl[128];		/* line break: '\n' plus one '>' per quote level */
};

/* Make room for `extra` more bytes plus a terminator; 0 on success, -1 on failure. */
static int h2a_reserve(struct h2a_buf *b, size_t extra)
{
	const size_t max = (size_t)-1;
	size_t need;
	size_t cap;
	char *tmp;

	if (extra >= max - b->len) {
		return -1;
	}
	need = b->len + extra + 1;
	if (need <= b->cap) {
		return 0;
	}
	cap = (b->cap > 0) ? b->cap : 64;
	while (cap < need) {
		cap = (cap > max / 2) ? need : cap * 2;
	}
	/* Keep the old block reachable if realloc() fails. */
	tmp = realloc(b->data, cap);
	if (tmp == NULL) {
		return -1;
	}
	b->data = tmp;
	b->cap = cap;
	return 0;
}

/* Append n bytes and re-terminate; 0 on success, -1 on failure. */
static int h2a_append(struct h2a_buf *b, const char *src, size_t n)
{
	if (h2a_reserve(b, n) != 0) {
		return -1;
	}
	memcpy(b->data + b->len, src, n);
	b->len += n;
	b->data[b->len] = '\0';
	return 0;
}

/* Drop the first n bytes.  The regions overlap, hence memmove(). */
static void h2a_consume(struct h2a_buf *b, size_t n)
{
	if (n > b->len) {
		n = b->len;
	}
	memmove(b->data, b->data + n, b->len - n + 1);
	b->len -= n;
}

/* Add generated text to the pending line unless inside a STYLE element. */
static void h2a_emit(struct h2a_state *st, const char *s, size_t n)
{
	if (st->failed || (st->styletag > 0)) {
		return;
	}
	if (h2a_append(&st->line, s, n) != 0) {
		st->failed = 1;
	}
}

/* Rebuild the line-break string for the current BLOCKQUOTE depth. */
static void h2a_set_quote_prefix(struct h2a_state *st)
{
	size_t marks = (st->blockquote > 0) ? (size_t)st->blockquote : 0;

	/* Deeper nesting is still counted, but the prefix stops growing. */
	if (marks > sizeof(st->nl) - 2) {
		marks = sizeof(st->nl) - 2;
	}
	st->nl[0] = '\n';
	memset(st->nl + 1, '>', marks);
	st->nl[1 + marks] = '\0';
	st->nl_len = 1 + marks;
}

/*
 * Replace the entity that ends the pending line, if it is one we know.
 * Only called right after a ';' was appended, so the candidate runs from
 * amp_pos to the end of the line and a replacement is a simple truncation.
 */
static void h2a_decode_entity(struct h2a_state *st)
{
	char *ent = st->line.data + st->amp_pos;
	size_t ent_len = st->line.len - st->amp_pos;
	const char *subst = NULL;
	char numeric[2];
	size_t sub_len;
	size_t i;

	for (i = 0; i < sizeof(h2a_entities) / sizeof(h2a_entities[0]); ++i) {
		const struct h2a_entity *e = &h2a_entities[i];

		if (strlen(e->name) != ent_len) {
			continue;
		}
		if (e->exact_case ? !strncmp(ent, e->name, ent_len)
				  : !strncasecmp(ent, e->name, ent_len)) {
			subst = e->text;
			break;
		}
	}

	/* Two- and three-digit decimal references: &#dd; and &#ddd; */
	if ((subst == NULL) && ((ent_len == 5) || (ent_len == 6)) && (ent[1] == '#')) {
		int value = 0;

		for (i = 2; i < ent_len - 1; ++i) {
			if (!isdigit((unsigned char)ent[i])) {
				return;
			}
			value = (value * 10) + (ent[i] - '0');
		}
		/* Zero would cut the line short; above 255 does not fit a byte. */
		if ((value < 1) || (value > 255)) {
			return;
		}
		numeric[0] = (char)(unsigned char)value;
		numeric[1] = '\0';
		subst = numeric;
	}

	if (subst == NULL) {
		return;
	}

	/* Every replacement is shorter than the reference it replaces. */
	sub_len = strlen(subst);
	memcpy(ent, subst, sub_len);
	st->line.len = st->amp_pos + sub_len;
	st->line.data[st->line.len] = '\0';
}

/* Add one character of document text, decoding an entity it may complete. */
static void h2a_text_char(struct h2a_state *st, char ch)
{
	if (st->failed || (st->styletag > 0)) {
		return;
	}
	if (ch == '&') {
		st->amp_pos = st->line.len;
		st->amp_valid = 1;
	}
	if (h2a_append(&st->line, &ch, 1) != 0) {
		st->failed = 1;
		return;
	}
	if ((ch == ';') && st->amp_valid) {
		h2a_decode_entity(st);
		/*
		 * Forget the '&' either way.  In particular the '&' produced
		 * by "&amp;" must not start another entity, or "&amp;lt;"
		 * would be decoded twice.
		 */
		st->amp_valid = 0;
	}
}

/* Act on a complete tag.  The tag is cut at its first space (in place). */
static void h2a_handle_tag(struct h2a_state *st, char *tag)
{
	char *space = strchr(tag, ' ');
	size_t i;
	int j;

	if (space != NULL) {
		*space = '\0';
	}

	for (i = 0; i < sizeof(h2a_break_tags) / sizeof(h2a_break_tags[0]); ++i) {
		if (!strcasecmp(tag, h2a_break_tags[i].name)) {
			for (j = 0; j < h2a_break_tags[i].breaks; ++j) {
				h2a_emit(st, st->nl, st->nl_len);
			}
			return;
		}
	}

	if (!strcasecmp(tag, "LI")) {
		h2a_emit(st, st->nl, st->nl_len);
		h2a_emit(st, " * ", 3);
	}
	else if (!strcasecmp(tag, "HR")) {
		h2a_emit(st, st->nl, st->nl_len);
		h2a_emit(st, " ", 1);
		for (i = 0; i < st->width; ++i) {
			h2a_emit(st, "-", 1);
		}
		h2a_emit(st, st->nl, st->nl_len);
	}
	else if (!strcasecmp(tag, "BLOCKQUOTE")) {
		++st->blockquote;
		h2a_set_quote_prefix(st);
		h2a_emit(st, st->nl, st->nl_len);
	}
	else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
		h2a_emit(st, "\n", 1);
		if (st->blockquote > 0) {
			--st->blockquote;
		}
		h2a_set_quote_prefix(st);
		h2a_emit(st, st->nl, st->nl_len);
	}
	else if (!strcasecmp(tag, "STYLE")) {
		++st->styletag;
	}
	else if (!strcasecmp(tag, "/STYLE")) {
		if (st->styletag > 0) {
			--st->styletag;
		}
	}
}

/*
 * Move the first `take` bytes of the pending line to the output, optionally
 * followed by a line break, then drop the first `drop` bytes of the line.
 */
static void h2a_commit(struct h2a_state *st, size_t take, int add_break, size_t drop)
{
	if (h2a_append(&st->out, st->line.data, take) != 0) {
		st->failed = 1;
		return;
	}
	if (add_break && (h2a_append(&st->out, st->nl, st->nl_len) != 0)) {
		st->failed = 1;
		return;
	}
	if (st->do_citaformat && (h2a_append(&st->out, " ", 1) != 0)) {
		st->failed = 1;
		return;
	}
	h2a_consume(&st->line, drop);

	/* Keep the entity marker pointing at the same '&', if it survived. */
	if (st->amp_valid) {
		if (st->amp_pos >= drop) {
			st->amp_pos -= drop;
		} else {
			st->amp_valid = 0;
		}
	}
}

/* Move every complete line out of the pending buffer. */
static void h2a_flush_lines(struct h2a_state *st)
{
	while (!st->failed) {
		size_t limit = (st->line.len < st->width) ? st->line.len : st->width;
		char *hard = memchr(st->line.data, '\n', limit);
		size_t i;

		/* Hard break: a newline inside the first `width` columns. */
		if (hard != NULL) {
			size_t n = (size_t)(hard - st->line.data) + 1;

			h2a_commit(st, n, 0, n);
			continue;
		}

		if (st->line.len <= st->width) {
			return;
		}

		/* Soft break: at the last space that fits, else mid-word. */
		for (i = st->width; i > 0; --i) {
			if (st->line.data[i - 1] == ' ') {
				break;
			}
		}
		if (i > 0) {
			h2a_commit(st, i - 1, 1, i);	/* the space itself is dropped */
		} else {
			h2a_commit(st, st->width, 1, st->width);
		}
	}
}

char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
	struct h2a_state st;
	char tag[1024];
	size_t tag_len = 0;
	size_t input_len;
	size_t lead;
	size_t i;
	int nest = 0;		/* Bracket nesting level */
	char ch;

	if (inputmsg == NULL) return NULL;

	memset(&st, 0, sizeof st);
	st.width = (screenwidth > 2) ? (size_t)(screenwidth - 2) : 1;
	st.do_citaformat = do_citaformat;
	h2a_set_quote_prefix(&st);
	tag[0] = '\0';

	input_len = (msglen > 0) ? (size_t)msglen : strlen(inputmsg);

	/* Pre-size the output for the common case; it grows on demand. */
	if ( (h2a_reserve(&st.out, input_len) != 0)
	   || (h2a_append(&st.out, "", 0) != 0)
	   || (h2a_append(&st.line, "", 0) != 0) ) {
		st.failed = 1;
	}

	for (i = 0; (i < input_len) && !st.failed; ++i) {
		ch = inputmsg[i];
		if (ch == '\0') break;

		/* Line ends and tabs in the source are just spacing */
		if ((ch == '\n') || (ch == '\r') || (ch == '\t')) ch = ' ';

		if (ch == '<') {
			++nest;
			tag_len = 0;
			tag[0] = '\0';
		}
		else if ((ch == '>') && (nest > 0)) {	/* We have a tag. */
			--nest;
			h2a_handle_tag(&st, tag);
			/* The tag may have been cut at a space. */
			tag_len = strlen(tag);
		}
		else if (nest > 0) {
			/* Characters beyond the tag buffer are ignored. */
			if (tag_len < (sizeof(tag) - 1)) {
				tag[tag_len++] = ch;
				tag[tag_len] = '\0';
			}
		}
		else {
			h2a_text_char(&st, ch);
		}

		h2a_flush_lines(&st);
	}

	/* Whatever is still pending becomes the last line. */
	if (!st.failed && (h2a_append(&st.out, st.line.data, st.line.len) != 0)) {
		st.failed = 1;
	}
	free(st.line.data);

	if (st.failed) {
		free(st.out.data);
		return NULL;
	}

	/* Strip leading/trailing whitespace, then end with one newline. */
	lead = 0;
	while ((lead < st.out.len) && isspace((unsigned char)st.out.data[lead])) {
		++lead;
	}
	h2a_consume(&st.out, lead);
	while ((st.out.len > 0) && isspace((unsigned char)st.out.data[st.out.len - 1])) {
		st.out.data[--st.out.len] = '\0';
	}
	if (h2a_append(&st.out, "\n", 1) != 0) {
		free(st.out.data);
		return NULL;
	}

	return st.out.data;
}