initialize 'tag' - value as valgrind tells us
[citadel.git] / libcitadel / lib / html_to_ascii.c
1 /*
2  * Functions which handle translation between HTML and plain text
3  * Copyright (c) 2000-2010 by the citadel.org team
4  *
5  * This program is open source software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  */
19
20 #include <stdlib.h>
21 #include <unistd.h>
22 #include <stdio.h>
23 #include <signal.h>
24 #include <sys/types.h>
25 #include <ctype.h>
26 #include <string.h>
27 #include <sys/stat.h>
28 #include <errno.h>
29 #include <limits.h>
30
31 #if TIME_WITH_SYS_TIME
32 # include <sys/time.h>
33 # include <time.h>
34 #else
35 # if HAVE_SYS_TIME_H
36 #  include <sys/time.h>
37 # else
38 #  include <time.h>
39 # endif
40 #endif
41
42 #include "libcitadel.h"
43  
44
/*
 * Private sizing limits for html_to_ascii().
 *
 * H2A_WRAP_MAX is chosen so that a pending line at the high-water mark
 * (see h2a_settle) plus the largest single burst of generated text
 * (an HR rule: wrap dashes, one space and two line prefixes) always
 * fits in H2A_LINE_MAX bytes.
 */
enum {
	H2A_LINE_MAX = 8192,	/* capacity of the pending-line buffer, NUL included */
	H2A_TAG_MAX  = 1024,	/* capacity of the tag accumulator, NUL included */
	H2A_NL_MAX   = 128,	/* capacity of the line prefix: "\n" plus up to 126 '>' */
	H2A_WRAP_MAX = 3840	/* widest wrap column that is honoured */
};

/* Working state shared by the html_to_ascii() helpers. */
struct h2a_state {
	char *out;		/* finished text (heap); NUL is added only at the very end */
	size_t out_len;		/* bytes used in out */
	size_t out_cap;		/* bytes allocated for out; always >= out_len + 2 */
	int failed;		/* set once an allocation fails; out is then NULL */
	char line[H2A_LINE_MAX];	/* pending text not yet committed; always NUL-terminated */
	size_t line_len;	/* strlen(line) */
	char nl[H2A_NL_MAX];	/* current line break: "\n" plus one '>' per quote level */
	size_t nl_len;		/* strlen(nl) */
	size_t wrap;		/* column at which pending text is broken (>= 1) */
	size_t rule;		/* number of dashes drawn for an HR tag */
	int citaformat;		/* nonzero: emit one space after every line break */
	int blockquote;		/* BLOCKQUOTE nesting level, never negative */
	int styletag;		/* STYLE nesting level, never negative */
};

/* One named character entity and the plain text that replaces it. */
struct h2a_entity {
	const char *name;	/* entity as written, from '&' through ';' */
	const char *text;	/* replacement, never longer than name */
	int exact_case;		/* nonzero: name must match case-sensitively */
};

/*
 * Entities are tried in this order.  The case-sensitive capital forms
 * must stay ahead of their case-insensitive lowercase twins, and the
 * two explicit numeric references ahead of the generic numeric rule.
 */
static const struct h2a_entity h2a_entities[] = {
	{ "&nbsp;",   " ",    0 },
	{ "&ensp;",   " ",    0 },
	{ "&emsp;",   " ",    0 },
	{ "&thinsp;", " ",    0 },
	{ "&lt;",     "<",    0 },
	{ "&gt;",     ">",    0 },
	{ "&amp;",    "&",    0 },
	{ "&quot;",   "\"",   0 },
	{ "&lsquo;",  "`",    0 },
	{ "&rsquo;",  "'",    0 },
	{ "&copy;",   "(c)",  0 },
	{ "&bull;",   " * ",  0 },
	{ "&hellip;", "...",  0 },
	{ "&trade;",  "(tm)", 0 },
	{ "&reg;",    "(r)",  0 },
	{ "&frac14;", "1/4",  0 },
	{ "&frac12;", "1/2",  0 },
	{ "&frac34;", "3/4",  0 },
	{ "&ndash;",  "--",   0 },
	{ "&mdash;",  "---",  0 },
	{ "&Ccedil;", "C",    1 },
	{ "&ccedil;", "c",    0 },
	{ "&Egrave;", "E",    1 },
	{ "&egrave;", "e",    0 },
	{ "&Ecirc;",  "E",    1 },
	{ "&ecirc;",  "e",    0 },
	{ "&Eacute;", "E",    1 },
	{ "&eacute;", "e",    0 },
	{ "&Agrave;", "A",    1 },
	{ "&agrave;", "a",    0 },
	{ "&ldquo;",  "\"",   0 },
	{ "&rdquo;",  "\"",   0 },
	{ "&acute;",  "'",    0 },
	{ "&#8217;",  "'",    0 },
	{ "&#8211;",  "-",    0 }
};

/* Append n bytes to the finished output, growing the heap buffer as needed. */
static void h2a_emit(struct h2a_state *st, const char *src, size_t n) {
	if (st->failed) return;

	/* Two bytes stay in reserve for the final "\n" and the terminator. */
	if (n > st->out_cap - st->out_len - 2) {
		size_t want = st->out_len + n + 2;
		size_t cap = st->out_cap;
		char *bigger;

		while (cap < want) cap *= 2;
		bigger = realloc(st->out, cap);
		if (bigger == NULL) {
			/* realloc leaves the old block alive on failure */
			free(st->out);
			st->out = NULL;
			st->failed = 1;
			return;
		}
		st->out = bigger;
		st->out_cap = cap;
	}
	memcpy(st->out + st->out_len, src, n);
	st->out_len += n;
}

/* Emit the single indent space that citadel format puts after a line break. */
static void h2a_indent(struct h2a_state *st) {
	if (st->citaformat) h2a_emit(st, " ", 1);
}

/* Append n bytes to the pending line; anything that would not fit is dropped. */
static void h2a_line_add(struct h2a_state *st, const char *src, size_t n) {
	size_t room = (sizeof st->line - 1) - st->line_len;

	if (n > room) n = room;
	memcpy(st->line + st->line_len, src, n);
	st->line_len += n;
	st->line[st->line_len] = '\0';
}

/* Remove the first n bytes of the pending line (n <= line_len). */
static void h2a_line_drop(struct h2a_state *st, size_t n) {
	/* the regions overlap, so this must be memmove; +1 carries the NUL */
	memmove(st->line, st->line + n, st->line_len - n + 1);
	st->line_len -= n;
}

/* Append the current line break (newline plus quote markers) count times. */
static void h2a_newlines(struct h2a_state *st, int count) {
	while (count-- > 0) h2a_line_add(st, st->nl, st->nl_len);
}

/* Rebuild the line break string from the current BLOCKQUOTE depth. */
static void h2a_rebuild_nl(struct h2a_state *st) {
	size_t shown = (size_t)st->blockquote;

	/* deeper nesting is still counted, but only this many markers fit */
	if (shown > (size_t)(H2A_NL_MAX - 2)) shown = (size_t)(H2A_NL_MAX - 2);
	st->nl[0] = '\n';
	memset(st->nl + 1, '>', shown);
	st->nl_len = shown + 1;
	st->nl[st->nl_len] = '\0';
}

/*
 * Act on one complete tag.  The tag text is everything between '<' and
 * '>'; it is cut at the first space so attributes are ignored.
 */
static void h2a_handle_tag(struct h2a_state *st, char *tag) {
	static const struct {
		const char *name;	/* tag name, compared without regard to case */
		int newlines;		/* line breaks to emit first */
		const char *text;	/* literal text to emit afterwards */
	} simple[] = {
		{ "P", 2, "" },       { "/DIV", 2, "" },
		{ "LI", 1, " * " },   { "/UL", 2, "" },
		{ "H1", 2, "" },      { "H2", 2, "" },
		{ "H3", 2, "" },      { "H4", 2, "" },
		{ "/H1", 1, "" },     { "/H2", 1, "" },
		{ "/H3", 1, "" },     { "/H4", 1, "" },
		{ "B", 0, "*" },      { "/B", 0, "*" },
		{ "STRONG", 0, "*" }, { "/STRONG", 0, "*" },
		{ "I", 0, "/" },      { "/I", 0, "/" },
		{ "EM", 0, "/" },     { "/EM", 0, "/" },
		{ "U", 0, "_" },      { "/U", 0, "_" },
		{ "BR", 1, "" },      { "TR", 1, "" },
		{ "/TABLE", 1, "" }
	};
	char *space = strchr(tag, ' ');
	size_t k;

	if (space != NULL) *space = '\0';

	if (!strcasecmp(tag, "STYLE")) {
		++st->styletag;
		return;
	}
	if (!strcasecmp(tag, "/STYLE")) {
		/* an unmatched close must not push the level below zero */
		if (st->styletag > 0) --st->styletag;
		return;
	}

	/* Everything inside STYLE is style sheet text, not markup. */
	if (st->styletag > 0) return;

	for (k = 0; k < sizeof simple / sizeof simple[0]; ++k) {
		if (!strcasecmp(tag, simple[k].name)) {
			h2a_newlines(st, simple[k].newlines);
			h2a_line_add(st, simple[k].text, strlen(simple[k].text));
			return;
		}
	}

	if (!strcasecmp(tag, "HR")) {
		h2a_newlines(st, 1);
		h2a_line_add(st, " ", 1);
		for (k = 0; k < st->rule; ++k) h2a_line_add(st, "-", 1);
		h2a_newlines(st, 1);
	}
	else if (!strcasecmp(tag, "BLOCKQUOTE")) {
		++st->blockquote;
		h2a_rebuild_nl(st);
		h2a_newlines(st, 1);
	}
	else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
		h2a_line_add(st, "\n", 1);
		if (st->blockquote > 0) --st->blockquote;
		h2a_rebuild_nl(st);
		h2a_newlines(st, 1);
	}
}

/*
 * Test whether s (which starts with '&' and is NUL-terminated) begins
 * with a recognised entity.  On a match the replacement is copied to
 * repl (at most 4 bytes), its length stored in *repl_len, and the number
 * of source bytes the entity occupies is returned.  Returns 0 otherwise.
 */
static size_t h2a_match_entity(const char *s, char *repl, size_t *repl_len) {
	size_t k;

	for (k = 0; k < sizeof h2a_entities / sizeof h2a_entities[0]; ++k) {
		const struct h2a_entity *e = &h2a_entities[k];
		size_t n = strlen(e->name);
		int differs = e->exact_case ? strncmp(s, e->name, n)
					    : strncasecmp(s, e->name, n);

		if (differs == 0) {
			*repl_len = strlen(e->text);
			memcpy(repl, e->text, *repl_len);
			return n;
		}
	}

	/* Decimal references with two, three or four digits: &#NN; ... &#NNNN; */
	if (s[1] == '#') {
		size_t digits = 0;
		unsigned int value = 0;

		while ((digits < 4) && isdigit((unsigned char)s[2 + digits])) {
			value = value * 10 + (unsigned int)(s[2 + digits] - '0');
			++digits;
		}
		if ((digits >= 2) && (s[2 + digits] == ';')) {
			/* only a non-NUL single byte can be stored; anything else becomes '?' */
			repl[0] = ((value >= 1) && (value <= 255))
				? (char)(unsigned char)value : '?';
			*repl_len = 1;
			return digits + 3;
		}
	}
	return 0;
}

/* Replace every complete character entity in the pending line, in place. */
static void h2a_decode_entities(struct h2a_state *st) {
	char *amp = memchr(st->line, '&', st->line_len);

	while (amp != NULL) {
		size_t pos = (size_t)(amp - st->line);
		char repl[8];
		size_t repl_len = 0;
		size_t used = h2a_match_entity(amp, repl, &repl_len);

		if (used > 0) {
			memcpy(amp, repl, repl_len);
			memmove(amp + repl_len, amp + used,
				st->line_len - (pos + used) + 1);
			st->line_len -= (used - repl_len);
		}

		/* resume one byte further on, whether or not anything was replaced */
		++pos;
		amp = (pos < st->line_len)
			? memchr(st->line + pos, '&', st->line_len - pos)
			: NULL;
	}
}

/* Commit every pending line that ends in a newline before the wrap column. */
static void h2a_flush_hard_breaks(struct h2a_state *st) {
	int flushed;

	do {
		size_t i;

		flushed = 0;
		for (i = 0; (i < st->line_len) && (i < st->wrap); ++i) {
			if (st->line[i] != '\n') continue;

			h2a_emit(st, st->line, i + 1);
			h2a_indent(st);
			h2a_line_drop(st, i + 1);

			/*
			 * The scan resumes at index 1 of the shifted buffer;
			 * a newline now sitting at index 0 is picked up by
			 * the next pass of the outer loop.
			 */
			i = 0;
			flushed = 1;
		}
	} while (flushed);
}

/*
 * If the pending line is longer than the wrap column, break it once:
 * at the last space inside the column when there is one (the space is
 * consumed), otherwise exactly at the column.
 */
static void h2a_wrap_once(struct h2a_state *st) {
	size_t cut = st->wrap;		/* bytes to commit */
	size_t skip = st->wrap;		/* bytes to remove from the pending line */
	size_t i;

	if (st->line_len <= st->wrap) return;

	for (i = 0; i < st->wrap; ++i) {
		if (st->line[i] == ' ') {
			cut = i;
			skip = i + 1;
		}
	}
	h2a_emit(st, st->line, cut);
	h2a_emit(st, st->nl, st->nl_len);
	h2a_indent(st);
	h2a_line_drop(st, skip);
}

/*
 * Post-process the pending line after one input byte: decode entities,
 * commit hard line breaks, then apply one soft wrap.  The commit steps
 * repeat only while the line is above the high-water mark, which keeps
 * room for the largest burst a single tag can generate.
 */
static void h2a_settle(struct h2a_state *st) {
	/* high_water >= wrap, so every repeat is guaranteed to shorten the line */
	size_t high_water = (sizeof st->line - 1) - (st->wrap + 2 * H2A_NL_MAX);

	h2a_decode_entities(st);
	do {
		h2a_flush_hard_breaks(st);
		h2a_wrap_once(st);
	} while ((st->line_len > high_water) && !st->failed);
}

/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message
 * msglen        = number of bytes of inputmsg to convert; conversion also
 *                 stops at the first NUL.  Zero or negative means "up to
 *                 the terminating NUL".
 * screenwidth   = desired output screenwidth; text is wrapped two columns
 *                 short of it.  Values of 2 or less wrap at the internal
 *                 maximum instead; larger values are clamped to it.
 * do_citaformat = set to 1 to indent newlines with spaces
 *
 * Returns a malloc()ed, NUL-terminated string which the caller must
 * free().  Leading and trailing whitespace is removed and non-empty
 * output ends in exactly one newline.  Returns NULL if inputmsg is NULL
 * or memory cannot be allocated.
 */
char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
	struct h2a_state st;
	char tag[H2A_TAG_MAX];
	size_t tag_len = 0;
	size_t in_len;
	size_t pos;
	size_t lead;
	int nest = 0;		/* Bracket nesting level */

	if (inputmsg == NULL) return NULL;

	/* Never look beyond msglen bytes, and never beyond a NUL. */
	if (msglen > 0) {
		const char *nul = memchr(inputmsg, '\0', (size_t)msglen);
		in_len = (nul != NULL) ? (size_t)(nul - inputmsg) : (size_t)msglen;
	}
	else {
		in_len = strlen(inputmsg);
	}

	st.rule = (screenwidth > 2) ? (size_t)(screenwidth - 2) : 0;
	if (st.rule > (size_t)H2A_WRAP_MAX) st.rule = (size_t)H2A_WRAP_MAX;
	st.wrap = (st.rule > 0) ? st.rule : (size_t)H2A_WRAP_MAX;
	st.citaformat = do_citaformat;
	st.blockquote = 0;
	st.styletag = 0;
	st.failed = 0;
	st.line[0] = '\0';
	st.line_len = 0;
	h2a_rebuild_nl(&st);

	st.out_len = 0;
	st.out_cap = in_len + H2A_LINE_MAX;
	st.out = malloc(st.out_cap);
	if (st.out == NULL) return NULL;

	tag[0] = '\0';

	for (pos = 0; (pos < in_len) && !st.failed; ++pos) {
		char ch = inputmsg[pos];

		/* Fold in all the spacing */
		if ((ch == '\n') || (ch == '\r') || (ch == '\t')) ch = ' ';

		if (ch == '<') {
			++nest;
			tag_len = 0;
			tag[0] = '\0';
		}
		else if ((ch == '>') && (nest > 0)) {	/* We have a tag. */
			--nest;
			h2a_handle_tag(&st, tag);

			/* forget it, so a later stray '>' cannot replay this tag */
			tag_len = 0;
			tag[0] = '\0';
		}
		else if (nest > 0) {
			if (tag_len < (sizeof tag - 1)) {
				tag[tag_len++] = ch;
				tag[tag_len] = '\0';
			}
		}
		else if (st.styletag == 0) {
			/* ordinary text, including a '>' that closes nothing */
			h2a_line_add(&st, &ch, 1);
		}

		h2a_settle(&st);
	}

	/* One more pass with no new input, then commit whatever is left. */
	h2a_settle(&st);
	h2a_emit(&st, st.line, st.line_len);
	if (st.failed) return NULL;

	/* Strip leading/trailing whitespace. */
	lead = 0;
	while ((lead < st.out_len) && isspace((unsigned char)st.out[lead])) ++lead;
	if (lead > 0) {
		memmove(st.out, st.out + lead, st.out_len - lead);
		st.out_len -= lead;
	}
	while ((st.out_len > 0) && isspace((unsigned char)st.out[st.out_len - 1])) {
		--st.out_len;
	}

	if (st.out_len > 0) st.out[st.out_len++] = '\n';
	st.out[st.out_len] = '\0';

	return st.out;
}