Added a couple more entity declarations
[citadel.git] / libcitadel / lib / html_to_ascii.c
1 /*
2  * Functions which handle translation between HTML and plain text
3  * Copyright (c) 2000-2010 by the citadel.org team
4  *
5  * This program is open source software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 3 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  */
19
20 #include <stdlib.h>
21 #include <unistd.h>
22 #include <stdio.h>
23 #include <signal.h>
24 #include <sys/types.h>
25 #include <ctype.h>
26 #include <string.h>
27 #include <sys/stat.h>
28 #include <errno.h>
29 #include <limits.h>
30
31 #if TIME_WITH_SYS_TIME
32 # include <sys/time.h>
33 # include <time.h>
34 #else
35 # if HAVE_SYS_TIME_H
36 #  include <sys/time.h>
37 # else
38 #  include <time.h>
39 # endif
40 #endif
41
42 #include "libcitadel.h"
43  
44
/*
 * Private helpers for html_to_ascii().
 */

/* Growable byte buffer; data is kept NUL-terminated once initialised. */
struct h2a_buf {
	char *data;
	size_t len;		/* bytes in use, not counting the terminator */
	size_t cap;		/* bytes allocated */
};

/* Conversion state carried from one input byte to the next. */
struct h2a_state {
	struct h2a_buf pending;	/* text not yet committed as a finished line */
	struct h2a_buf out;	/* finished output */
	size_t entity_floor;	/* no entity may start below this index of pending */
	size_t wrap;		/* usable line width (screenwidth - 2), 0 = no wrapping */
	size_t tag_len;		/* length of the tag text collected so far */
	char tag[1024];		/* text between '<' and '>' */
	char nl[128];		/* current line break: "\n" plus one '>' per quote level */
	int nest;		/* bracket nesting level */
	int blockquote;		/* BLOCKQUOTE nesting level */
	int styletag;		/* STYLE nesting level; text is discarded while > 0 */
	int do_citaformat;	/* nonzero: put a space after every line break */
};

/* One named or fixed numeric entity and its plain-text replacement. */
struct h2a_entity {
	const char *name;
	const char *text;	/* never longer than name */
	int exact_case;		/* nonzero: name must match case-sensitively */
};

/* Case-sensitive entries must precede their case-insensitive twins. */
static const struct h2a_entity h2a_entities[] = {
	{ "&nbsp;",   " ",    0 }, { "&ensp;",   " ",    0 },
	{ "&emsp;",   " ",    0 }, { "&thinsp;", " ",    0 },
	{ "&lt;",     "<",    0 }, { "&gt;",     ">",    0 },
	{ "&amp;",    "&",    0 }, { "&quot;",   "\"",   0 },
	{ "&lsquo;",  "`",    0 }, { "&rsquo;",  "'",    0 },
	{ "&copy;",   "(c)",  0 }, { "&bull;",   " * ",  0 },
	{ "&hellip;", "...",  0 }, { "&trade;",  "(tm)", 0 },
	{ "&reg;",    "(r)",  0 }, { "&frac14;", "1/4",  0 },
	{ "&frac12;", "1/2",  0 }, { "&frac34;", "3/4",  0 },
	{ "&ndash;",  "--",   0 }, { "&mdash;",  "---",  0 },
	{ "&Ccedil;", "C",    1 }, { "&ccedil;", "c",    0 },
	{ "&Egrave;", "E",    1 }, { "&egrave;", "e",    0 },
	{ "&Ecirc;",  "E",    1 }, { "&ecirc;",  "e",    0 },
	{ "&Eacute;", "E",    1 }, { "&eacute;", "e",    0 },
	{ "&Agrave;", "A",    1 }, { "&agrave;", "a",    0 },
	{ "&ldquo;",  "\"",   0 }, { "&rdquo;",  "\"",   0 },
	{ "&acute;",  "'",    0 }, { "&#8217;",  "'",    0 },
	{ "&#8211;",  "-",    0 }
};

/* Make room for `extra` more bytes plus a terminator.  Returns 0, or -1 (buffer untouched) on failure. */
static int h2a_reserve(struct h2a_buf *b, size_t extra) {
	size_t need, newcap;
	char *grown;

	if (extra > ((size_t)-1) - b->len - 1) return -1;
	need = b->len + extra + 1;
	if (need <= b->cap) return 0;

	newcap = (b->cap > 0) ? b->cap : 256;
	while (newcap < need) {
		if (newcap > ((size_t)-1) / 2) {
			newcap = need;
			break;
		}
		newcap *= 2;
	}

	/* Keep the old pointer until realloc() is known to have succeeded */
	grown = realloc(b->data, newcap);
	if (grown == NULL) return -1;
	b->data = grown;
	b->cap = newcap;
	return 0;
}

/* Allocate an empty buffer.  Returns 0 on success, -1 on failure. */
static int h2a_buf_init(struct h2a_buf *b) {
	b->data = NULL;
	b->len = 0;
	b->cap = 0;
	if (h2a_reserve(b, 0) != 0) return -1;
	b->data[0] = 0;
	return 0;
}

/* Append n bytes from src (which must not point into b).  Returns 0 on success, -1 on failure. */
static int h2a_append(struct h2a_buf *b, const char *src, size_t n) {
	if (h2a_reserve(b, n) != 0) return -1;
	memcpy(b->data + b->len, src, n);
	b->len += n;
	b->data[b->len] = 0;
	return 0;
}

/* Remove the first n bytes of an initialised buffer. */
static void h2a_drop_prefix(struct h2a_buf *b, size_t n) {
	if (n > b->len) n = b->len;
	memmove(b->data, b->data + n, b->len - n + 1);	/* regions overlap */
	b->len -= n;
}

/* Compare n bytes ignoring case.  Returns nonzero when they are equal. */
static int h2a_equal_nocase(const char *a, const char *b, size_t n) {
	size_t i;

	for (i = 0; i < n; ++i) {
		if (tolower((unsigned char)a[i]) != tolower((unsigned char)b[i])) return 0;
	}
	return 1;
}

/* Nonzero when `tag` is one of the names in the NULL-terminated list, ignoring case. */
static int h2a_tag_in(const char *tag, const char *const *names) {
	size_t tag_len = strlen(tag);

	for (; *names != NULL; ++names) {
		if ((strlen(*names) == tag_len) && h2a_equal_nocase(tag, *names, tag_len)) return 1;
	}
	return 0;
}

/* Rebuild the line-break string for the current BLOCKQUOTE depth. */
static void h2a_set_newline(struct h2a_state *st) {
	size_t depth = (st->blockquote > 0) ? (size_t)st->blockquote : 0;

	if (depth > sizeof st->nl - 2) depth = sizeof st->nl - 2;
	st->nl[0] = '\n';
	memset(st->nl + 1, '>', depth);
	st->nl[depth + 1] = 0;
}

/* Queue a string for output unless we are inside a STYLE element. */
static int h2a_emit(struct h2a_state *st, const char *s) {
	if (st->styletag > 0) return 0;
	return h2a_append(&st->pending, s, strlen(s));
}

/* Replace the last old_len bytes of the pending text with a shorter-or-equal replacement. */
static void h2a_replace_tail(struct h2a_state *st, size_t old_len, const char *text, size_t text_len) {
	struct h2a_buf *p = &st->pending;

	memcpy(p->data + p->len - old_len, text, text_len);
	p->len = p->len - old_len + text_len;
	p->data[p->len] = 0;

	/* Decoded text must never be read as part of a later entity ("&amp;lt;" is "&lt;") */
	st->entity_floor = p->len;
}

/* Called right after a ';' was queued: decode the entity that ends there, if any. */
static void h2a_decode_entity(struct h2a_state *st) {
	struct h2a_buf *p = &st->pending;
	size_t avail = p->len - st->entity_floor;
	size_t k, name_len, end, digits, first, d;
	const char *tail;
	int value, same;
	char decoded;

	for (k = 0; k < sizeof h2a_entities / sizeof h2a_entities[0]; ++k) {
		name_len = strlen(h2a_entities[k].name);
		if (name_len > avail) continue;
		tail = p->data + p->len - name_len;
		if (h2a_entities[k].exact_case) {
			same = (memcmp(tail, h2a_entities[k].name, name_len) == 0);
		}
		else {
			same = h2a_equal_nocase(tail, h2a_entities[k].name, name_len);
		}
		if (same) {
			h2a_replace_tail(st, name_len, h2a_entities[k].text, strlen(h2a_entities[k].text));
			return;
		}
	}

	/* Decimal character references with two to four digits: &#NN; &#NNN; &#NNNN; */
	end = p->len - 1;				/* index of the ';' */
	digits = 0;
	while ((digits <= 4)
	       && (end - digits > st->entity_floor)
	       && isdigit((unsigned char)p->data[end - digits - 1])) {
		++digits;
	}
	if ((digits < 2) || (digits > 4)) return;
	first = end - digits;				/* index of the first digit */
	if (first < st->entity_floor + 2) return;
	if ((p->data[first - 1] != '#') || (p->data[first - 2] != '&')) return;

	value = 0;
	for (d = 0; d < digits; ++d) {
		value = value * 10 + (p->data[first + d] - '0');
	}

	/* Only single-byte values can be represented; anything else becomes '?' */
	decoded = ((value >= 1) && (value <= 255)) ? (char)(unsigned char)value : '?';
	h2a_replace_tail(st, digits + 3, &decoded, 1);
}

/* Act on the tag collected in st->tag.  Returns 0 on success, -1 on allocation failure. */
static int h2a_handle_tag(struct h2a_state *st) {
	static const char *const double_break[] = { "P", "/DIV", "/UL", "H1", "H2", "H3", "H4", NULL };
	static const char *const single_break[] = { "/H1", "/H2", "/H3", "/H4", "BR", "TR", "/TABLE", NULL };
	static const char *const bold[] = { "B", "/B", "STRONG", "/STRONG", NULL };
	static const char *const italic[] = { "I", "/I", "EM", "/EM", NULL };
	static const char *const underline[] = { "U", "/U", NULL };
	static const char *const li[] = { "LI", NULL };
	static const char *const hr[] = { "HR", NULL };
	static const char *const bq_open[] = { "BLOCKQUOTE", NULL };
	static const char *const bq_close[] = { "/BLOCKQUOTE", NULL };
	static const char *const style_open[] = { "STYLE", NULL };
	static const char *const style_close[] = { "/STYLE", NULL };
	char *name = st->tag;
	char *space;
	size_t name_len, j;
	int self_closing = 0;

	/* Unqualify the tag (truncate at first space) */
	space = strchr(name, ' ');
	if (space != NULL) *space = 0;

	/* Treat <br/> like <br> */
	name_len = strlen(name);
	if ((name_len > 1) && (name[name_len - 1] == '/')) {
		name[--name_len] = 0;
		self_closing = 1;
	}
	st->tag_len = name_len;

	if (h2a_tag_in(name, double_break)) {
		if (h2a_emit(st, st->nl) != 0) return -1;
		return h2a_emit(st, st->nl);
	}
	if (h2a_tag_in(name, li)) {
		if (h2a_emit(st, st->nl) != 0) return -1;
		return h2a_emit(st, " * ");
	}
	if (h2a_tag_in(name, single_break)) {
		return h2a_emit(st, st->nl);
	}
	if (h2a_tag_in(name, hr)) {
		if (h2a_emit(st, st->nl) != 0) return -1;
		if (h2a_emit(st, " ") != 0) return -1;
		for (j = 0; j < st->wrap; ++j) {
			if (h2a_emit(st, "-") != 0) return -1;
		}
		return h2a_emit(st, st->nl);
	}
	if (h2a_tag_in(name, bold)) return h2a_emit(st, "*");
	if (h2a_tag_in(name, italic)) return h2a_emit(st, "/");
	if (h2a_tag_in(name, underline)) return h2a_emit(st, "_");

	if (h2a_tag_in(name, bq_open)) {
		if (self_closing) return 0;
		if (st->blockquote < INT_MAX) ++st->blockquote;
		h2a_set_newline(st);
		return h2a_emit(st, st->nl);
	}
	if (h2a_tag_in(name, bq_close)) {
		if (h2a_emit(st, "\n") != 0) return -1;
		if (st->blockquote > 0) --st->blockquote;
		h2a_set_newline(st);
		return h2a_emit(st, st->nl);
	}

	/* Everything between <style> and </style> is discarded by h2a_emit()/h2a_feed() */
	if (h2a_tag_in(name, style_open)) {
		if (!self_closing && (st->styletag < INT_MAX)) ++st->styletag;
		return 0;
	}
	if (h2a_tag_in(name, style_close)) {
		if (st->styletag > 0) --st->styletag;
		return 0;
	}

	return 0;	/* any other tag is silently dropped */
}

/* Consume one input byte.  Returns 0 on success, -1 on allocation failure. */
static int h2a_feed(struct h2a_state *st, char ch) {

	/* Line feeds, carriage returns and tabs are ordinary spaces in HTML */
	if ((ch == '\n') || (ch == '\r') || (ch == '\t')) ch = ' ';

	if (ch == '<') {
		if (st->nest < INT_MAX) ++st->nest;
		st->tag_len = 0;
		st->tag[0] = 0;
		return 0;
	}

	if ((ch == '>') && (st->nest > 0)) {	/* We have a tag. */
		--st->nest;
		return h2a_handle_tag(st);
	}

	if (st->nest > 0) {			/* inside a tag; overlong tags are truncated */
		if (st->tag_len < sizeof st->tag - 1) {
			st->tag[st->tag_len++] = ch;
			st->tag[st->tag_len] = 0;
		}
		return 0;
	}

	/* Plain text (a '>' outside of any tag is plain text too) */
	if (st->styletag > 0) return 0;
	if (h2a_append(&st->pending, &ch, 1) != 0) return -1;
	if (ch == ';') h2a_decode_entity(st);
	return 0;
}

/*
 * Move the first `keep` bytes of the pending text to the output, optionally
 * followed by a line break, then discard `consume` bytes of pending text.
 */
static int h2a_commit(struct h2a_state *st, size_t keep, size_t consume, int add_break) {
	if (h2a_append(&st->out, st->pending.data, keep) != 0) return -1;
	if (add_break && (h2a_append(&st->out, st->nl, strlen(st->nl)) != 0)) return -1;
	if (st->do_citaformat && (h2a_append(&st->out, " ", 1) != 0)) return -1;
	h2a_drop_prefix(&st->pending, consume);
	st->entity_floor = (st->entity_floor > consume) ? (st->entity_floor - consume) : 0;
	return 0;
}

/* Output finished lines, then insert at most one soft line break. */
static int h2a_flush_lines(struct h2a_state *st) {
	struct h2a_buf *p = &st->pending;
	const char *hit;
	size_t window, cut, i;
	int found;

	/* Output any lines terminated with hard line breaks */
	for (;;) {
		window = (p->len < st->wrap) ? p->len : st->wrap;
		hit = memchr(p->data, '\n', window);
		if (hit == NULL) break;
		cut = (size_t)(hit - p->data) + 1;
		if (h2a_commit(st, cut, cut, 0) != 0) return -1;
	}

	/* Add soft line breaks: at the last space that fits, else mid-word */
	if ((st->wrap > 0) && (p->len > st->wrap)) {
		cut = 0;
		found = 0;
		for (i = 0; i < st->wrap; ++i) {
			if (p->data[i] == ' ') {
				cut = i;
				found = 1;
			}
		}
		if (found) return h2a_commit(st, cut, cut + 1, 1);	/* the space is replaced */
		return h2a_commit(st, st->wrap, st->wrap, 1);
	}

	return 0;
}

/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message
 * msglen        = number of bytes of inputmsg to convert; 0 means "up to the
 *                 terminating NUL".  Conversion always stops at a NUL byte,
 *                 and a negative value converts nothing.
 * screenwidth   = desired output screenwidth; lines are broken so that they
 *                 stay within screenwidth-2 columns.  With a screenwidth of 2
 *                 or less no line wrapping is performed.
 * do_citaformat = set to 1 to indent newlines with spaces
 *
 * Tags are stripped.  A fixed set of them is rendered as line breaks, list
 * bullets, rules, '>' quote prefixes or the emphasis markers * / _ and the
 * contents of STYLE elements are discarded.  A fixed set of named entities
 * and two- to four-digit decimal character references are decoded.  Runs of
 * spaces are not collapsed.  Leading and trailing whitespace is removed and a
 * non-empty result ends with exactly one newline.
 *
 * Returns a NUL-terminated string obtained from malloc() which the caller
 * must free(), or NULL if inputmsg is NULL or memory could not be allocated.
 */
char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
	struct h2a_state st;
	size_t limit, pos, lead;

	if (inputmsg == NULL) return NULL;

	memset(&st, 0, sizeof st);
	st.wrap = (screenwidth > 2) ? (size_t)(screenwidth - 2) : 0;
	st.do_citaformat = do_citaformat;
	h2a_set_newline(&st);
	if (h2a_buf_init(&st.pending) != 0) goto fail;
	if (h2a_buf_init(&st.out) != 0) goto fail;

	if (msglen == 0) limit = (size_t)-1;
	else if (msglen > 0) limit = (size_t)msglen;
	else limit = 0;

	/* Never look at a byte beyond msglen, nor beyond the terminator */
	for (pos = 0; (pos < limit) && (inputmsg[pos] != 0); ++pos) {
		if (h2a_feed(&st, inputmsg[pos]) != 0) goto fail;
		if (h2a_flush_lines(&st) != 0) goto fail;
	}

	/* One more pass once the input is exhausted, then whatever is left goes out as-is */
	if (h2a_flush_lines(&st) != 0) goto fail;
	if (h2a_append(&st.out, st.pending.data, st.pending.len) != 0) goto fail;

	/* Strip leading/trailing whitespace */
	lead = 0;
	while ((lead < st.out.len) && isspace((unsigned char)st.out.data[lead])) {
		++lead;
	}
	h2a_drop_prefix(&st.out, lead);
	while ((st.out.len > 0) && isspace((unsigned char)st.out.data[st.out.len - 1])) {
		st.out.data[--st.out.len] = 0;
	}

	if (st.out.len > 0) {
		if (h2a_append(&st.out, "\n", 1) != 0) goto fail;
	}

	free(st.pending.data);
	return st.out.data;

fail:
	free(st.pending.data);
	free(st.out.data);
	return NULL;
}