html.c: removed extra newlines entering/exiting blockquote sections.
[citadel.git] / citadel / html.c
1 /*
2  * $Id$
3  *
4  * Functions which handle translation between HTML and plain text
5  * Copyright (c) 2000-2005 by Art Cancro and others.   This program is
6  * released under the terms of the GNU General Public License.
7  */
8
9 #include "sysdep.h"
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <fcntl.h>
14 #include <signal.h>
15
16 #if TIME_WITH_SYS_TIME
17 # include <sys/time.h>
18 # include <time.h>
19 #else
20 # if HAVE_SYS_TIME_H
21 #  include <sys/time.h>
22 # else
23 #  include <time.h>
24 # endif
25 #endif
26
27 #include <ctype.h>
28 #include <string.h>
29 #include <errno.h>
30 #include <limits.h>
31 #include "citadel.h"
32 #include "server.h"
33 #include "serv_extensions.h"
34 #include "control.h"
35 #include "sysdep_decls.h"
36 #include "support.h"
37 #include "config.h"
38 #include "msgbase.h"
39 #include "tools.h"
40 #include "room_ops.h"
41 #include "html.h"
42  
43
/*
 * Private working limits for html_to_ascii().
 */
enum {
	H2A_LINE_MAX   = 4096,	/* bytes of text held back while a line is assembled */
	H2A_TAG_MAX    = 1024,	/* bytes of tag text remembered between '<' and '>' */
	H2A_NL_MAX     = 128,	/* a newline plus its '>' quote markers */
	H2A_GROW       = 4096,	/* slack added each time the result buffer grows */
	H2A_WRAP_MAX   = 1024,	/* widest wrap column honoured */
	H2A_ENTITY_MAX = 8	/* longest entity recognised, e.g. "&thinsp;" */
};

/*
 * Conversion state shared by the html_to_ascii() helpers.
 * Neither buffer is kept NUL-terminated; the *_len fields are authoritative.
 */
struct h2a_state {
	char *out;		/* heap buffer holding the committed result */
	size_t out_len;
	size_t out_cap;
	int failed;		/* nonzero once an allocation has failed */

	char line[H2A_LINE_MAX];/* text not yet committed to out */
	size_t line_len;
	size_t decoded;		/* line[0..decoded) is final: never rescanned for entities */

	char nl[H2A_NL_MAX];	/* current line break: '\n' plus one '>' per quote level */
	size_t nl_len;
	int blockquote;		/* BLOCKQUOTE nesting level, never negative */

	size_t wrap_col;	/* lines longer than this are wrapped */
	int do_citaformat;	/* nonzero: follow every committed line break with a space */
};

/*
 * Append n bytes to the result, growing it as needed.  Two spare bytes are
 * always kept so the caller can finish with "\n" and a terminator.
 * On allocation failure the result is released and st->failed is set.
 */
static void h2a_emit(struct h2a_state *st, const char *src, size_t n)
{
	if (st->failed) return;

	if (st->out_len + n + 2 > st->out_cap) {
		size_t cap = st->out_len + n + 2 + H2A_GROW;
		char *bigger = realloc(st->out, cap);

		if (bigger == NULL) {
			free(st->out);
			st->out = NULL;
			st->failed = 1;
			return;
		}
		st->out = bigger;
		st->out_cap = cap;
	}
	memcpy(st->out + st->out_len, src, n);
	st->out_len += n;
}

/* Emit the indent that follows a committed line break in Citadel format. */
static void h2a_after_break(struct h2a_state *st)
{
	if (st->do_citaformat) h2a_emit(st, " ", 1);
}

/* Discard the first n bytes of the pending line (regions overlap: memmove). */
static void h2a_drop_front(struct h2a_state *st, size_t n)
{
	memmove(st->line, st->line + n, st->line_len - n);
	st->line_len -= n;
	st->decoded = (st->decoded > n) ? (st->decoded - n) : 0;
}

/*
 * Commit what can be committed from the pending line: every hard line break
 * found before the wrap column, then at most ONE soft wrap.  Returns the
 * number of bytes removed from the pending line.
 */
static size_t h2a_flush_pass(struct h2a_state *st)
{
	size_t removed = 0;
	size_t limit, i;
	int found;

	/* Output any lines terminated with hard line breaks */
	do {
		found = 0;
		limit = (st->line_len < st->wrap_col) ? st->line_len : st->wrap_col;
		for (i = 0; i < limit; ++i) {
			if (st->line[i] == '\n') {
				h2a_emit(st, st->line, i + 1);
				h2a_after_break(st);
				h2a_drop_front(st, i + 1);
				removed += i + 1;
				found = 1;
				break;
			}
		}
	} while (found);

	/* Add a soft line break: at the last space before the wrap column
	 * (the space itself is dropped), else a hard cut at the column.
	 */
	if (st->line_len > st->wrap_col) {
		size_t cut = st->wrap_col;
		size_t skip = 0;

		for (i = st->wrap_col; i > 0; --i) {
			if (st->line[i - 1] == ' ') {
				cut = i - 1;
				skip = 1;
				break;
			}
		}
		h2a_emit(st, st->line, cut);
		h2a_emit(st, st->nl, st->nl_len);
		h2a_after_break(st);
		h2a_drop_front(st, cut + skip);
		removed += cut + skip;
	}
	return removed;
}

/*
 * Append n bytes (n is always far smaller than H2A_LINE_MAX) to the pending
 * line.  If the line is full, extra flush passes make room; as a last resort
 * the pending text is committed unwrapped, so nothing is lost or overrun.
 */
static void h2a_line_append(struct h2a_state *st, const char *src, size_t n)
{
	while ((st->line_len + n > sizeof st->line) && (h2a_flush_pass(st) > 0))
		;
	if (st->line_len + n > sizeof st->line) {
		h2a_emit(st, st->line, st->line_len);
		st->line_len = 0;
		st->decoded = 0;
	}
	memcpy(st->line + st->line_len, src, n);
	st->line_len += n;
}

/* Append the current line break (newline plus quote markers). */
static void h2a_line_break(struct h2a_state *st)
{
	h2a_line_append(st, st->nl, st->nl_len);
}

/* Rebuild st->nl for the current quote depth, capped to fit the buffer. */
static void h2a_set_quote_marks(struct h2a_state *st)
{
	size_t marks = (st->blockquote > 0) ? (size_t)st->blockquote : 0;

	if (marks > H2A_NL_MAX - 2) marks = H2A_NL_MAX - 2;
	st->nl[0] = '\n';
	memset(st->nl + 1, '>', marks);
	st->nl_len = marks + 1;
}

/*
 * The pending line has just received a ';'.  If it now ends with a known
 * character entity, replace that entity with its plain-text form.
 * Text produced by an earlier replacement is never rescanned, so
 * "&amp;lt;" yields "&lt;" rather than "<".
 */
static void h2a_decode_tail(struct h2a_state *st)
{
	static const struct {
		const char *name;	/* entity as written, matched case-insensitively */
		const char *text;	/* replacement, never longer than the entity */
	} entities[] = {
		{ "&nbsp;", " " },	{ "&ensp;", " " },
		{ "&emsp;", " " },	{ "&thinsp;", " " },
		{ "&lt;", "<" },	{ "&gt;", ">" },
		{ "&amp;", "&" },	{ "&quot;", "\"" },
		{ "&lsquo;", "`" },	{ "&rsquo;", "'" },
		{ "&copy;", "(c)" },	{ "&hellip;", "..." },
		{ "&trade;", "(tm)" },	{ "&reg;", "(r)" },
		{ "&frac14;", "1/4" },	{ "&frac12;", "1/2" },
		{ "&frac34;", "3/4" },	{ "&ndash;", "--" },
		{ "&mdash;", "---" }
	};
	size_t low = st->decoded;
	size_t amp, len, k;
	char *cand;

	/* Only the last H2A_ENTITY_MAX bytes can hold a complete entity */
	if ((st->line_len > H2A_ENTITY_MAX)
	   && (st->line_len - H2A_ENTITY_MAX > low)) {
		low = st->line_len - H2A_ENTITY_MAX;
	}

	/* Walk back from the ';' to the nearest '&' */
	amp = st->line_len - 1;
	while ((amp > low) && (st->line[amp - 1] != '&')) --amp;
	if (amp <= low) return;
	--amp;

	cand = st->line + amp;
	len = st->line_len - amp;

	/* Decimal character references: one to three digits, value 1..255 */
	if ((len >= 4) && (cand[1] == '#')) {
		int value = 0;

		if (len > 6) return;
		for (k = 2; k < len - 1; ++k) {
			if (!isdigit((unsigned char)cand[k])) return;
			value = (value * 10) + (cand[k] - '0');
		}
		if ((value < 1) || (value > 255)) return;
		cand[0] = (char)value;
		st->line_len = amp + 1;
		st->decoded = st->line_len;
		return;
	}

	/* Named character entity references */
	for (k = 0; k < sizeof entities / sizeof entities[0]; ++k) {
		if ((strlen(entities[k].name) == len)
		   && (!strncasecmp(cand, entities[k].name, len))) {
			size_t text_len = strlen(entities[k].text);

			memcpy(cand, entities[k].text, text_len);
			st->line_len = amp + text_len;
			st->decoded = st->line_len;
			return;
		}
	}
}

/*
 * Act on a complete tag.  tag holds the tag_len bytes seen between '<' and
 * '>' and has room for a terminator at tag[tag_len].  The tag name ends at
 * the first space; a trailing '/' (as in "<br/>") is ignored.
 */
static void h2a_handle_tag(struct h2a_state *st, char *tag, size_t tag_len)
{
	static const struct {
		const char *name;
		int breaks;		/* number of line breaks the tag produces */
	} simple[] = {
		{ "P", 2 },	{ "/DIV", 2 },	{ "/UL", 2 },
		{ "H1", 2 },	{ "H2", 2 },	{ "H3", 2 },	{ "H4", 2 },
		{ "/H1", 1 },	{ "/H2", 1 },	{ "/H3", 1 },	{ "/H4", 1 },
		{ "BR", 1 },	{ "TR", 1 },	{ "/TABLE", 1 }
	};
	size_t name_len = 0;
	size_t k;
	int b;

	/* Unqualify the tag (truncate at first space) */
	while ((name_len < tag_len) && (tag[name_len] != ' ')) ++name_len;
	if ((name_len > 1) && (tag[name_len - 1] == '/')) --name_len;
	tag[name_len] = '\0';

	for (k = 0; k < sizeof simple / sizeof simple[0]; ++k) {
		if (!strcasecmp(tag, simple[k].name)) {
			for (b = 0; b < simple[k].breaks; ++b) h2a_line_break(st);
			return;
		}
	}

	if (!strcasecmp(tag, "LI")) {
		h2a_line_break(st);
		h2a_line_append(st, " * ", 3);
	}

	else if (!strcasecmp(tag, "HR")) {
		h2a_line_break(st);
		h2a_line_append(st, " ", 1);
		for (k = 0; k < st->wrap_col; ++k) h2a_line_append(st, "-", 1);
		h2a_line_break(st);
	}

	else if (!strcasecmp(tag, "BLOCKQUOTE")) {
		++st->blockquote;
		h2a_set_quote_marks(st);
		h2a_line_break(st);
	}

	else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
		h2a_line_append(st, "\n", 1);
		/* An unbalanced close must not push the depth below zero */
		if (st->blockquote > 0) --st->blockquote;
		h2a_set_quote_marks(st);
		h2a_line_break(st);
	}
}

/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message (NULL yields NULL)
 * msglen        = number of bytes of inputmsg to convert; zero or less
 *                 means "up to the terminating NUL".  Conversion always
 *                 stops at a NUL found earlier.
 * screenwidth   = desired output screenwidth; lines are wrapped at
 *                 screenwidth-2 columns (kept within 1..H2A_WRAP_MAX)
 * do_citaformat = set to 1 to indent newlines with spaces
 *
 * Tags are removed; the block-level tags handled by h2a_handle_tag() become
 * line breaks, list bullets, rules and '>' quote markers.  Runs of
 * whitespace in the input collapse to a single space, and the entities
 * known to h2a_decode_tail() are decoded.  A '>' that does not close a tag
 * is kept as text.
 *
 * Returns a NUL-terminated string obtained from malloc()/realloc(), with
 * leading and trailing whitespace removed and exactly one trailing newline,
 * or NULL if memory could not be allocated.  Nothing here keeps a reference
 * to the returned buffer.
 */
char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
	struct h2a_state st;
	char tag[H2A_TAG_MAX];
	size_t tag_len = 0;
	size_t in_len, pos, first, last;
	int nest = 0;		/* Bracket nesting level */
	int prev_space = 0;	/* previous input byte was whitespace */
	int wrap;

	if (inputmsg == NULL) return NULL;

	/* Never look past msglen bytes, nor past the terminating NUL */
	if (msglen <= 0) {
		in_len = strlen(inputmsg);
	}
	else {
		const char *nul = memchr(inputmsg, 0, (size_t)msglen);

		in_len = (nul != NULL) ? (size_t)(nul - inputmsg) : (size_t)msglen;
	}

	wrap = screenwidth - 2;
	if (wrap < 1) wrap = 1;
	if (wrap > H2A_WRAP_MAX) wrap = H2A_WRAP_MAX;

	st.out_cap = in_len + H2A_GROW;
	st.out = malloc(st.out_cap);
	if (st.out == NULL) return NULL;
	st.out_len = 0;
	st.failed = 0;
	st.line_len = 0;
	st.decoded = 0;
	st.blockquote = 0;
	st.wrap_col = (size_t)wrap;
	st.do_citaformat = do_citaformat;
	h2a_set_quote_marks(&st);

	for (pos = 0; (pos < in_len) && (!st.failed); ++pos) {
		char ch = inputmsg[pos];
		int skip = 0;

		/* Fold in all the spacing */
		if ((ch == '\n') || (ch == '\r') || (ch == '\t')) ch = ' ';
		if (ch == ' ') {
			skip = prev_space;
			prev_space = 1;
		}
		else {
			prev_space = 0;
		}

		if (skip) {
			/* second or later space of a run: contributes nothing */
		}

		else if (ch == '<') {
			++nest;
			tag_len = 0;
		}

		else if ((ch == '>') && (nest > 0)) {	/* We have a tag. */
			--nest;
			h2a_handle_tag(&st, tag, tag_len);
		}

		else if (nest > 0) {
			/* Inside a tag: remember it, silently dropping overlong text */
			if (tag_len < sizeof(tag) - 1) tag[tag_len++] = ch;
		}

		else {
			h2a_line_append(&st, &ch, 1);
			if (ch == ';') h2a_decode_tail(&st);
		}

		/* One pass per input byte, whether or not anything was added */
		h2a_flush_pass(&st);
	}

	/* One last pass for the end of input, then commit whatever is left */
	h2a_flush_pass(&st);
	h2a_emit(&st, st.line, st.line_len);
	if (st.failed) return NULL;

	/* Strip leading/trailing whitespace in a single move */
	first = 0;
	while ((first < st.out_len) && (isspace((unsigned char)st.out[first]))) {
		++first;
	}
	last = st.out_len;
	while ((last > first) && (isspace((unsigned char)st.out[last - 1]))) {
		--last;
	}
	memmove(st.out, st.out + first, last - first);
	st.out_len = last - first;

	/* h2a_emit() always leaves room for these two bytes */
	st.out[st.out_len++] = '\n';
	st.out[st.out_len] = '\0';

	return st.out;

}