c820d1c96b90775e0f450056ffe15c542a5c29a2
[citadel.git] / citadel / html.c
1 /*
2  * $Id$
3  *
4  * Functions which handle translation between HTML and plain text
5  * Copyright (c) 2000-2001 by Art Cancro and others.   This program is
6  * released under the terms of the GNU General Public License.
7  */
8
9 #ifdef DLL_EXPORT
10 #define IN_LIBCIT
11 #endif
12
13 #include "sysdep.h"
14 #include <stdlib.h>
15 #include <unistd.h>
16 #include <stdio.h>
17 #include <fcntl.h>
18 #include <signal.h>
19
20 #if TIME_WITH_SYS_TIME
21 # include <sys/time.h>
22 # include <time.h>
23 #else
24 # if HAVE_SYS_TIME_H
25 #  include <sys/time.h>
26 # else
27 #  include <time.h>
28 # endif
29 #endif
30
31 #include <ctype.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <limits.h>
35 #include "citadel.h"
36 #include "server.h"
37 #include "serv_extensions.h"
38 #include "control.h"
39 #include "sysdep_decls.h"
40 #include "support.h"
41 #include "config.h"
42 #include "msgbase.h"
43 #include "tools.h"
44 #include "room_ops.h"
45 #include "html.h"
46  
47
/*
 * Tags whose only effect is to emit a fixed run of line-break text.
 * Names are matched case-insensitively against the unqualified tag.
 */
struct h2a_tag_break {
	const char *name;
	const char *text;
};

static const struct h2a_tag_break h2a_tag_breaks[] = {
	{ "P",           "\n\n" },
	{ "/DIV",        "\n\n" },
	{ "LI",          "\n * " },
	{ "/UL",         "\n\n" },
	{ "H1",          "\n\n" },
	{ "H2",          "\n\n" },
	{ "H3",          "\n\n" },
	{ "H4",          "\n\n" },
	{ "/H1",         "\n" },
	{ "/H2",         "\n" },
	{ "/H3",         "\n" },
	{ "/H4",         "\n" },
	{ "BR",          "\n" },
	{ "TR",          "\n" },
	{ "/TABLE",      "\n" },
	{ "BLOCKQUOTE",  "\n\n <<\n" },
	{ "/BLOCKQUOTE", "\n >>\n\n" }
};

/*
 * Named character entity references and their plain-text replacements.
 * Every replacement is shorter than the entity it replaces, so decoding
 * can always be done in place.
 */
struct h2a_entity {
	const char *name;
	const char *text;
};

static const struct h2a_entity h2a_entities[] = {
	{ "&nbsp;",   " " },
	{ "&ensp;",   " " },
	{ "&emsp;",   " " },
	{ "&thinsp;", " " },
	{ "&lt;",     "<" },
	{ "&gt;",     ">" },
	{ "&amp;",    "&" },
	{ "&quot;",   "\"" },
	{ "&lsquo;",  "`" },
	{ "&rsquo;",  "'" },
	{ "&copy;",   "(c)" },
	{ "&hellip;", "..." },
	{ "&trade;",  "(tm)" },
	{ "&reg;",    "(r)" },
	{ "&frac14;", "1/4" },
	{ "&frac12;", "1/2" },
	{ "&frac34;", "3/4" },
	{ "&ndash;",  "--" },
	{ "&mdash;",  "---" }
};

/*
 * Append text to the NUL-terminated string in buf without ever writing
 * past bufsize bytes.  Text that does not fit is dropped.
 */
static void h2a_append(char *buf, size_t bufsize, const char *text)
{
	size_t used = strlen(buf);
	size_t room = (used + 1 < bufsize) ? (bufsize - 1 - used) : 0;
	size_t add = strlen(text);

	if (add > room) add = room;
	memcpy(buf + used, text, add);
	buf[used + add] = '\0';
}

/*
 * Remove the first count bytes of buf (count must not exceed its length)
 * and keep the "entities already decoded" offset pointing at the same text.
 */
static void h2a_discard(char *buf, size_t count, size_t *resolved)
{
	/* Source and destination overlap, so this has to be memmove(). */
	memmove(buf, buf + count, strlen(buf + count) + 1);
	*resolved = (*resolved > count) ? (*resolved - count) : 0;
}

/*
 * Decode character entity references in buf, in place, starting at offset
 * *resolved.  Text produced by a replacement is never scanned again, so
 * "&amp;lt;" becomes the literal text "&lt;" rather than "<".
 * On return *resolved is the offset just past the last replacement made.
 */
static void h2a_decode_entities(char *buf, size_t *resolved)
{
	const size_t n_entities = sizeof(h2a_entities) / sizeof(h2a_entities[0]);
	size_t i, k;

	i = strlen(buf);
	if (*resolved > i) *resolved = i;
	i = *resolved;

	while (buf[i] != '\0') {
		const char *rep = NULL;		/* replacement text, if any */
		size_t entlen = 0;		/* length of the matched entity */
		size_t remaining, replen;
		char numeric[2];

		if (buf[i] != '&') {
			++i;
			continue;
		}
		remaining = strlen(&buf[i]);

		/* Named entities */
		for (k = 0; k < n_entities; ++k) {
			size_t n = strlen(h2a_entities[k].name);
			if (!strncasecmp(&buf[i], h2a_entities[k].name, n)) {
				rep = h2a_entities[k].text;
				entlen = n;
				break;
			}
		}

		/* Two- and three-digit decimal equivalents ("&#NN;", "&#NNN;").
		 * The length checks keep the ';' probes inside the string.
		 */
		if ((rep == NULL) && (buf[i+1] == '#')) {
			int code = 0;

			if ((remaining > 4) && (buf[i+4] == ';')) {
				entlen = 5;
				sscanf(&buf[i+2], "%2d", &code);
			}
			else if ((remaining > 5) && (buf[i+5] == ';')) {
				entlen = 6;
				sscanf(&buf[i+2], "%3d", &code);
			}
			if (entlen > 0) {
				/* A value that converts to NUL (including an
				 * unparseable number) removes the entity
				 * instead of truncating the buffer.
				 */
				numeric[0] = (char) code;
				numeric[1] = '\0';
				rep = numeric;
			}
		}

		if (rep == NULL) {
			++i;
			continue;
		}

		replen = strlen(rep);
		memcpy(&buf[i], rep, replen);
		memmove(&buf[i + replen], &buf[i + entlen],
			strlen(&buf[i + entlen]) + 1);
		i += replen;
		*resolved = i;
	}
}

/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message
 * msglen        = number of bytes of inputmsg to convert; 0 means "use
 *                 strlen(inputmsg)".  Conversion also stops at the first
 *                 NUL byte.  A negative value converts nothing.
 * screenwidth   = desired output screenwidth (lines wrap at screenwidth-2)
 * do_citaformat = set to 1 to indent newlines with spaces
 *
 * Returns a malloc()ed, NUL-terminated string with leading and trailing
 * whitespace removed and exactly one trailing newline; the caller must
 * free() it.  Returns NULL if inputmsg is NULL or memory cannot be
 * allocated.
 *
 * Notes:
 *  - Tabs, CRs and LFs in the input become single spaces; runs of spaces
 *    are passed through unchanged.
 *  - Unrecognized tags are removed.  A '>' outside any tag is discarded,
 *    and re-applies the most recently seen tag name.
 *  - A trailing '/' on a tag name is ignored, so "<br/>" acts like "<br>".
 *  - The pending-line buffer holds at most SIZ-1 bytes; text is dropped
 *    rather than written out of bounds if a screenwidth outside the useful
 *    range prevents the line from ever being flushed.
 */
char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
	const size_t n_breaks = sizeof(h2a_tag_breaks) / sizeof(h2a_tag_breaks[0]);
	char outbuf[SIZ];		/* Text not yet committed to outptr */
	char tag[1024];			/* Name of the tag being collected */
	char one[2];
	const char *inptr;
	char *outptr;
	char *grown;
	char *sp;
	size_t outptr_buffer_size;
	size_t output_len = 0;
	size_t resolved = 0;		/* outbuf offset already entity-decoded */
	size_t outlen, needed, taglen, lead, cut, k;
	int wrap = screenwidth - 2;	/* Maximum characters per output line */
	int done_reading = 0;
	int nest = 0;			/* Bracket nesting level */
	int bytes_processed = 0;
	int i, j, ch, did_out, rb;

	if (inputmsg == NULL) return NULL;

	inptr = inputmsg;
	outbuf[0] = '\0';
	tag[0] = '\0';			/* a stray '>' may arrive before any '<' */
	if (msglen == 0) msglen = (int) strlen(inputmsg);

	/* Size from msglen so a caller-supplied length is never overrun. */
	outptr_buffer_size = ((msglen > 0) ? (size_t) msglen : 0) + SIZ;
	outptr = malloc(outptr_buffer_size);
	if (outptr == NULL) return NULL;
	outptr[0] = '\0';

	do {
		/* Fetch the next input character, if any is left.  The final
		 * pass through the loop runs with no character at all so that
		 * the pending line gets one more chance to be wrapped.
		 */
		ch = 0;
		if (bytes_processed >= msglen) {
			done_reading = 1;
		}
		else {
			ch = *inptr++;
			++bytes_processed;
			if (ch == 0) done_reading = 1;
		}

		/* Do some parsing */
		if (ch != 0) {

			/* Fold in all the spacing */
			if ((ch == 10) || (ch == 13) || (ch == 9)) ch = 32;

			if (ch == '<') {
				++nest;
				tag[0] = '\0';
			}

			else if (ch == '>') {	/* We have a tag. */
				if (nest > 0) --nest;

				/* Unqualify the tag (truncate at first space) */
				sp = strchr(tag, ' ');
				if (sp != NULL) *sp = '\0';

				/* Accept XHTML-style self-closing tags */
				taglen = strlen(tag);
				if ((taglen > 1) && (tag[taglen-1] == '/')) {
					tag[taglen-1] = '\0';
				}

				if (!strcasecmp(tag, "HR")) {
					h2a_append(outbuf, sizeof outbuf, "\n ");
					for (j = 0; j < wrap; ++j)
						h2a_append(outbuf, sizeof outbuf, "-");
					h2a_append(outbuf, sizeof outbuf, "\n");
				}
				else for (k = 0; k < n_breaks; ++k) {
					if (!strcasecmp(tag, h2a_tag_breaks[k].name)) {
						h2a_append(outbuf, sizeof outbuf,
							h2a_tag_breaks[k].text);
						break;
					}
				}
			}

			else if (nest > 0) {
				/* Inside a tag: collect its text, dropping
				 * whatever does not fit.
				 */
				taglen = strlen(tag);
				if (taglen < (sizeof(tag) - 1)) {
					tag[taglen] = (char) ch;
					tag[taglen+1] = '\0';
				}
			}

			else {
				one[0] = (char) ch;
				one[1] = '\0';
				h2a_append(outbuf, sizeof outbuf, one);
			}
		}

		/* Convert &; tags to the forbidden characters */
		h2a_decode_entities(outbuf, &resolved);

		/* Make sure the output buffer is big enough.  Committing
		 * outbuf can add at most one indent space per character plus
		 * a soft break, the final newline and the terminator.
		 */
		outlen = strlen(outbuf);
		needed = output_len + (2 * outlen) + 16;
		if (needed > outptr_buffer_size) {
			grown = realloc(outptr, needed + SIZ);
			if (grown == NULL) {
				free(outptr);
				return NULL;
			}
			outptr = grown;
			outptr_buffer_size = needed + SIZ;
		}

		/* Output any lines terminated with hard line breaks */
		do {
			did_out = 0;
			for (i = 0; (size_t) i < strlen(outbuf); ++i) {
				if ((i < wrap) && (outbuf[i] == '\n')) {

					memcpy(&outptr[output_len], outbuf,
						(size_t) i + 1);
					output_len += (size_t) i + 1;

					if (do_citaformat) {
						outptr[output_len++] = ' ';
					}

					h2a_discard(outbuf, (size_t) i + 1,
						&resolved);

					/* The loop increment moves on to
					 * index 1; index 0 is looked at on
					 * the next pass.  This ordering
					 * decides where indent spaces land
					 * between consecutive newlines, so
					 * it is kept as is.
					 */
					i = 0;
					did_out = 1;
				}
			}
		} while (did_out);

		/* Add soft line breaks */
		outlen = strlen(outbuf);
		if ((wrap >= 0) && (outlen > (size_t) wrap)) {
			rb = (-1);
			for (i = 0; i < wrap; ++i) {
				if (outbuf[i] == 32) rb = i;
			}

			/* Break at the last space that fits, else mid-word */
			cut = (rb >= 0) ? (size_t) rb : (size_t) wrap;
			memcpy(&outptr[output_len], outbuf, cut);
			output_len += cut;
			outptr[output_len++] = '\n';
			if (do_citaformat) {
				outptr[output_len++] = ' ';
			}

			/* The space that was broken at is consumed too */
			h2a_discard(outbuf, (rb >= 0) ? (cut + 1) : cut,
				&resolved);
		}

	} while (done_reading == 0);

	/* Commit whatever is still pending; this also terminates outptr */
	outlen = strlen(outbuf);
	memcpy(&outptr[output_len], outbuf, outlen + 1);
	output_len += outlen;

	/* Strip leading/trailing whitespace.  We can't do this with
	 * striplt() because it uses too many strlen()'s
	 */
	lead = 0;
	while ((lead < output_len) && isspace((unsigned char) outptr[lead])) {
		++lead;
	}
	if (lead > 0) {
		memmove(outptr, &outptr[lead], output_len - lead + 1);
		output_len -= lead;
	}
	while ((output_len > 0)
	      && isspace((unsigned char) outptr[output_len-1])) {
		outptr[--output_len] = '\0';
	}

	/* The text no longer ends in whitespace (or is empty), so it always
	 * needs its terminating newline.
	 */
	outptr[output_len++] = '\n';
	outptr[output_len] = '\0';

	return outptr;

}