]> code.citadel.org Git - citadel.git/blob - citadel/html.c
* html.c: added support for some additional character entity references.
[citadel.git] / citadel / html.c
1 /*
2  * $Id$
3  *
4  * Functions which handle translation between HTML and plain text
5  * Copyright (c) 2000-2001 by Art Cancro and others.   This program is
6  * released under the terms of the GNU General Public License.
7  */
8
9 #ifdef DLL_EXPORT
10 #define IN_LIBCIT
11 #endif
12
13 #include "sysdep.h"
14 #include <stdlib.h>
15 #include <unistd.h>
16 #include <stdio.h>
17 #include <fcntl.h>
18 #include <signal.h>
19
20 #if TIME_WITH_SYS_TIME
21 # include <sys/time.h>
22 # include <time.h>
23 #else
24 # if HAVE_SYS_TIME_H
25 #  include <sys/time.h>
26 # else
27 #  include <time.h>
28 # endif
29 #endif
30
31 #include <ctype.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <limits.h>
35 #include "citadel.h"
36 #include "server.h"
37 #include "serv_extensions.h"
38 #include "control.h"
39 #include "sysdep_decls.h"
40 #include "support.h"
41 #include "config.h"
42 #include "msgbase.h"
43 #include "tools.h"
44 #include "room_ops.h"
45 #include "html.h"
46  
47
/*
 * Growable character buffer used internally by html_to_ascii().
 * "data" is always NUL-terminated; "len" does not count the terminator,
 * and "cap" (the allocated size) is always greater than "len".
 */
struct h2a_buf {
	char *data;
	size_t len;
	size_t cap;
};

/* Block-level tags that produce fixed text.  <HR> is handled separately. */
static const struct {
	const char *tag;
	const char *text;
} h2a_tags[] = {
	{ "P",           "\n\n" },
	{ "/DIV",        "\n\n" },
	{ "LI",          "\n * " },
	{ "/UL",         "\n\n" },
	{ "H1",          "\n\n" },
	{ "H2",          "\n\n" },
	{ "H3",          "\n\n" },
	{ "H4",          "\n\n" },
	{ "/H1",         "\n" },
	{ "/H2",         "\n" },
	{ "/H3",         "\n" },
	{ "/H4",         "\n" },
	{ "BR",          "\n" },
	{ "TR",          "\n" },
	{ "/TABLE",      "\n" },
	{ "BLOCKQUOTE",  "\n\n <<\n" },
	{ "/BLOCKQUOTE", "\n >>\n\n" }
};

/*
 * Named character entity references and their plain-text equivalents.
 * Every replacement is shorter than its name, so decoding can be done
 * in place.
 */
static const struct {
	const char *name;
	const char *text;
} h2a_entities[] = {
	{ "&nbsp;",   " " },
	{ "&ensp;",   " " },
	{ "&emsp;",   " " },
	{ "&thinsp;", " " },
	{ "&lt;",     "<" },
	{ "&gt;",     ">" },
	{ "&amp;",    "&" },
	{ "&quot;",   "\"" },
	{ "&lsquo;",  "`" },
	{ "&rsquo;",  "'" },
	{ "&copy;",   "(c)" },
	{ "&hellip;", "..." },
	{ "&trade;",  "(tm)" },
	{ "&reg;",    "(r)" },
	{ "&frac14;", "1/4" },
	{ "&frac12;", "1/2" },
	{ "&frac34;", "3/4" },
	{ "&ndash;",  "--" },
	{ "&mdash;",  "---" }
};

/* Allocate an empty buffer of "cap" bytes.  Returns 0, or -1 if out of memory. */
static int h2a_init(struct h2a_buf *b, size_t cap) {
	if (cap == 0) cap = 1;
	b->data = malloc(cap);
	if (b->data == NULL) return -1;
	b->data[0] = 0;
	b->len = 0;
	b->cap = cap;
	return 0;
}

/*
 * Append "n" bytes from "src" (which must not point into "b"), growing the
 * buffer as needed.  Returns 0, or -1 if out of memory; on failure the
 * buffer is left unchanged and still valid.
 */
static int h2a_append(struct h2a_buf *b, const char *src, size_t n) {
	if (n >= b->cap - b->len) {
		size_t newcap = b->cap;
		char *bigger;

		while (n >= newcap - b->len) {
			if (newcap > ((size_t) -1) / 2) return -1;
			newcap *= 2;
		}
		/* Keep the old pointer until realloc() is known to have succeeded */
		bigger = realloc(b->data, newcap);
		if (bigger == NULL) return -1;
		b->data = bigger;
		b->cap = newcap;
	}
	memcpy(&b->data[b->len], src, n);
	b->len += n;
	b->data[b->len] = 0;
	return 0;
}

/* Remove the first "n" bytes of the buffer (the regions overlap: memmove) */
static void h2a_drop_front(struct h2a_buf *b, size_t n) {
	if (n > b->len) n = b->len;
	memmove(b->data, &b->data[n], b->len - n + 1);
	b->len -= n;
}

/*
 * Append the plain-text rendering of an (already unqualified) tag name to
 * the pending line.  Unknown tags produce nothing.
 * Returns 0, or -1 if out of memory.
 */
static int h2a_emit_tag(struct h2a_buf *line, const char *tag, size_t wrap) {
	size_t k;

	if (!strcasecmp(tag, "HR")) {
		if (h2a_append(line, "\n ", 2) != 0) return -1;
		for (k = 0; k < wrap; ++k) {
			if (h2a_append(line, "-", 1) != 0) return -1;
		}
		return h2a_append(line, "\n", 1);
	}

	for (k = 0; k < (sizeof h2a_tags / sizeof h2a_tags[0]); ++k) {
		if (!strcasecmp(tag, h2a_tags[k].tag)) {
			return h2a_append(line, h2a_tags[k].text,
					strlen(h2a_tags[k].text));
		}
	}
	return 0;
}

/*
 * Decode character entity references in place, looking only at text at or
 * after offset "from".  Named references are matched case-insensitively;
 * numeric references are accepted with two or three decimal digits and a
 * value of 1-255, and become a single byte.  Anything else is left alone.
 *
 * Returns the offset just past the last replacement made ("from" if none).
 * The caller passes that value back in on the next call so that decoded
 * text is never scanned again; otherwise "&amp;lt;" would decode to "&lt;"
 * and then, once rescanned, to "<".
 */
static size_t h2a_decode_entities(struct h2a_buf *b, size_t from) {
	size_t i = from;

	while (i < b->len) {
		const char *repl = NULL;
		char numeric[2];
		size_t consumed = 0;
		size_t repl_len;
		size_t k;

		if (b->data[i] != '&') {
			++i;
			continue;
		}

		for (k = 0; k < (sizeof h2a_entities / sizeof h2a_entities[0]); ++k) {
			size_t name_len = strlen(h2a_entities[k].name);

			if (!strncasecmp(&b->data[i], h2a_entities[k].name, name_len)) {
				repl = h2a_entities[k].text;
				consumed = name_len;
				break;
			}
		}

		/* The terminating NUL stops each test below, so nothing past
		 * the end of the string is ever examined.
		 */
		if ((repl == NULL) && (b->data[i+1] == '#')) {
			size_t digits = 0;
			int value = 0;

			while ((digits < 3)
			      && isdigit((unsigned char) b->data[i+2+digits])) {
				value = (value * 10) + (b->data[i+2+digits] - '0');
				++digits;
			}
			if ((digits >= 2) && (b->data[i+2+digits] == ';')
			   && (value >= 1) && (value <= 255)) {
				numeric[0] = (char) (unsigned char) value;
				numeric[1] = 0;
				repl = numeric;
				consumed = digits + 3;	/* "&#" + digits + ";" */
			}
		}

		if (repl == NULL) {
			++i;
			continue;
		}

		repl_len = strlen(repl);
		memcpy(&b->data[i], repl, repl_len);
		memmove(&b->data[i + repl_len], &b->data[i + consumed],
			b->len - (i + consumed) + 1);
		b->len -= (consumed - repl_len);
		i += repl_len;
		from = i;
	}

	return from;
}

/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message (NUL-terminated, not modified)
 * screenwidth   = desired output screenwidth; lines are wrapped at
 *                 screenwidth-2 columns.  Values below 3 are treated as 3.
 * do_citaformat = set to 1 to indent newlines with spaces
 *
 * Returns a malloc()ed, NUL-terminated string which the caller must free().
 * Leading and trailing whitespace is stripped and the text always ends with
 * exactly one newline.  Returns NULL if inputmsg is NULL or memory runs out.
 *
 * The input is handled one character at a time: tabs, CRs and LFs become
 * spaces (runs of spaces are not collapsed), text between '<' and '>' is
 * collected as a tag, and everything else is added to a pending output
 * line which is flushed at hard and soft line breaks.  A '>' that does not
 * close a tag is kept as literal text.
 */
char *html_to_ascii(char *inputmsg, int screenwidth, int do_citaformat) {
	struct h2a_buf out = { NULL, 0, 0 };	/* finished text */
	struct h2a_buf line = { NULL, 0, 0 };	/* pending output line */
	char tag[1024];
	size_t tag_len = 0;
	size_t decoded = 0;	/* line.data before this offset is already decoded */
	size_t wrap;		/* column at which lines are broken */
	size_t i, lead, emit_len, dropped;
	const char *inptr = inputmsg;
	char *space;
	char c;
	int nest = 0;		/* Bracket nesting level */
	int done_reading = 0;
	int did_out;
	int found;

	if (inputmsg == NULL) return NULL;

	/* A wrap column below 1 could never drain the pending line */
	wrap = (screenwidth > 2) ? (size_t) (screenwidth - 2) : 1;

	if (h2a_init(&out, strlen(inputmsg) + 256) != 0) goto fail;
	if (h2a_init(&line, 256) != 0) goto fail;
	tag[0] = 0;

	do {
		/* Take one character of input and parse it */
		c = *inptr;
		if (c == 0) {
			done_reading = 1;
		}
		else {
			++inptr;

			/* Fold in all the spacing */
			if ((c == '\n') || (c == '\r') || (c == '\t')) c = ' ';

			if (c == '<') {
				++nest;
				tag_len = 0;
				tag[0] = 0;
			}

			else if ((c == '>') && (nest > 0)) {	/* We have a tag. */
				--nest;

				/* Unqualify the tag (truncate at first space) */
				space = strchr(tag, ' ');
				if (space != NULL) *space = 0;

				if (h2a_emit_tag(&line, tag, wrap) != 0) goto fail;

				/* Forget the tag so it cannot be acted on twice */
				tag_len = 0;
				tag[0] = 0;
			}

			else if (nest > 0) {
				/* Inside a tag; anything beyond the buffer is dropped */
				if (tag_len < (sizeof(tag) - 1)) {
					tag[tag_len++] = c;
					tag[tag_len] = 0;
				}
			}

			else {
				if (h2a_append(&line, &c, 1) != 0) goto fail;

				/* An entity reference can only have been
				 * completed by the ';' just added.
				 */
				if (c == ';') {
					decoded = h2a_decode_entities(&line, decoded);
				}
			}
		}

		/* Output any lines terminated with hard line breaks.  After a
		 * flush the scan carries on from index 1 of what is left and
		 * the outer loop then rescans from index 0; this order decides
		 * where the do_citaformat space is inserted, so keep it.
		 */
		do {
			did_out = 0;
			for (i = 0; (i < line.len) && (i < wrap); ++i) {
				if (line.data[i] == '\n') {
					if (h2a_append(&out, line.data, i + 1) != 0) goto fail;
					if (do_citaformat) {
						if (h2a_append(&out, " ", 1) != 0) goto fail;
					}
					dropped = i + 1;
					h2a_drop_front(&line, dropped);
					decoded = (decoded > dropped) ? (decoded - dropped) : 0;
					i = 0;
					did_out = 1;
				}
			}
		} while (did_out);

		/* Add soft line breaks: break at the last space before the
		 * wrap column, or mid-word if there is no such space.
		 */
		if (line.len > wrap) {
			found = 0;
			emit_len = wrap;
			dropped = wrap;
			for (i = 0; i < wrap; ++i) {
				if (line.data[i] == ' ') {
					emit_len = i;
					found = 1;
				}
			}
			if (found) dropped = emit_len + 1;	/* swallow the space */

			if (h2a_append(&out, line.data, emit_len) != 0) goto fail;
			if (h2a_append(&out, "\n", 1) != 0) goto fail;
			if (do_citaformat) {
				if (h2a_append(&out, " ", 1) != 0) goto fail;
			}
			h2a_drop_front(&line, dropped);
			decoded = (decoded > dropped) ? (decoded - dropped) : 0;
		}

	} while (done_reading == 0);

	/* Whatever is still pending becomes the last line */
	if (h2a_append(&out, line.data, line.len) != 0) goto fail;
	free(line.data);
	line.data = NULL;

	/* Strip leading/trailing whitespace */
	lead = 0;
	while ((lead < out.len) && isspace((unsigned char) out.data[lead])) {
		++lead;
	}
	h2a_drop_front(&out, lead);
	while ((out.len > 0) && isspace((unsigned char) out.data[out.len - 1])) {
		out.data[--out.len] = 0;
	}

	/* The text can no longer end in a newline, so supply one */
	if (h2a_append(&out, "\n", 1) != 0) goto fail;

	return out.data;

fail:
	free(line.data);
	free(out.data);
	return NULL;
}