]> code.citadel.org Git - citadel.git/blob - citadel/html.c
* html.c: don't truncate messages when wacky characters are
[citadel.git] / citadel / html.c
1 /*
2  * $Id$
3  *
4  * Functions which handle translation between HTML and plain text
5  * Copyright (c) 2000-2001 by Art Cancro and others.   This program is
6  * released under the terms of the GNU General Public License.
7  */
8
9 #ifdef DLL_EXPORT
10 #define IN_LIBCIT
11 #endif
12
13 #include "sysdep.h"
14 #include <stdlib.h>
15 #include <unistd.h>
16 #include <stdio.h>
17 #include <fcntl.h>
18 #include <signal.h>
19
20 #if TIME_WITH_SYS_TIME
21 # include <sys/time.h>
22 # include <time.h>
23 #else
24 # if HAVE_SYS_TIME_H
25 #  include <sys/time.h>
26 # else
27 #  include <time.h>
28 # endif
29 #endif
30
31 #include <ctype.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <limits.h>
35 #include <syslog.h>
36 #include "citadel.h"
37 #include "server.h"
38 #include "serv_extensions.h"
39 #include "control.h"
40 #include "sysdep_decls.h"
41 #include "support.h"
42 #include "config.h"
43 #include "msgbase.h"
44 #include "tools.h"
45 #include "room_ops.h"
46 #include "html.h"
47  
48
49 /*
50  * Convert HTML to plain text.
51  *
52  * inputmsg      = pointer to raw HTML message
53  * screenwidth   = desired output screenwidth
54  * do_citaformat = set to 1 to indent newlines with spaces
55  */
56 char *html_to_ascii(char *inputmsg, int screenwidth, int do_citaformat) {
57         char inbuf[SIZ];
58         char outbuf[SIZ];
59         char tag[1024];
60         int done_reading = 0;
61         char *inptr;
62         char *outptr;
63         size_t outptr_buffer_size;
64         size_t output_len = 0;
65         int i, j, ch, did_out, rb;
66         int nest = 0;           /* Bracket nesting level */
67
68         inptr = inputmsg;
69         strcpy(inbuf, "");
70         strcpy(outbuf, "");
71
72         outptr_buffer_size = strlen(inptr) + SIZ;
73         outptr = mallok(outptr_buffer_size);
74         if (outptr == NULL) return NULL;
75         strcpy(outptr, "");
76         output_len = 0;
77
78         do {
79                 /* Fill the input buffer */
80                 if ( (done_reading == 0) && (strlen(inbuf) < (SIZ-128)) ) {
81
82                         ch = *inptr++;
83                         if (ch != 0) {
84                                 inbuf[strlen(inbuf)+1] = 0;
85                                 inbuf[strlen(inbuf)] = ch;
86                         } 
87                         else {
88                                 done_reading = 1;
89                         }
90
91                 }
92
93                 /* Do some parsing */
94                 if (strlen(inbuf)>0) {
95
96                     /* Fold in all the spacing */
97                     for (i=0; i<strlen(inbuf); ++i) {
98                         if (inbuf[i]==10) inbuf[i]=32;
99                         if (inbuf[i]==13) inbuf[i]=32;
100                         if (inbuf[i]==9) inbuf[i]=32;
101                         if ((inbuf[i]<32) || (inbuf[i]>126)) {
102                                 inbuf[i] = '?';
103                                 /* strcpy(&inbuf[i], &inbuf[i+1]); */
104                         }
105                     }
106                     for (i=0; i<strlen(inbuf); ++i) {
107                         while ((inbuf[i]==32)&&(inbuf[i+1]==32))
108                                 strcpy(&inbuf[i], &inbuf[i+1]);
109                     }
110
111                     for (i=0; i<strlen(inbuf); ++i) {
112
113                         ch = inbuf[i];
114
115                         if (ch == '<') {
116                                 ++nest;
117                                 strcpy(tag, "");
118                         }
119
120                         else if (ch == '>') {
121                                 if (nest > 0) --nest;
122                                 
123                                 if (!strcasecmp(tag, "P")) {
124                                         strcat(outbuf, "\n\n");
125                                 }
126
127                                 if (!strcasecmp(tag, "/DIV")) {
128                                         strcat(outbuf, "\n\n");
129                                 }
130
131                                 else if (!strcasecmp(tag, "H1")) {
132                                         strcat(outbuf, "\n\n");
133                                 }
134
135                                 else if (!strcasecmp(tag, "H2")) {
136                                         strcat(outbuf, "\n\n");
137                                 }
138
139                                 else if (!strcasecmp(tag, "H3")) {
140                                         strcat(outbuf, "\n\n");
141                                 }
142
143                                 else if (!strcasecmp(tag, "H4")) {
144                                         strcat(outbuf, "\n\n");
145                                 }
146
147                                 else if (!strcasecmp(tag, "/H1")) {
148                                         strcat(outbuf, "\n");
149                                 }
150
151                                 else if (!strcasecmp(tag, "/H2")) {
152                                         strcat(outbuf, "\n");
153                                 }
154
155                                 else if (!strcasecmp(tag, "/H3")) {
156                                         strcat(outbuf, "\n");
157                                 }
158
159                                 else if (!strcasecmp(tag, "/H4")) {
160                                         strcat(outbuf, "\n");
161                                 }
162
163                                 else if (!strcasecmp(tag, "HR")) {
164                                         strcat(outbuf, "\n ");
165                                         for (j=0; j<screenwidth-2; ++j)
166                                                 strcat(outbuf, "-");
167                                         strcat(outbuf, "\n");
168                                 }
169
170                                 else if (!strcasecmp(tag, "BR")) {
171                                         strcat(outbuf, "\n");
172                                 }
173
174                                 else if (!strcasecmp(tag, "TR")) {
175                                         strcat(outbuf, "\n");
176                                 }
177
178                                 else if (!strcasecmp(tag, "/TABLE")) {
179                                         strcat(outbuf, "\n");
180                                 }
181
182                         }
183
184                         else if ((nest > 0) && (strlen(tag)<(sizeof(tag)-1))) {
185                                 tag[strlen(tag)+1] = 0;
186                                 tag[strlen(tag)] = ch;
187                         }
188                                 
189                         else if (!nest) {
190                                 outbuf[strlen(outbuf)+1] = 0;
191                                 outbuf[strlen(outbuf)] = ch;
192                         }
193                     }
194                     strcpy(inbuf, &inbuf[i]);
195                 }
196
197                 /* Convert &; tags to the forbidden characters */
198                 if (strlen(outbuf)>0) for (i=0; i<strlen(outbuf); ++i) {
199
200                         if (!strncasecmp(&outbuf[i], "&nbsp;", 6)) {
201                                 outbuf[i] = ' ';
202                                 strcpy(&outbuf[i+1], &outbuf[i+6]);
203                         }
204
205                         else if (!strncasecmp(&outbuf[i], "&lt;", 4)) {
206                                 outbuf[i] = '<';
207                                 strcpy(&outbuf[i+1], &outbuf[i+4]);
208                         }
209
210                         else if (!strncasecmp(&outbuf[i], "&gt;", 4)) {
211                                 outbuf[i] = '>';
212                                 strcpy(&outbuf[i+1], &outbuf[i+4]);
213                         }
214
215                         else if (!strncasecmp(&outbuf[i], "&amp;", 5)) {
216                                 strcpy(&outbuf[i+1], &outbuf[i+5]);
217                         }
218
219                         else if (!strncasecmp(&outbuf[i], "&quot;", 6)) {
220                                 outbuf[i] = '\"';
221                                 strcpy(&outbuf[i+1], &outbuf[i+6]);
222                         }
223
224                         else if (!strncasecmp(&outbuf[i], "&copy;", 6)) {
225                                 outbuf[i] = '(';
226                                 outbuf[i+1] = 'c';
227                                 outbuf[i+2] = ')';
228                                 strcpy(&outbuf[i+3], &outbuf[i+6]);
229                         }
230
231                         else if (!strncasecmp(&outbuf[i], "&reg;", 5)) {
232                                 outbuf[i] = '(';
233                                 outbuf[i+1] = 'r';
234                                 outbuf[i+2] = ')';
235                                 strcpy(&outbuf[i+3], &outbuf[i+5]);
236                         }
237
238                 }
239
240                 /* Make sure the output buffer is big enough */
241                 if ((output_len + strlen(outbuf) + SIZ)
242                    > outptr_buffer_size) {
243                         outptr_buffer_size += SIZ;
244                         outptr = realloc(outptr, outptr_buffer_size);
245                 }
246
247                 /* Output any lines terminated with hard line breaks */
248                 do {
249                         did_out = 0;
250                         if (strlen(outbuf)>0)
251                             for (i = 0; i<strlen(outbuf); ++i) {
252                                 if ( (i<(screenwidth-2)) && (outbuf[i]=='\n')) {
253
254                                         strncpy(&outptr[output_len],
255                                                 outbuf, i+1);
256                                         output_len += (i+1);
257
258                                         if (do_citaformat) {
259                                                 strcpy(&outptr[output_len],
260                                                         " ");
261                                                 ++output_len;
262                                         }
263
264                                         strcpy(outbuf, &outbuf[i+1]);
265                                         i = 0;
266                                         did_out = 1;
267                                 }
268                         }
269                 } while (did_out);
270
271                 /* Add soft line breaks */
272                 if (strlen(outbuf) > (screenwidth - 2)) {
273                         rb = (-1);
274                         for (i=0; i<(screenwidth-2); ++i) {
275                                 if (outbuf[i]==32) rb = i;
276                         }
277                         if (rb>=0) {
278                                 strncpy(&outptr[output_len], outbuf, rb);
279                                 output_len += rb;
280                                 strcpy(&outptr[output_len], "\n");
281                                 output_len += 1;
282                                 if (do_citaformat) {
283                                         strcpy(&outptr[output_len], " ");
284                                         ++output_len;
285                                 }
286                                 strcpy(outbuf, &outbuf[rb+1]);
287                         } else {
288                                 strncpy(&outptr[output_len], outbuf,
289                                         screenwidth-2);
290                                 output_len += (screenwidth-2);
291                                 strcpy(&outptr[output_len], "\n");
292                                 output_len += 1;
293                                 if (do_citaformat) {
294                                         strcpy(&outptr[output_len], " ");
295                                         ++output_len;
296                                 }
297                                 strcpy(outbuf, &outbuf[screenwidth-2]);
298                         }
299                 }
300
301         } while (done_reading == 0);
302
303         strcpy(&outptr[output_len], outbuf);
304         output_len += strlen(outbuf);
305
306         /* Strip leading/trailing whitespace.  We can't do this with
307          * striplt() because it uses too many strlen()'s
308          */
309         while ((output_len > 0) && (isspace(outptr[0]))) {
310                 strcpy(outptr, &outptr[1]);
311                 --output_len;
312         }
313         while ((output_len > 0) && (isspace(outptr[output_len-1]))) {
314                 outptr[output_len-1] = 0;
315                 --output_len;
316         }
317
318         if (outptr[output_len-1] != '\n') {
319                 strcat(outptr, "\n");
320                 ++output_len;
321         }
322
323         return outptr;
324
325 }