* Removed the built-in memory leak checker. It wasn't threadsafe and
[citadel.git] / citadel / html.c
1 /*
2  * $Id$
3  *
4  * Functions which handle translation between HTML and plain text
5  * Copyright (c) 2000-2001 by Art Cancro and others.   This program is
6  * released under the terms of the GNU General Public License.
7  */
8
9 #ifdef DLL_EXPORT
10 #define IN_LIBCIT
11 #endif
12
13 #include "sysdep.h"
14 #include <stdlib.h>
15 #include <unistd.h>
16 #include <stdio.h>
17 #include <fcntl.h>
18 #include <signal.h>
19
20 #if TIME_WITH_SYS_TIME
21 # include <sys/time.h>
22 # include <time.h>
23 #else
24 # if HAVE_SYS_TIME_H
25 #  include <sys/time.h>
26 # else
27 #  include <time.h>
28 # endif
29 #endif
30
31 #include <ctype.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <limits.h>
35 #include "citadel.h"
36 #include "server.h"
37 #include "serv_extensions.h"
38 #include "control.h"
39 #include "sysdep_decls.h"
40 #include "support.h"
41 #include "config.h"
42 #include "msgbase.h"
43 #include "tools.h"
44 #include "room_ops.h"
45 #include "html.h"
46  
47
/*
 * Append text to the NUL-terminated pending-line buffer of capacity cap.
 * Whatever does not fit is dropped, so the buffer can never overflow.
 */
static void h2a_pending_add(char *pending, size_t cap, const char *text) {
	size_t used = strlen(pending);
	size_t room = cap - 1 - used;
	size_t n = strlen(text);

	if (n > room) n = room;
	memcpy(pending + used, text, n);
	pending[used + n] = '\0';
}

/*
 * Remove the first n bytes (n <= strlen(s)) of a string in place.
 * memmove() is required because source and destination overlap.
 */
static void h2a_drop_front(char *s, size_t n) {
	memmove(s, s + n, strlen(s + n) + 1);
}

/*
 * Return the line-break text a tag produces when it is closed,
 * or NULL if the tag produces none.  "HR" is handled by the caller
 * because its output depends on the screen width.
 */
static const char *h2a_tag_break(const char *tag) {
	static const char *const two_breaks[] = {
		"P", "/DIV", "H1", "H2", "H3", "H4"
	};
	static const char *const one_break[] = {
		"/H1", "/H2", "/H3", "/H4", "BR", "TR", "/TABLE"
	};
	size_t k;

	for (k = 0; k < sizeof(two_breaks) / sizeof(two_breaks[0]); ++k) {
		if (!strcasecmp(tag, two_breaks[k])) return "\n\n";
	}
	for (k = 0; k < sizeof(one_break) / sizeof(one_break[0]); ++k) {
		if (!strcasecmp(tag, one_break[k])) return "\n";
	}
	return NULL;
}

/*
 * Replace the supported "&...;" entities in a string, in place.
 * Every replacement is shorter than the entity it replaces.
 */
static void h2a_decode_entities(char *s) {
	static const struct {
		const char *name;
		const char *text;
	} ents[] = {
		{ "&nbsp;", " "   },
		{ "&lt;",   "<"   },
		{ "&gt;",   ">"   },
		{ "&amp;",  "&"   },
		{ "&quot;", "\""  },
		{ "&copy;", "(c)" },
		{ "&reg;",  "(r)" }
	};
	size_t i, k, nlen, tlen;

	for (i = 0; s[i] != '\0'; ++i) {
		if (s[i] != '&') continue;
		for (k = 0; k < sizeof(ents) / sizeof(ents[0]); ++k) {
			nlen = strlen(ents[k].name);
			if (strncasecmp(&s[i], ents[k].name, nlen)) continue;
			tlen = strlen(ents[k].text);
			memcpy(&s[i], ents[k].text, tlen);
			memmove(&s[i + tlen], &s[i + nlen],
				strlen(&s[i + nlen]) + 1);
			break;
		}
	}
}

/*
 * Make sure the result buffer can hold at least need bytes, growing it
 * in SIZ-byte steps.  Returns the (possibly moved) buffer, or NULL after
 * freeing the old buffer if memory could not be obtained.
 */
static char *h2a_reserve(char *buf, size_t *cap, size_t need) {
	size_t newcap = *cap;
	char *grown;

	if (need <= newcap) return buf;
	while (newcap < need) newcap += SIZ;

	grown = realloc(buf, newcap);
	if (grown == NULL) {
		free(buf);
		return NULL;
	}
	*cap = newcap;
	return grown;
}

/*
 * Write n bytes of src, a newline and (if indent is set) one space to
 * out starting at offset at.  Returns the new output length.  The caller
 * must have reserved the space; no NUL terminator is written.
 */
static size_t h2a_emit_line(char *out, size_t at, const char *src,
			    size_t n, int indent) {
	memcpy(out + at, src, n);
	at += n;
	out[at++] = '\n';
	if (indent) out[at++] = ' ';
	return at;
}

/*
 * Convert HTML to plain text.
 *
 * inputmsg      = pointer to raw HTML message (NUL-terminated)
 * screenwidth   = desired output screenwidth; lines are wrapped at
 *                 screenwidth-2 columns (values below 3 wrap at column 1)
 * do_citaformat = set to 1 to indent newlines with spaces
 *
 * Returns a malloc()ed string which the caller must free(), or NULL if
 * inputmsg is NULL or memory could not be allocated.  The result has
 * leading and trailing whitespace removed and always ends with exactly
 * one newline (so empty input yields "\n").
 *
 * The input is consumed one character at a time: tabs, CR and LF become
 * spaces, other bytes outside 32..126 become '?', text inside <...> is
 * collected as a tag name, and everything else is appended to a pending
 * line which is flushed to the result at hard and soft line breaks.
 *
 * NOTE(review): behaviours kept as they were for compatibility:
 *  - tags are recognised only by exact name, so "<P ALIGN=...>" is ignored;
 *  - a '>' is never copied to the output, and a '>' seen while no tag is
 *    open re-applies the break of the most recently read tag;
 *  - the entity pass rescans the whole pending line on every character,
 *    so "&amp;lt;" ends up as "<".
 */
char *html_to_ascii(char *inputmsg, int screenwidth, int do_citaformat) {
	char pending[SIZ];	/* text not yet committed to the result */
	char tag[1024];		/* name of the tag currently/last read */
	char one[2];
	const char *inptr;
	const char *brk;
	char *outptr;
	size_t outptr_buffer_size;
	size_t output_len = 0;
	size_t len, used, lead;
	int done_reading = 0;
	int nest = 0;		/* Bracket nesting level */
	int wrap;		/* column at which lines are broken */
	int i, j, rb, did_out;
	unsigned char ch;

	if (inputmsg == NULL) return NULL;

	/* A wrap column below 1 would never let the pending line drain. */
	wrap = (screenwidth > 2) ? (screenwidth - 2) : 1;

	inptr = inputmsg;
	pending[0] = '\0';
	tag[0] = '\0';		/* a stray '>' may be compared before any '<' */
	one[1] = '\0';

	outptr_buffer_size = strlen(inptr) + SIZ;
	outptr = malloc(outptr_buffer_size);
	if (outptr == NULL) return NULL;
	outptr[0] = '\0';

	do {
		/* Fetch and classify the next input character */
		ch = (unsigned char) *inptr;
		if (ch == 0) {
			done_reading = 1;
		}
		else {
			++inptr;

			/* Fold in all the spacing, mask non-printables */
			if ((ch == 10) || (ch == 13) || (ch == 9)) {
				ch = ' ';
			}
			else if ((ch < 32) || (ch > 126)) {
				ch = '?';
			}

			if (ch == '<') {
				++nest;
				tag[0] = '\0';
			}
			else if (ch == '>') {
				if (nest > 0) --nest;

				brk = h2a_tag_break(tag);
				if (brk != NULL) {
					h2a_pending_add(pending,
						sizeof(pending), brk);
				}
				else if (!strcasecmp(tag, "HR")) {
					h2a_pending_add(pending,
						sizeof(pending), "\n ");
					used = strlen(pending);
					for (j = 0; (j < wrap)
					   && (used < sizeof(pending) - 1);
					   ++j) {
						pending[used++] = '-';
					}
					pending[used] = '\0';
					h2a_pending_add(pending,
						sizeof(pending), "\n");
				}
			}
			else if ((nest > 0)
			   && (strlen(tag) < (sizeof(tag) - 1))) {
				used = strlen(tag);
				tag[used] = (char) ch;
				tag[used + 1] = '\0';
			}
			else if (!nest) {
				one[0] = (char) ch;
				h2a_pending_add(pending, sizeof(pending), one);
			}
		}

		/* Convert &; tags to the forbidden characters */
		h2a_decode_entities(pending);

		/*
		 * Make sure the output buffer is big enough.  This pass
		 * writes at most every pending byte plus one indent space
		 * per flushed newline, plus a newline and indent for the
		 * soft break.
		 */
		len = strlen(pending);
		outptr = h2a_reserve(outptr, &outptr_buffer_size,
				output_len + (2 * len) + 8);
		if (outptr == NULL) return NULL;

		/*
		 * Output any lines terminated with hard line breaks.
		 * After a flush the scan resumes at index 1 of what is
		 * left; index 0 is picked up by the next pass.  This order
		 * decides where indent spaces go, so it is kept as is.
		 */
		do {
			did_out = 0;
			for (i = 0; (i < wrap) && ((size_t) i < len); ++i) {
				if (pending[i] != '\n') continue;

				output_len = h2a_emit_line(outptr,
					output_len, pending, (size_t) i,
					do_citaformat);
				h2a_drop_front(pending, (size_t) i + 1);
				len -= (size_t) i + 1;
				i = 0;
				did_out = 1;
			}
		} while (did_out);

		/* Add soft line breaks (at most one per input character) */
		if (len > (size_t) wrap) {
			rb = (-1);
			for (i = 0; i < wrap; ++i) {
				if (pending[i] == ' ') rb = i;
			}
			if (rb >= 0) {
				/* Break at the last space; the space is eaten */
				output_len = h2a_emit_line(outptr,
					output_len, pending, (size_t) rb,
					do_citaformat);
				h2a_drop_front(pending, (size_t) rb + 1);
			} else {
				/* No space to break at: cut the word */
				output_len = h2a_emit_line(outptr,
					output_len, pending, (size_t) wrap,
					do_citaformat);
				h2a_drop_front(pending, (size_t) wrap);
			}
		}

	} while (done_reading == 0);

	/* Commit whatever is left, with room for the final newline */
	len = strlen(pending);
	outptr = h2a_reserve(outptr, &outptr_buffer_size,
			output_len + len + 2);
	if (outptr == NULL) return NULL;
	memcpy(&outptr[output_len], pending, len + 1);
	output_len += len;

	/* Strip leading whitespace with a single move */
	lead = 0;
	while ((lead < output_len)
	   && (isspace((unsigned char) outptr[lead]))) {
		++lead;
	}
	if (lead > 0) {
		memmove(outptr, &outptr[lead], output_len - lead + 1);
		output_len -= lead;
	}

	/* Strip trailing whitespace */
	while ((output_len > 0)
	   && (isspace((unsigned char) outptr[output_len - 1]))) {
		outptr[--output_len] = '\0';
	}

	/*
	 * The text can no longer end in a newline, so always add one.
	 * Doing it unconditionally also avoids looking at outptr[-1]
	 * when nothing is left.
	 */
	outptr[output_len++] = '\n';
	outptr[output_len] = '\0';

	return outptr;

}