d717393b4de4cfe77739d2e6c9e116ccc22a3006
[citadel.git] / citadel / html.c
1 /*
2  * $Id$
3  *
4  * Functions which handle translation between HTML and plain text
5  * Copyright (c) 2000-2001 by Art Cancro and others.   This program is
6  * released under the terms of the GNU General Public License.
7  */
8
9 #ifdef DLL_EXPORT
10 #define IN_LIBCIT
11 #endif
12
13 #include "sysdep.h"
14 #include <stdlib.h>
15 #include <unistd.h>
16 #include <stdio.h>
17 #include <fcntl.h>
18 #include <signal.h>
19
20 #if TIME_WITH_SYS_TIME
21 # include <sys/time.h>
22 # include <time.h>
23 #else
24 # if HAVE_SYS_TIME_H
25 #  include <sys/time.h>
26 # else
27 #  include <time.h>
28 # endif
29 #endif
30
31 #include <ctype.h>
32 #include <string.h>
33 #include <errno.h>
34 #include <limits.h>
35 #include <syslog.h>
36 #include "citadel.h"
37 #include "server.h"
38 #include "dynloader.h"
39 #include "control.h"
40 #include "sysdep_decls.h"
41 #include "support.h"
42 #include "config.h"
43 #include "msgbase.h"
44 #include "tools.h"
45 #include "room_ops.h"
46 #include "html.h"
47  
48
49 /*
50  * Convert HTML to plain text.
51  *
52  * inputmsg      = pointer to raw HTML message
53  * screenwidth   = desired output screenwidth
54  * do_citaformat = set to 1 to indent newlines with spaces
55  */
56 char *html_to_ascii(char *inputmsg, int screenwidth, int do_citaformat) {
57         char inbuf[SIZ];
58         char outbuf[SIZ];
59         char tag[1024];
60         int done_reading = 0;
61         char *inptr;
62         char *outptr;
63         size_t outptr_buffer_size;
64         size_t output_len = 0;
65         int i, j, ch, did_out, rb;
66         int nest = 0;           /* Bracket nesting level */
67
68         inptr = inputmsg;
69         strcpy(inbuf, "");
70         strcpy(outbuf, "");
71
72         outptr_buffer_size = strlen(inptr) + SIZ;
73         outptr = mallok(outptr_buffer_size);
74         if (outptr == NULL) return NULL;
75         strcpy(outptr, "");
76         output_len = 0;
77
78         do {
79                 /* Fill the input buffer */
80                 if ( (done_reading == 0) && (strlen(inbuf) < 128) ) {
81
82                         ch = *inputmsg++;
83                         if (ch > 0) {
84                                 inbuf[strlen(inbuf)+1] = 0;
85                                 inbuf[strlen(inbuf)] = ch;
86                         } 
87                         else {
88                                 done_reading = 1;
89                         }
90
91                 }
92
93                 /* Do some parsing */
94                 if (strlen(inbuf)>0) {
95
96                     /* Fold in all the spacing */
97                     for (i=0; i<strlen(inbuf); ++i) {
98                         if (inbuf[i]==10) inbuf[i]=32;
99                         if (inbuf[i]==13) inbuf[i]=32;
100                         if (inbuf[i]==9) inbuf[i]=32;
101                         if ((inbuf[i]<32) || (inbuf[i]>126))
102                                 strcpy(&inbuf[i], &inbuf[i+1]);
103                     }
104                     for (i=0; i<strlen(inbuf); ++i) {
105                         while ((inbuf[i]==32)&&(inbuf[i+1]==32))
106                                 strcpy(&inbuf[i], &inbuf[i+1]);
107                     }
108
109                     for (i=0; i<strlen(inbuf); ++i) {
110
111                         ch = inbuf[i];
112
113                         if (ch == '<') {
114                                 ++nest;
115                                 strcpy(tag, "");
116                         }
117
118                         else if (ch == '>') {
119                                 if (nest > 0) --nest;
120                                 
121                                 if (!strcasecmp(tag, "P")) {
122                                         strcat(outbuf, "\n\n");
123                                 }
124
125                                 if (!strcasecmp(tag, "/DIV")) {
126                                         strcat(outbuf, "\n\n");
127                                 }
128
129                                 else if (!strcasecmp(tag, "H1")) {
130                                         strcat(outbuf, "\n\n");
131                                 }
132
133                                 else if (!strcasecmp(tag, "H2")) {
134                                         strcat(outbuf, "\n\n");
135                                 }
136
137                                 else if (!strcasecmp(tag, "H3")) {
138                                         strcat(outbuf, "\n\n");
139                                 }
140
141                                 else if (!strcasecmp(tag, "H4")) {
142                                         strcat(outbuf, "\n\n");
143                                 }
144
145                                 else if (!strcasecmp(tag, "/H1")) {
146                                         strcat(outbuf, "\n");
147                                 }
148
149                                 else if (!strcasecmp(tag, "/H2")) {
150                                         strcat(outbuf, "\n");
151                                 }
152
153                                 else if (!strcasecmp(tag, "/H3")) {
154                                         strcat(outbuf, "\n");
155                                 }
156
157                                 else if (!strcasecmp(tag, "/H4")) {
158                                         strcat(outbuf, "\n");
159                                 }
160
161                                 else if (!strcasecmp(tag, "HR")) {
162                                         strcat(outbuf, "\n ");
163                                         for (j=0; j<screenwidth-2; ++j)
164                                                 strcat(outbuf, "-");
165                                         strcat(outbuf, "\n");
166                                 }
167
168                                 else if (!strcasecmp(tag, "BR")) {
169                                         strcat(outbuf, "\n");
170                                 }
171
172                                 else if (!strcasecmp(tag, "TR")) {
173                                         strcat(outbuf, "\n");
174                                 }
175
176                                 else if (!strcasecmp(tag, "/TABLE")) {
177                                         strcat(outbuf, "\n");
178                                 }
179
180                         }
181
182                         else if ((nest > 0) && (strlen(tag)<(sizeof(tag)-1))) {
183                                 tag[strlen(tag)+1] = 0;
184                                 tag[strlen(tag)] = ch;
185                         }
186                                 
187                         else if (!nest) {
188                                 outbuf[strlen(outbuf)+1] = 0;
189                                 outbuf[strlen(outbuf)] = ch;
190                         }
191                     }
192                     strcpy(inbuf, &inbuf[i]);
193                 }
194
195                 /* Convert &; tags to the forbidden characters */
196                 if (strlen(outbuf)>0) for (i=0; i<strlen(outbuf); ++i) {
197
198                         if (!strncasecmp(&outbuf[i], "&nbsp;", 6)) {
199                                 outbuf[i] = ' ';
200                                 strcpy(&outbuf[i+1], &outbuf[i+6]);
201                         }
202
203                         else if (!strncasecmp(&outbuf[i], "&lt;", 4)) {
204                                 outbuf[i] = '<';
205                                 strcpy(&outbuf[i+1], &outbuf[i+4]);
206                         }
207
208                         else if (!strncasecmp(&outbuf[i], "&gt;", 4)) {
209                                 outbuf[i] = '>';
210                                 strcpy(&outbuf[i+1], &outbuf[i+4]);
211                         }
212
213                         else if (!strncasecmp(&outbuf[i], "&amp;", 5)) {
214                                 strcpy(&outbuf[i+1], &outbuf[i+5]);
215                         }
216
217                         else if (!strncasecmp(&outbuf[i], "&quot;", 6)) {
218                                 outbuf[i] = '\"';
219                                 strcpy(&outbuf[i+1], &outbuf[i+6]);
220                         }
221
222                         else if (!strncasecmp(&outbuf[i], "&copy;", 6)) {
223                                 outbuf[i] = '(';
224                                 outbuf[i+1] = 'c';
225                                 outbuf[i+2] = ')';
226                                 strcpy(&outbuf[i+3], &outbuf[i+6]);
227                         }
228
229                         else if (!strncasecmp(&outbuf[i], "&reg;", 5)) {
230                                 outbuf[i] = '(';
231                                 outbuf[i+1] = 'r';
232                                 outbuf[i+2] = ')';
233                                 strcpy(&outbuf[i+3], &outbuf[i+5]);
234                         }
235
236                 }
237
238                 /* Make sure the output buffer is big enough */
239                 if ((output_len + strlen(outbuf) + 128)
240                    > outptr_buffer_size) {
241                         outptr_buffer_size += 128;
242                         outptr = realloc(outptr, outptr_buffer_size);
243                 }
244
245                 /* Output any lines terminated with hard line breaks */
246                 do {
247                         did_out = 0;
248                         if (strlen(outbuf)>0)
249                             for (i = 0; i<strlen(outbuf); ++i) {
250                                 if ( (i<(screenwidth-2)) && (outbuf[i]=='\n')) {
251
252                                         strncpy(&outptr[output_len],
253                                                 outbuf, i+1);
254                                         output_len += (i+1);
255
256                                         if (do_citaformat) {
257                                                 strcpy(&outptr[output_len],
258                                                         " ");
259                                                 ++output_len;
260                                         }
261
262                                         strcpy(outbuf, &outbuf[i+1]);
263                                         i = 0;
264                                         did_out = 1;
265                                 }
266                         }
267                 } while (did_out);
268
269                 /* Add soft line breaks */
270                 if (strlen(outbuf) > (screenwidth - 2)) {
271                         rb = (-1);
272                         for (i=0; i<(screenwidth-2); ++i) {
273                                 if (outbuf[i]==32) rb = i;
274                         }
275                         if (rb>=0) {
276                                 strncpy(&outptr[output_len], outbuf, rb);
277                                 output_len += rb;
278                                 strcpy(&outptr[output_len], "\n");
279                                 output_len += 1;
280                                 if (do_citaformat) {
281                                         strcpy(&outptr[output_len], " ");
282                                         ++output_len;
283                                 }
284                                 strcpy(outbuf, &outbuf[rb+1]);
285                         } else {
286                                 strncpy(&outptr[output_len], outbuf,
287                                         screenwidth-2);
288                                 output_len += (screenwidth-2);
289                                 strcpy(&outptr[output_len], "\n");
290                                 output_len += 1;
291                                 if (do_citaformat) {
292                                         strcpy(&outptr[output_len], " ");
293                                         ++output_len;
294                                 }
295                                 strcpy(outbuf, &outbuf[screenwidth-2]);
296                         }
297                 }
298
299         } while (done_reading == 0);
300
301         strcpy(&outptr[output_len], outbuf);
302         output_len += strlen(outbuf);
303
304         /* Strip leading/trailing whitespace.  We can't do this with
305          * striplt() because it uses too many strlen()'s
306          */
307         while ((output_len > 0) && (isspace(outptr[0]))) {
308                 strcpy(outptr, &outptr[1]);
309                 --output_len;
310         }
311         while ((output_len > 0) && (isspace(outptr[output_len-1]))) {
312                 outptr[output_len-1] = 0;
313                 --output_len;
314         }
315
316         if (outptr[output_len-1] != '\n') {
317                 strcat(outptr, "\n");
318                 ++output_len;
319         }
320
321         return outptr;
322
323 }