110d9682e31d937ffa35a47ee621b75f860431eb
[citadel.git] / webcit / html2html.c
1 /*
2  * $Id$
3  *
4  * Output an HTML message, modifying it slightly to make sure it plays nice
5  * with the rest of our web framework.
6  *
7  */
8
9 #include <ctype.h>
10 #include <stdlib.h>
11 #include <unistd.h>
12 #include <stdio.h>
13 #include <fcntl.h>
14 #include <signal.h>
15 #include <sys/types.h>
16 #include <sys/wait.h>
17 #include <sys/socket.h>
18 #include <limits.h>
19 #include <netinet/in.h>
20 #include <netdb.h>
21 #include <string.h>
22 #include <pwd.h>
23 #include <errno.h>
24 #include <stdarg.h>
25 #include <pthread.h>
26 #include <signal.h>
27
28 #ifdef HAVE_ICONV
29 #include <iconv.h>
30 #endif
31
32 #include "webcit.h"
33 #include "vcard.h"
34 #include "webserver.h"
35
36
37 /*
38  * Sanitize and enhance an HTML message for display.
39  * Also convert weird character sets to UTF-8 if necessary.
40  */
41 void output_html(char *charset) {
42         char buf[SIZ];
43         char *msg;
44         char *ptr;
45         char *msgstart;
46         char *msgend;
47         char *converted_msg;
48         int buffer_length = 1;
49         int line_length = 0;
50         int content_length = 0;
51         int output_length = 0;
52         char new_window[SIZ];
53         int brak = 0;
54         int alevel = 0;
55         int i;
56         int linklen;
57 #ifdef HAVE_ICONV
58         iconv_t ic = (iconv_t)(-1) ;
59         char *ibuf;                   /* Buffer of characters to be converted */
60         char *obuf;                   /* Buffer for converted characters      */
61         size_t ibuflen;               /* Length of input buffer               */
62         size_t obuflen;               /* Length of output buffer              */
63         char *osav;                   /* Saved pointer to output buffer       */
64 #endif
65
66         msg = strdup("");
67         sprintf(new_window, "<A TARGET=\"%s\" HREF=", TARGET);
68
69         while (serv_getln(buf, sizeof buf), strcmp(buf, "000")) {
70                 line_length = strlen(buf);
71                 buffer_length = content_length + line_length + 2;
72                 msg = realloc(msg, buffer_length);
73                 if (msg == NULL) {
74                         wprintf("<B>realloc() error!  "
75                                 "couldn't get %d bytes: %s</B><br /><br />\n",
76                                 buffer_length + 1,
77                                 strerror(errno));
78                         return;
79                 }
80                 strcpy(&msg[content_length], buf);
81                 content_length += line_length;
82                 strcpy(&msg[content_length], "\n");
83                 content_length += 1;
84         }
85
86 #ifdef HAVE_ICONV
87         if ( (strcasecmp(charset, "us-ascii"))
88            && (strcasecmp(charset, "UTF-8")) ) {
89                 ic = iconv_open("UTF-8", charset);
90                 if (ic == (iconv_t)(-1) ) {
91                         lprintf(5, "iconv_open() failed: %s\n", strerror(errno));
92                 }
93         }
94         if (ic != (iconv_t)(-1) ) {
95                 ibuf = msg;
96                 ibuflen = content_length;
97                 obuflen = content_length + (content_length / 2) ;
98                 obuf = (char *) malloc(obuflen);
99                 osav = obuf;
100                 iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
101                 content_length = content_length + (content_length / 2) - obuflen;
102                 osav[content_length] = 0;
103                 free(msg);
104                 msg = osav;
105                 iconv_close(ic);
106         }
107 #endif
108
109         ptr = msg;
110         msgstart = msg;
111         msgend = &msg[content_length];
112
113         while (ptr < msgend) {
114
115                 /* Advance to next tag */
116                 ptr = strchr(ptr, '<');
117                 if ((ptr == NULL) || (ptr >= msgend)) break;
118                 ++ptr;
119                 if ((ptr == NULL) || (ptr >= msgend)) break;
120
121                 /* Any of these tags cause everything up to and including
122                  * the tag to be removed.
123                  */     
124                 if ( (!strncasecmp(ptr, "HTML", 4))
125                    ||(!strncasecmp(ptr, "HEAD", 4))
126                    ||(!strncasecmp(ptr, "/HEAD", 5))
127                    ||(!strncasecmp(ptr, "BODY", 4)) ) {
128                         ptr = strchr(ptr, '>');
129                         if ((ptr == NULL) || (ptr >= msgend)) break;
130                         ++ptr;
131                         if ((ptr == NULL) || (ptr >= msgend)) break;
132                         msgstart = ptr;
133                 }
134
135                 /* Any of these tags cause everything including and following
136                  * the tag to be removed.
137                  */
138                 if ( (!strncasecmp(ptr, "/HTML", 5))
139                    ||(!strncasecmp(ptr, "/BODY", 5)) ) {
140                         --ptr;
141                         msgend = ptr;
142                         strcpy(ptr, "");
143                         
144                 }
145
146                 ++ptr;
147         }
148
149         converted_msg = malloc(content_length);
150         strcpy(converted_msg, "");
151         ptr = msgstart;
152         while (ptr < msgend) {
153                 /* Change mailto: links to WebCit mail, by replacing the
154                  * link with one that points back to our mail room.  Due to
155                  * the way we parse URL's, it'll even handle mailto: links
156                  * that have "?subject=" in them.
157                  */
158                 if (!strncasecmp(ptr, "<A HREF=\"mailto:", 16)) {
159                         content_length += 64;
160                         converted_msg = realloc(converted_msg, content_length);
161                         sprintf(&converted_msg[output_length],
162                                 "<A HREF=\"/display_enter"
163                                 "?force_room=_MAIL_&recp=");
164                         output_length += 47;
165                         ptr = &ptr[16];
166                         ++alevel;
167                 }
168                 /* Make links open in a separate window */
169                 else if (!strncasecmp(ptr, "<A HREF=", 8)) {
170                         content_length += 64;
171                         converted_msg = realloc(converted_msg, content_length);
172                         sprintf(&converted_msg[output_length], new_window);
173                         output_length += strlen(new_window);
174                         ptr = &ptr[8];
175                         ++alevel;
176                 }
177                 /* Turn anything that looks like a URL into a real link, as long
178                  * as it's not inside a tag already
179                  */
180                 else if ( (brak == 0) && (alevel == 0)
181                      && (!strncasecmp(ptr, "http://", 7))) {
182                                 linklen = 0;
183                                 /* Find the end of the link */
184                                 for (i=0; i<=strlen(ptr); ++i) {
185                                         if ((ptr[i]==0)
186                                            ||(isspace(ptr[i]))
187                                            ||(ptr[i]==10)
188                                            ||(ptr[i]==13)
189                                            ||(ptr[i]=='(')
190                                            ||(ptr[i]==')')
191                                            ||(ptr[i]=='<')
192                                            ||(ptr[i]=='>')
193                                            ||(ptr[i]=='[')
194                                            ||(ptr[i]==']')
195                                         ) linklen = i;
196                                         if (linklen > 0) break;
197                                 }
198                                 if (linklen > 0) {
199                                         content_length += (32 + linklen);
200                                         converted_msg = realloc(converted_msg, content_length);
201                                         sprintf(&converted_msg[output_length], new_window);
202                                         output_length += strlen(new_window);
203                                         converted_msg[output_length] = '\"';
204                                         converted_msg[++output_length] = 0;
205                                         for (i=0; i<linklen; ++i) {
206                                                 converted_msg[output_length] = ptr[i];
207                                                 converted_msg[++output_length] = 0;
208                                         }
209                                         sprintf(&converted_msg[output_length], "\">");
210                                         output_length += 2;
211                                         for (i=0; i<linklen; ++i) {
212                                                 converted_msg[output_length] = *ptr++;
213                                                 converted_msg[++output_length] = 0;
214                                         }
215                                         sprintf(&converted_msg[output_length], "</A>");
216                                         output_length += 4;
217                                 }
218                 }
219                 else {
220                         /*
221                          * We need to know when we're inside a tag,
222                          * so we don't turn things that look like URL's into
223                          * links, when they're already links - or image sources.
224                          */
225                         if (*ptr == '<') ++brak;
226                         if (*ptr == '>') --brak;
227                         if (!strncasecmp(ptr, "</A>", 3)) --alevel;
228                         converted_msg[output_length] = *ptr++;
229                         converted_msg[++output_length] = 0;
230                 }
231         }
232
233         /* Output our big pile of markup */
234         client_write(converted_msg, output_length);
235
236         /* A little trailing vertical whitespace... */
237         wprintf("<br /><br />\n");
238
239         /* Now give back the memory */
240         free(converted_msg);
241         free(msg);
242 }
243