/**
* \defgroup HTML2HTML Output an HTML message, modifying it slightly to make sure it plays nice
* with the rest of our web framework.
- *
+ * \ingroup WebcitHttpServer
*/
/*@{*/
#include "webcit.h"
#include "webserver.h"
+/**
+ * \brief Strip one pair of matching surrounding quotes from a string, in place.
+ *
+ * If the string is at least two characters long and both starts and ends
+ * with the same quote character (either a double quote or a single quote),
+ * those two characters are removed.  In every other case the string is
+ * left untouched.  Only the outermost pair is removed.
+ *
+ * \param s String to be stripped (modified in place).  NULL is ignored.
+ */
+void stripquotes(char *s)
+{
+	size_t len;
+	char quote;
+
+	if (s == NULL) return;
+
+	len = strlen(s);
+	if (len < 2) return;
+
+	/** Both ends must carry the same kind of quote character. */
+	quote = s[0];
+	if ((quote != '\"') && (quote != '\'')) return;
+	if (s[len - 1] != quote) return;
+
+	/**
+	 * Shift the interior down by one byte.  Source and destination
+	 * overlap, so this must be memmove(); strcpy() has undefined
+	 * behavior when its buffers overlap.
+	 */
+	memmove(s, &s[1], len - 2);
+	s[len - 2] = 0;
+}
+
+
+/**
+ * \brief Pick up a character set declared by an HTML META tag, if any.
+ *
+ * When the tag's http-equiv value is "Content-type" (compared without
+ * regard to case) and the text after the first ';' in its content value
+ * begins with "charset=", whatever follows "charset=" is copied into
+ * \a charset.  In all other cases \a charset is not written.
+ *
+ * \param charset Character set name; overwritten only on a successful match.
+ *                The copy is unbounded, so the caller must supply a buffer
+ *                able to hold up to 56 bytes (64-byte work buffer minus the
+ *                8-byte "charset=" prefix).
+ * \param meta_http_equiv Content of the "http-equiv" portion of the META tag
+ * \param meta_content Content of the "content" portion of the META tag
+ */
+void extract_charset_from_meta(char *charset, char *meta_http_equiv, char *meta_content)
+{
+	static const char charset_key[] = "charset=";
+	const size_t key_len = sizeof charset_key - 1;
+	char param[64];
+	char *semicolon;
+
+	/** Nothing to do unless all three arguments are present. */
+	if ((charset == NULL) || (meta_http_equiv == NULL) || (meta_content == NULL)) {
+		return;
+	}
+
+	/** Only a Content-type declaration can carry a charset parameter. */
+	if (strcasecmp(meta_http_equiv, "Content-type") != 0) {
+		return;
+	}
+
+	/** The parameter, if there is one, follows the first semicolon. */
+	semicolon = strchr(meta_content, ';');
+	if (semicolon == NULL) {
+		return;
+	}
+
+	/**
+	 * Work on a bounded local copy of the parameter text.
+	 * NOTE(review): striplt() presumably trims leading and trailing
+	 * whitespace in place -- confirm against its definition.
+	 */
+	safestrncpy(param, semicolon + 1, sizeof param);
+	striplt(param);
+
+	if (strncasecmp(param, charset_key, key_len) == 0) {
+		strcpy(charset, param + key_len);
+	}
+}
+
+
+
/**
* \brief Sanitize and enhance an HTML message for display.
- * Also convert weird character sets to UTF-8 if necessary.
- * \param charset the input charset
+ * Also convert weird character sets to UTF-8 if necessary.
+ *
+ * \param supplied_charset the input charset as declared in the MIME headers
*/
-void output_html(char *charset, int treat_as_wiki) {
+void output_html(char *supplied_charset, int treat_as_wiki) {
char buf[SIZ];
char *msg;
char *ptr;
int alevel = 0;
int i;
int linklen;
+ char charset[128];
#ifdef HAVE_ICONV
iconv_t ic = (iconv_t)(-1) ;
char *ibuf; /**< Buffer of characters to be converted */
char *osav; /**< Saved pointer to output buffer */
#endif
+ safestrncpy(charset, supplied_charset, sizeof charset);
msg = strdup("");
sprintf(new_window, "<a target=\"%s\" href=", TARGET);
content_length += 1;
}
-#ifdef HAVE_ICONV
- if ( (strcasecmp(charset, "us-ascii"))
- && (strcasecmp(charset, "UTF-8"))
- && (strcasecmp(charset, ""))
- ) {
- ic = iconv_open("UTF-8", charset);
- if (ic == (iconv_t)(-1) ) {
- lprintf(5, "%s:%d iconv_open() failed: %s\n",
- __FILE__, __LINE__, strerror(errno));
- }
- }
- if (ic != (iconv_t)(-1) ) {
- ibuf = msg;
- ibuflen = content_length;
- obuflen = content_length + (content_length / 2) ;
- obuf = (char *) malloc(obuflen);
- osav = obuf;
- iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
- content_length = content_length + (content_length / 2) - obuflen;
- osav[content_length] = 0;
- free(msg);
- msg = osav;
- iconv_close(ic);
- }
-#endif
-
+ /** Do a first pass to isolate the message body */
ptr = msg;
msgstart = msg;
msgend = &msg[content_length];
++ptr;
if ((ptr == NULL) || (ptr >= msgend)) break;
+ /**
+ * Look for META tags. Some messages (particularly in
+ * Asian locales) illegally declare a message's character
+ * set in the HTML instead of in the MIME headers. This
+ * is wrong but we have to work around it anyway.
+ */
+ if (!strncasecmp(ptr, "META", 4)) {
+
+ char *meta_start;
+ char *meta_end;
+ int meta_length;
+ char *meta;
+ char *meta_http_equiv;
+ char *meta_content;
+ char *spaceptr;
+
+ meta_start = &ptr[4];
+ meta_end = strchr(ptr, '>');
+ if ((meta_end != NULL) && (meta_end <= msgend)) {
+ meta_length = meta_end - meta_start + 1;
+ meta = malloc(meta_length + 1);
+ safestrncpy(meta, meta_start, meta_length);
+ meta[meta_length] = 0;
+ striplt(meta);
+ if (!strncasecmp(meta, "HTTP-EQUIV=", 11)) {
+ meta_http_equiv = strdup(&meta[11]);
+ spaceptr = strchr(meta_http_equiv, ' ');
+ if (spaceptr != NULL) {
+ *spaceptr = 0;
+ meta_content = strdup(++spaceptr);
+ if (!strncasecmp(meta_content, "content=", 8)) {
+ strcpy(meta_content, &meta_content[8]);
+ stripquotes(meta_http_equiv);
+ stripquotes(meta_content);
+ extract_charset_from_meta(charset,
+ meta_http_equiv, meta_content);
+ }
+ free(meta_content);
+ }
+ free(meta_http_equiv);
+ }
+ free(meta);
+ }
+ }
+
/**
* Any of these tags cause everything up to and including
* the tag to be removed.
++ptr;
}
+ if (msgstart > msg) {
+ strcpy(msg, msgstart);
+ }
+ /** Convert foreign character sets to UTF-8 if necessary. */
+#ifdef HAVE_ICONV
+ if ( (strcasecmp(charset, "us-ascii"))
+ && (strcasecmp(charset, "UTF-8"))
+ && (strcasecmp(charset, ""))
+ ) {
+ lprintf(9, "Converting %s to UTF-8\n", charset);
+ ic = ctdl_iconv_open("UTF-8", charset);
+ if (ic == (iconv_t)(-1) ) {
+ lprintf(5, "%s:%d iconv_open() failed: %s\n",
+ __FILE__, __LINE__, strerror(errno));
+ }
+ }
+ if (ic != (iconv_t)(-1) ) {
+ ibuf = msg;
+ ibuflen = content_length;
+ obuflen = content_length + (content_length / 2) ;
+ obuf = (char *) malloc(obuflen);
+ osav = obuf;
+ iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
+ content_length = content_length + (content_length / 2) - obuflen;
+ osav[content_length] = 0;
+ free(msg);
+ msg = osav;
+ iconv_close(ic);
+ }
+#endif
+
+ /**
+ * At this point, the message has been stripped down to
+ * only the content inside the <BODY></BODY> tags, and has
+ * been converted to UTF-8 if it was originally in a foreign
+ * character set. The text is also guaranteed to be null
+ * terminated now.
+ */
+
+ /** Now go through the message, parsing tags as necessary. */
converted_msg = malloc(content_length);
strcpy(converted_msg, "");
- ptr = msgstart;
+ ptr = msg;
+ msgend = strchr(msg, 0);
while (ptr < msgend) {
+
/**
* Change mailto: links to WebCit mail, by replacing the
* link with one that points back to our mail room. Due to
ptr = &ptr[8];
}
else if ( (treat_as_wiki) && (strncasecmp(ptr, "<a href=\"wiki?", 14)) ) {
- lprintf(9, "converting wiki link\n");
content_length += 64;
converted_msg = realloc(converted_msg, content_length);
sprintf(&converted_msg[output_length], "<a href=\"wiki?page=");
}
}
+ /** uncomment these two lines to override conversion */
+ /** memcpy(converted_msg, msg, content_length); */
+ /** output_length = content_length; */
+
/** Output our big pile of markup */
client_write(converted_msg, output_length);