X-Git-Url: https://code.citadel.org/?a=blobdiff_plain;f=webcit%2Fhtml2html.c;h=00a2adcbb9301260c93462cac4515b6ea11dd3b2;hb=efb97b068c75a55d164acee228ed1199b66ef08c;hp=6eb490a602faf891c2a2fbb1fa5584da65157768;hpb=5f6d11980b48033e589189f1e9ca597dee456a46;p=citadel.git diff --git a/webcit/html2html.c b/webcit/html2html.c index 6eb490a60..00a2adcbb 100644 --- a/webcit/html2html.c +++ b/webcit/html2html.c @@ -1,37 +1,76 @@ /* * $Id$ - * - * Output an HTML message, modifying it slightly to make sure it plays nice + */ +/** + * \defgroup HTML2HTML Output an HTML message, modifying it slightly to make sure it plays nice * with the rest of our web framework. - * + * \ingroup WebcitHttpServer */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +/*@{*/ #include "webcit.h" #include "vcard.h" #include "webserver.h" -/* +/** + * \brief Strip surrounding single or double quotes from a string. + * + * \param s String to be stripped. + */ +void stripquotes(char *s) +{ + int len; + + if (!s) return; + + len = strlen(s); + if (len < 2) return; + + if ( ( (s[0] == '\"') && (s[len-1] == '\"') ) || ( (s[0] == '\'') && (s[len-1] == '\'') ) ) { + s[len-1] = 0; + strcpy(s, &s[1]); + } +} + + +/** + * \brief Check to see if a META tag has overridden the declared MIME character set. + * + * \param charset Character set name (left unchanged if we don't do anything) + * \param meta_http_equiv Content of the "http-equiv" portion of the META tag + * \param meta_content Content of the "content" portion of the META tag */ -void output_html(void) { +void extract_charset_from_meta(char *charset, char *meta_http_equiv, char *meta_content) +{ + char *ptr; + char buf[64]; + + if (!charset) return; + if (!meta_http_equiv) return; + if (!meta_content) return; + + + if (strcasecmp(meta_http_equiv, "Content-type")) return; + + ptr = strchr(meta_content, ';'); + if (!ptr) return; + + safestrncpy(buf, ++ptr, sizeof buf); + striplt(buf); + if (!strncasecmp(buf, "charset=", 8)) { + strcpy(charset, &buf[8]); + } +} + + + +/** + * \brief Sanitize and enhance an HTML message for display. + * Also convert weird character sets to UTF-8 if necessary. + * + * \param supplied_charset the input charset as declared in the MIME headers + */ +void output_html(char *supplied_charset, int treat_as_wiki) { char buf[SIZ]; char *msg; char *ptr; @@ -47,19 +86,30 @@ void output_html(void) { int alevel = 0; int i; int linklen; + char charset[128]; +#ifdef HAVE_ICONV + iconv_t ic = (iconv_t)(-1) ; + char *ibuf; /**< Buffer of characters to be converted */ + char *obuf; /**< Buffer for converted characters */ + size_t ibuflen; /**< Length of input buffer */ + size_t obuflen; /**< Length of output buffer */ + char *osav; /**< Saved pointer to output buffer */ +#endif + safestrncpy(charset, supplied_charset, sizeof charset); msg = strdup(""); - sprintf(new_window, "realloc() error! " - "couldn't get %d bytes: %s

\n", + wprintf(""); + wprintf(_("realloc() error! couldn't get %d bytes: %s"), buffer_length + 1, strerror(errno)); + wprintf("

\n"); return; } strcpy(&msg[content_length], buf); @@ -68,19 +118,66 @@ void output_html(void) { content_length += 1; } + /** Do a first pass to isolate the message body */ ptr = msg; msgstart = msg; msgend = &msg[content_length]; while (ptr < msgend) { - /* Advance to next tag */ + /** Advance to next tag */ ptr = strchr(ptr, '<'); if ((ptr == NULL) || (ptr >= msgend)) break; ++ptr; if ((ptr == NULL) || (ptr >= msgend)) break; - /* Any of these tags cause everything up to and including + /** + * Look for META tags. Some messages (particularly in + * Asian locales) illegally declare a message's character + * set in the HTML instead of in the MIME headers. This + * is wrong but we have to work around it anyway. + */ + if (!strncasecmp(ptr, "META", 4)) { + + char *meta_start; + char *meta_end; + int meta_length; + char *meta; + char *meta_http_equiv; + char *meta_content; + char *spaceptr; + + meta_start = &ptr[4]; + meta_end = strchr(ptr, '>'); + if ((meta_end != NULL) && (meta_end <= msgend)) { + meta_length = meta_end - meta_start + 1; + meta = malloc(meta_length + 1); + safestrncpy(meta, meta_start, meta_length); + meta[meta_length] = 0; + striplt(meta); + if (!strncasecmp(meta, "HTTP-EQUIV=", 11)) { + meta_http_equiv = strdup(&meta[11]); + spaceptr = strchr(meta_http_equiv, ' '); + if (spaceptr != NULL) { + *spaceptr = 0; + meta_content = strdup(++spaceptr); + if (!strncasecmp(meta_content, "content=", 8)) { + strcpy(meta_content, &meta_content[8]); + stripquotes(meta_http_equiv); + stripquotes(meta_content); + extract_charset_from_meta(charset, + meta_http_equiv, meta_content); + } + free(meta_content); + } + free(meta_http_equiv); + } + free(meta); + } + } + + /** + * Any of these tags cause everything up to and including * the tag to be removed. */ if ( (!strncasecmp(ptr, "HTML", 4)) @@ -94,7 +191,8 @@ void output_html(void) { msgstart = ptr; } - /* Any of these tags cause everything including and following + /** + * Any of these tags cause everything including and following * the tag to be removed. */ if ( (!strncasecmp(ptr, "/HTML", 5)) @@ -107,42 +205,98 @@ void output_html(void) { ++ptr; } + if (msgstart > msg) { + strcpy(msg, msgstart); + } + + /** Convert foreign character sets to UTF-8 if necessary. */ +#ifdef HAVE_ICONV + if ( (strcasecmp(charset, "us-ascii")) + && (strcasecmp(charset, "UTF-8")) + && (strcasecmp(charset, "")) + ) { + lprintf(9, "Converting %s to UTF-8\n", charset); + ic = iconv_open("UTF-8", charset); + if (ic == (iconv_t)(-1) ) { + lprintf(5, "%s:%d iconv_open() failed: %s\n", + __FILE__, __LINE__, strerror(errno)); + } + } + if (ic != (iconv_t)(-1) ) { + ibuf = msg; + ibuflen = content_length; + obuflen = content_length + (content_length / 2) ; + obuf = (char *) malloc(obuflen); + osav = obuf; + iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen); + content_length = content_length + (content_length / 2) - obuflen; + osav[content_length] = 0; + free(msg); + msg = osav; + iconv_close(ic); + } +#endif + + /** FIXME At this point, shigerugo's messages are still clean. + * Figure out what is mangling them below. + */ + /** Now go through the message, parsing tags as necessary. */ converted_msg = malloc(content_length); strcpy(converted_msg, ""); ptr = msgstart; while (ptr < msgend) { - /* Change mailto: links to WebCit mail, by replacing the + /** + * Change mailto: links to WebCit mail, by replacing the * link with one that points back to our mail room. Due to * the way we parse URL's, it'll even handle mailto: links * that have "?subject=" in them. */ - if (!strncasecmp(ptr, "

\n"); + /** A little trailing vertical whitespace... */ + wprintf("

\n"); - /* Now give back the memory */ + /** Now give back the memory */ free(converted_msg); free(msg); } +/*@}*/