X-Git-Url: https://code.citadel.org/?a=blobdiff_plain;f=webcit%2Fhtml2html.c;h=a91239d0a1a2fce03f64bae5b304cadd791b7277;hb=2118818bd933b8f0c694b7124c236aaa1f934a10;hp=fdab14556feeb0dba4f808114ea061d1bbf7fa45;hpb=9ab2858c5a9fe3bee998f3345528e9e48651c38a;p=citadel.git diff --git a/webcit/html2html.c b/webcit/html2html.c index fdab14556..a91239d0a 100644 --- a/webcit/html2html.c +++ b/webcit/html2html.c @@ -1,85 +1,209 @@ /* * $Id$ - * - * Output an HTML message, modifying it slightly to make sure it plays nice + */ +/** + * \defgroup HTML2HTML Output an HTML message, modifying it slightly to make sure it plays nice * with the rest of our web framework. - * + * \ingroup WebcitHttpServer */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +/*@{*/ #include "webcit.h" -#include "vcard.h" #include "webserver.h" -/* +/** + * \brief Strip surrounding single or double quotes from a string. + * + * \param s String to be stripped. + */ +void stripquotes(char *s) +{ + int len; + + if (!s) return; + + len = strlen(s); + if (len < 2) return; + + if ( ( (s[0] == '\"') && (s[len-1] == '\"') ) || ( (s[0] == '\'') && (s[len-1] == '\'') ) ) { + s[len-1] = 0; + strcpy(s, &s[1]); + } +} + + +/** + * \brief Check to see if a META tag has overridden the declared MIME character set. + * + * \param charset Character set name (left unchanged if we don't do anything) + * \param meta_http_equiv Content of the "http-equiv" portion of the META tag + * \param meta_content Content of the "content" portion of the META tag + */ +void extract_charset_from_meta(char *charset, char *meta_http_equiv, char *meta_content) +{ + char *ptr; + char buf[64]; + + if (!charset) return; + if (!meta_http_equiv) return; + if (!meta_content) return; + + + if (strcasecmp(meta_http_equiv, "Content-type")) return; + + ptr = strchr(meta_content, ';'); + if (!ptr) return; + + safestrncpy(buf, ++ptr, sizeof buf); + striplt(buf); + if (!strncasecmp(buf, "charset=", 8)) { + strcpy(charset, &buf[8]); + + /* + * The brain-damaged webmail program in Microsoft Exchange declares + * a charset of "unicode" when they really mean "UTF-8". GNU iconv + * treats "unicode" as an alias for "UTF-16" so we have to manually + * fix this here, otherwise messages generated in Exchange webmail + * show up as a big pile of weird characters. + */ + if (!strcasecmp(charset, "unicode")) { + strcpy(charset, "UTF-8"); + } + + } +} + + + +/** + * \brief Sanitize and enhance an HTML message for display. + * Also convert weird character sets to UTF-8 if necessary. + * Also fixup img src="cid:..." type inline images to fetch the image + * + * \param supplied_charset the input charset as declared in the MIME headers */ -void output_html(void) { +void output_html(const char *supplied_charset, int treat_as_wiki, int msgnum, StrBuf *Source, StrBuf *Target) { char buf[SIZ]; char *msg; char *ptr; char *msgstart; char *msgend; - char *converted_msg; + StrBuf *converted_msg; int buffer_length = 1; int line_length = 0; int content_length = 0; - int output_length = 0; char new_window[SIZ]; int brak = 0; + int alevel = 0; + int scriptlevel = 0; + int script_start_pos = (-1); int i; int linklen; + char charset[128]; +#ifdef HAVE_ICONV + iconv_t ic = (iconv_t)(-1) ; + char *ibuf; /**< Buffer of characters to be converted */ + char *obuf; /**< Buffer for converted characters */ + size_t ibuflen; /**< Length of input buffer */ + size_t obuflen; /**< Length of output buffer */ + char *osav; /**< Saved pointer to output buffer */ +#endif + if (Target == NULL) + Target = WC->WBuf; + safestrncpy(charset, supplied_charset, sizeof charset); msg = strdup(""); - sprintf(new_window, "realloc() error! " - "couldn't get %d bytes: %s

\n", + ptr = realloc(msg, buffer_length); + if (ptr == NULL) { + StrBufAppendPrintf(Target, ""); + StrBufAppendPrintf(Target, _("realloc() error! couldn't get %d bytes: %s"), buffer_length + 1, strerror(errno)); + StrBufAppendPrintf(Target, "

\n"); + while (serv_getln(buf, sizeof buf), strcmp(buf, "000")) { + /** flush */ + } + free(msg); return; } + msg = ptr; strcpy(&msg[content_length], buf); content_length += line_length; strcpy(&msg[content_length], "\n"); content_length += 1; } + else { + content_length = StrLength(Source); + free(msg); + msg = (char*) ChrPtr(Source);/* TODO: remove cast */ + buffer_length = content_length; + } - ptr = msg; + /** Do a first pass to isolate the message body */ + ptr = msg + 1; msgstart = msg; msgend = &msg[content_length]; while (ptr < msgend) { - /* Advance to next tag */ + /** Advance to next tag */ ptr = strchr(ptr, '<'); if ((ptr == NULL) || (ptr >= msgend)) break; ++ptr; if ((ptr == NULL) || (ptr >= msgend)) break; - /* Any of these tags cause everything up to and including + /** + * Look for META tags. Some messages (particularly in + * Asian locales) illegally declare a message's character + * set in the HTML instead of in the MIME headers. This + * is wrong but we have to work around it anyway. + */ + if (!strncasecmp(ptr, "META", 4)) { + + char *meta_start; + char *meta_end; + int meta_length; + char *meta; + char *meta_http_equiv; + char *meta_content; + char *spaceptr; + + meta_start = &ptr[4]; + meta_end = strchr(ptr, '>'); + if ((meta_end != NULL) && (meta_end <= msgend)) { + meta_length = meta_end - meta_start + 1; + meta = malloc(meta_length + 1); + safestrncpy(meta, meta_start, meta_length); + meta[meta_length] = 0; + striplt(meta); + if (!strncasecmp(meta, "HTTP-EQUIV=", 11)) { + meta_http_equiv = strdup(&meta[11]); + spaceptr = strchr(meta_http_equiv, ' '); + if (spaceptr != NULL) { + *spaceptr = 0; + meta_content = strdup(++spaceptr); + if (!strncasecmp(meta_content, "content=", 8)) { + strcpy(meta_content, &meta_content[8]); + stripquotes(meta_http_equiv); + stripquotes(meta_content); + extract_charset_from_meta(charset, + meta_http_equiv, meta_content); + } + free(meta_content); + } + free(meta_http_equiv); + } + free(meta); + } + } + + /** + * Any of these tags cause everything up to and including * the tag to be removed. */ if ( (!strncasecmp(ptr, "HTML", 4)) @@ -93,7 +217,8 @@ void output_html(void) { msgstart = ptr; } - /* Any of these tags cause everything including and following + /** + * Any of these tags cause everything including and following * the tag to be removed. */ if ( (!strncasecmp(ptr, "/HTML", 5)) @@ -106,95 +231,396 @@ void output_html(void) { ++ptr; } + if (msgstart > msg) { + strcpy(msg, msgstart); + } - converted_msg = malloc(content_length); - strcpy(converted_msg, ""); - ptr = msgstart; + /** Now go through the message, parsing tags as necessary. */ + converted_msg = NewStrBufPlain(NULL, content_length + 8192); + + + /** Convert foreign character sets to UTF-8 if necessary. */ +#ifdef HAVE_ICONV + if ( (strcasecmp(charset, "us-ascii")) + && (strcasecmp(charset, "UTF-8")) + && (strcasecmp(charset, "")) + ) { + lprintf(9, "Converting %s to UTF-8\n", charset); + ctdl_iconv_open("UTF-8", charset, &ic); + if (ic == (iconv_t)(-1) ) { + lprintf(5, "%s:%d iconv_open() failed: %s\n", + __FILE__, __LINE__, strerror(errno)); + } + } + if (Source == NULL) { + if (ic != (iconv_t)(-1) ) { + ibuf = msg; + ibuflen = content_length; + obuflen = content_length + (content_length / 2) ; + obuf = (char *) malloc(obuflen); + osav = obuf; + iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen); + content_length = content_length + (content_length / 2) - obuflen; + osav[content_length] = 0; + free(msg); + msg = osav; + iconv_close(ic); + } + } + else { + if (ic != (iconv_t)(-1) ) { + StrBuf *Buf = NewStrBufPlain(NULL, StrLength(Source) + 8096);; + StrBufConvert(Source, Buf, &ic); + FreeStrBuf(&Buf); + iconv_close(ic); + msg = (char*)ChrPtr(Source); /* TODO: get rid of this. */ + } + } + +#endif + + /** + * At this point, the message has been stripped down to + * only the content inside the tags, and has + * been converted to UTF-8 if it was originally in a foreign + * character set. The text is also guaranteed to be null + * terminated now. + */ + + if (converted_msg == NULL) { + StrBufAppendPrintf(Target, "Error %d: %s
%s:%d", errno, strerror(errno), __FILE__, __LINE__); + goto BAIL; + } + + ptr = msg; + msgend = strchr(msg, 0); while (ptr < msgend) { - /* Change mailto: links to WebCit mail, by replacing the + + /** Try to sanitize the html of any rogue scripts */ + if (!strncasecmp(ptr, "'))) + ) { + /* open external links to new window */ + StrBufAppendPrintf(converted_msg, new_window); + ptr = &ptr[8]; + } + else if ( (treat_as_wiki) && (strncasecmp(ptr, "
'); + char* src=strstr(ptr, " src=\"cid:"); + char *cid_start, *cid_end; + ++brak; + + if (src && + (cid_start=strchr(src,':')) && + (cid_end=strchr(cid_start,'"')) && + (cid_end < tag_end)) { + + /* copy tag and attributes up to src="cid: */ + StrBufAppendBufPlain(converted_msg, ptr, src - ptr, 0); + cid_start++; + + /* add in /webcit/mimepart//CID/ + trailing / stops dumb URL filters getting excited */ + StrBufAppendPrintf(converted_msg, + "src=\"/webcit/mimepart/%d/",msgnum); + StrBufAppendBufPlain(converted_msg, cid_start, cid_end - cid_start, 0); + StrBufAppendBufPlain(converted_msg, "/\"", -1, 0); + + ptr = cid_end+1; + } + StrBufAppendBufPlain(converted_msg, ptr, tag_end - ptr, 0); + ptr = tag_end; } - /* Turn anything that looks like a URL into a real link, as long + + /** + * Turn anything that looks like a URL into a real link, as long * as it's not inside a tag already */ - else if ( (brak == 0) + else if ( (brak == 0) && (alevel == 0) && (!strncasecmp(ptr, "http://", 7))) { + /** Find the end of the link */ + int strlenptr; linklen = 0; - /* Find the end of the link */ - for (i=0; i<=strlen(ptr); ++i) { + + strlenptr = strlen(ptr); + for (i=0; i<=strlenptr; ++i) { if ((ptr[i]==0) ||(isspace(ptr[i])) ||(ptr[i]==10) ||(ptr[i]==13) + ||(ptr[i]=='(') ||(ptr[i]==')') + ||(ptr[i]=='<') ||(ptr[i]=='>') + ||(ptr[i]=='[') ||(ptr[i]==']') + ||(ptr[i]=='"') + ||(ptr[i]=='\'') ) linklen = i; + /* did s.b. send us an entity? */ + if (ptr[i] == '&') { + if ((ptr[i+2] ==';') || + (ptr[i+3] ==';') || + (ptr[i+5] ==';') || + (ptr[i+6] ==';') || + (ptr[i+7] ==';')) + linklen = i; + } if (linklen > 0) break; } if (linklen > 0) { - content_length += (32 + linklen); - converted_msg = realloc(converted_msg, content_length); - sprintf(&converted_msg[output_length], new_window); - output_length += strlen(new_window); - converted_msg[output_length] = '\"'; - converted_msg[++output_length] = 0; - for (i=0; i"); - output_length += 2; - for (i=0; i"); - output_length += 4; + if (ltreviewptr != 0) + *ltreviewptr = '<'; + + ptr[len] = linkedchar; + + content_length += (32 + linklen); + StrBufAppendPrintf(converted_msg, "%s\"", new_window); + StrBufAppendBufPlain(converted_msg, ptr, linklen, 0); + StrBufAppendPrintf(converted_msg, "\">"); + StrBufAppendBufPlain(converted_msg, ptr, linklen, 0); + ptr += linklen; + StrBufAppendPrintf(converted_msg, ""); } } else { - /* - * We need to know when we're inside a tag, - * so we don't turn things that look like URL's into - * links, when they're already links - or image sources. - */ - if (*ptr == '<') ++brak; - if (*ptr == '>') --brak; - converted_msg[output_length] = *ptr++; - converted_msg[++output_length] = 0; + StrBufAppendBufPlain(converted_msg, ptr, 1, 0); + ptr++; + } + + /** + * We need to know when we're inside a tag, + * so we don't turn things that look like URL's into + * links, when they're already links - or image sources. + */ + if (*(ptr-1) == '<') { + ++brak; + } + if (*(ptr-1) == '>') { + --brak; + if ((scriptlevel == 0) && (script_start_pos >= 0)) { + StrBufCutRight(converted_msg, StrLength(converted_msg) - script_start_pos); + script_start_pos = (-1); + } + } + if (!strncasecmp(ptr, "", 3)) --alevel; + } + + /** uncomment these two lines to override conversion */ + /** memcpy(converted_msg, msg, content_length); */ + /** output_length = content_length; */ + + /** Output our big pile of markup */ + StrBufAppendBuf(Target, converted_msg, 0); + +BAIL: /** A little trailing vertical whitespace... */ + StrBufAppendPrintf(Target, "

\n"); + + /** Now give back the memory */ + FreeStrBuf(&converted_msg); + if ((msg != NULL) && (Source == NULL)) free(msg); +} + + + + + + +/* + * Look for URL's embedded in a buffer and make them linkable. We use a + * target window in order to keep the Citadel session in its own window. + */ +void UrlizeText(StrBuf* Target, StrBuf *Source, StrBuf *WrkBuf) +{ + int len, UrlLen, Offset, TrailerLen; + const char *start, *end, *pos; + + FlushStrBuf(Target); + + start = NULL; + len = StrLength(Source); + end = ChrPtr(Source) + len; + for (pos = ChrPtr(Source); (pos < end) && (start == NULL); ++pos) { + if (!strncasecmp(pos, "http://", 7)) + start = pos; + else if (!strncasecmp(pos, "ftp://", 6)) + start = pos; + } + + if (start == NULL) { + StrBufAppendBuf(Target, Source, 0); + return; + } + FlushStrBuf(WrkBuf); + + for (pos = ChrPtr(Source) + len; pos > start; --pos) { + if ( (!isprint(*pos)) + || (isspace(*pos)) + || (*pos == '{') + || (*pos == '}') + || (*pos == '|') + || (*pos == '\\') + || (*pos == '^') + || (*pos == '[') + || (*pos == ']') + || (*pos == '`') + || (*pos == '<') + || (*pos == '>') + || (*pos == '(') + || (*pos == ')') + ) { + end = pos; } } + + UrlLen = end - start; + StrBufAppendBufPlain(WrkBuf, start, UrlLen, 0); + + Offset = start - ChrPtr(Source); + if (Offset != 0) + StrBufAppendBufPlain(Target, ChrPtr(Source), Offset, 0); + StrBufAppendPrintf(Target, "%ca href=%c%s%c TARGET=%c%s%c%c%s%c/A%c", + LB, QU, ChrPtr(WrkBuf), QU, QU, TARGET, + QU, RB, ChrPtr(WrkBuf), LB, RB); + + TrailerLen = StrLength(Source) - (end - ChrPtr(Source)); + if (TrailerLen > 0) + StrBufAppendBufPlain(Target, end, TrailerLen, 0); +} +void url(char *buf, size_t bufsize) +{ + int len, UrlLen, Offset, TrailerLen, outpos; + char *start, *end, *pos; + char urlbuf[SIZ]; + char outbuf[SIZ]; + + start = NULL; + len = strlen(buf); + if (len > bufsize) { + lprintf(1, "URL: content longer than buffer!"); + return; + } + end = buf + len; + for (pos = buf; (pos < end) && (start == NULL); ++pos) { + if (!strncasecmp(pos, "http://", 7)) + start = pos; + if (!strncasecmp(pos, "ftp://", 6)) + start = pos; + } + + if (start == NULL) + return; - /* Output our big pile of markup */ - client_write(converted_msg, output_length); + for (pos = buf+len; pos > start; --pos) { + if ( (!isprint(*pos)) + || (isspace(*pos)) + || (*pos == '{') + || (*pos == '}') + || (*pos == '|') + || (*pos == '\\') + || (*pos == '^') + || (*pos == '[') + || (*pos == ']') + || (*pos == '`') + || (*pos == '<') + || (*pos == '>') + || (*pos == '(') + || (*pos == ')') + ) { + end = pos; + } + } + + UrlLen = end - start; + if (UrlLen > sizeof(urlbuf)){ + lprintf(1, "URL: content longer than buffer!"); + return; + } + memcpy(urlbuf, start, UrlLen); + urlbuf[UrlLen] = '\0'; - /* A little trailing vertical whitespace... */ - wprintf("

\n"); + Offset = start - buf; + if ((Offset != 0) && (Offset < sizeof(outbuf))) + memcpy(outbuf, buf, Offset); + outpos = snprintf(&outbuf[Offset], sizeof(outbuf) - Offset, + "%ca href=%c%s%c TARGET=%c%s%c%c%s%c/A%c", + LB, QU, urlbuf, QU, QU, TARGET, QU, RB, urlbuf, LB, RB); + if (outpos >= sizeof(outbuf) - Offset) { + lprintf(1, "URL: content longer than buffer!"); + return; + } - /* Now give back the memory */ - free(converted_msg); - free(msg); + TrailerLen = len - (end - start); + if (TrailerLen > 0) + memcpy(outbuf + Offset + outpos, end, TrailerLen); + if (Offset + outpos + TrailerLen > bufsize) { + lprintf(1, "URL: content longer than buffer!"); + return; + } + memcpy (buf, outbuf, Offset + outpos + TrailerLen); + *(buf + Offset + outpos + TrailerLen) = '\0'; } + + + +/*@}*/