X-Git-Url: https://code.citadel.org/?a=blobdiff_plain;f=webcit-ng%2Fhtml2html.c;h=68c567b2a1359ee601dc73f97372ba52eff577f7;hb=HEAD;hp=8704525bea4c73e284a3069e5e0f43163d96b607;hpb=3063427564f6fd6d8844ac1cb5e7092320f76173;p=citadel.git

diff --git a/webcit-ng/html2html.c b/webcit-ng/html2html.c
deleted file mode 100644
index 8704525be..000000000
--- a/webcit-ng/html2html.c
+++ /dev/null
@@ -1,642 +0,0 @@
-//
-// Output an HTML message, modifying it slightly to make sure it plays nice
-// with the rest of our web framework.
-//
-// Copyright (c) 2005-2021 by the citadel.org team
-//
-// This program is open source software.  It runs great on the
-// Linux operating system (and probably elsewhere).  You can use,
-// copy, and run it under the terms of the GNU General Public
-// License version 3.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-
-#include "webcit.h"
-
-
-// Strip one pair of matching surrounding quotes (both single or both double) from s, in place.
-// A NULL pointer, or a string shorter than two characters, is left untouched.
-void stripquotes(char *s) {
-	size_t len;
-
-	if (!s)
-		return;
-	len = strlen(s);
-	if (len < 2)
-		return;
-	if (((s[0] == '\"') && (s[len - 1] == '\"')) || ((s[0] == '\'') && (s[len - 1] == '\''))) {
-		s[len - 1] = 0;
-		// Source and destination overlap, so strcpy() would be undefined behavior here.
-		memmove(s, &s[1], len - 1);	// the len - 2 remaining characters plus the terminator
-	}
-}
-
-
-// Check to see if a META tag has overridden the declared MIME character set.
-//
-// charset		Character set name, updated in place (left unchanged if we don't do anything).
-//			It must be large enough for the value, which is cut from a 64 byte scratch copy.
-// meta_http_equiv	Content of the "http-equiv" portion of the META tag
-// meta_content		Content of the "content" portion of the META tag
-void extract_charset_from_meta(char *charset, char *meta_http_equiv, char *meta_content) {
-	char *ptr;
-	char buf[64];
-
-	if (!charset || !meta_http_equiv || !meta_content)
-		return;
-
-	// Only a Content-type declaration can carry a charset parameter.
-	if (strcasecmp(meta_http_equiv, "Content-type"))
-		return;
-
-	// The parameter, if any, follows the first semicolon: "text/html; charset=..."
-	ptr = strchr(meta_content, ';');
-	if (!ptr)
-		return;
-
-	// Work on a bounded scratch copy so that the caller's string is not modified.
-	safestrncpy(buf, ++ptr, sizeof buf);
-	striplt(buf);
-	if (strncasecmp(buf, "charset=", 8))
-		return;
-
-	// Remove wandering punctuation first, so that the alias check below
-	// sees the bare name even when a stray quote or blank follows it.
-	strcpy(charset, &buf[8]);
-	if ((ptr = strchr(charset, '\"')))
-		*ptr = 0;
-	striplt(charset);
-
-	// The brain-damaged webmail program in Microsoft Exchange declares
-	// a charset of "unicode" when they really mean "UTF-8".  GNU iconv
-	// treats "unicode" as an alias for "UTF-16" so we have to manually
-	// fix this here, otherwise messages generated in Exchange webmail
-	// show up as a big pile of weird characters.
-	if (!strcasecmp(charset, "unicode"))
-		strcpy(charset, "UTF-8");
-}
-
-
-// Sanitize and enhance an HTML message for display.
-// Also convert weird character sets to UTF-8 if necessary.
-// Also fixup img src="cid:..." type inline images to fetch the image
-//
-// Source is edited in place while the message body is isolated.  The result is a new
-// StrBuf which is not freed here (NULL if it could not be allocated).  roomname and
-// msgnum are used to build the URLs that replace cid: image references.
-StrBuf *html2html(const char *supplied_charset, int treat_as_wiki, char *roomname, long msgnum, StrBuf *Source) {
-	char *msg;
-	char *ptr;
-	char *msgstart;
-	char *msgend;
-	StrBuf *converted_msg;
-	int content_length = 0;
-	char new_window[SIZ];
-	int brak = 0;
-	int alevel = 0;
-	int scriptlevel = 0;
-	int script_start_pos = (-1);
-	int i;
-	int linklen;
-	char charset[128];
-	StrBuf *BodyArea = NULL;
-
-	iconv_t ic = (iconv_t) (-1);
-	char *ibuf;		// Buffer of characters to be converted
-	char *obuf;		// Buffer for converted characters
-	size_t ibuflen;		// Length of input buffer
-	size_t obuflen;		// Length of output buffer
-	char *osav;		// Saved pointer to output buffer
-
-	StrBuf *Target = NewStrBuf();
-	if (Target == NULL) {
-		return (NULL);
-	}
-
-	safestrncpy(charset, supplied_charset, sizeof charset);
-	sprintf(new_window, "<a target=\"%s\" href=", TARGET);
-
-	content_length = StrLength(Source);
-	msg = (char *) ChrPtr(Source);
-
-	// Do a first pass to isolate the message body
-	ptr = msg + 1;
-	msgstart = msg;
-	msgend = &msg[content_length];
-
-	while (ptr < msgend) {
-
-		// Advance to next tag
-		ptr = strchr(ptr, '<');
-		if ((ptr == NULL) || (ptr >= msgend))
-			break;
-		++ptr;
-		if ((ptr == NULL) || (ptr >= msgend))
-			break;
-
-		//  Look for META tags.  Some messages (particularly in
-		//  Asian locales) illegally declare a message's character
-		//  set in the HTML instead of in the MIME headers.  This
-		//  is wrong but we have to work around it anyway.
-		if (!strncasecmp(ptr, "META", 4)) {
-
-			char *meta_start;
-			char *meta_end;
-			int meta_length;
-			char *meta;
-			char *meta_http_equiv;
-			char *meta_content;
-			char *spaceptr;
-
-			meta_start = &ptr[4];
-			meta_end = strchr(ptr, '>');
-			meta_length = (meta_end != NULL) ? (int) (meta_end - meta_start + 1) : 0;
-			meta = ((meta_end != NULL) && (meta_end <= msgend)) ? malloc(meta_length + 1) : NULL;
-			if (meta != NULL) {	// the tag is skipped if it is unterminated or the allocation failed
-				safestrncpy(meta, meta_start, meta_length);
-				meta[meta_length] = 0;
-				striplt(meta);
-				if (!strncasecmp(meta, "HTTP-EQUIV=", 11)) {
-					meta_http_equiv = strdup(&meta[11]);
-					spaceptr = (meta_http_equiv != NULL) ? strchr(meta_http_equiv, ' ') : NULL;
-					if (spaceptr != NULL) {
-						*spaceptr = 0;
-						meta_content = strdup(++spaceptr);
-						if ((meta_content != NULL) && !strncasecmp(meta_content, "content=", 8)) {
-							memmove(meta_content, &meta_content[8], strlen(&meta_content[8]) + 1);	// overlapping copy
-							stripquotes(meta_http_equiv);
-							stripquotes(meta_content);
-							extract_charset_from_meta(charset, meta_http_equiv, meta_content);
-						}
-						free(meta_content);
-					}
-					free(meta_http_equiv);
-				}
-				free(meta);
-			}
-		}
-
-		// Any of these tags cause everything up to and including
-		// the tag to be removed.
-		if ((!strncasecmp(ptr, "HTML", 4))
-		    || (!strncasecmp(ptr, "HEAD", 4))
-		    || (!strncasecmp(ptr, "/HEAD", 5))
-		    || (!strncasecmp(ptr, "BODY", 4))) {
-			char *pBody = NULL;
-
-			if (!strncasecmp(ptr, "BODY", 4)) {
-				pBody = ptr;
-			}
-			ptr = strchr(ptr, '>');
-			if ((ptr == NULL) || (ptr >= msgend))
-				break;
-			if ((pBody != NULL) && (ptr - pBody > 4)) {
-				char *src;
-				char *cid_start, *cid_end;
-
-				*ptr = '\0';
-				pBody += 4;
-				while ((isspace((unsigned char) *pBody)) && (pBody < ptr))
-					pBody++;
-				FreeStrBuf(&BodyArea);	// a repeated <body> tag replaces the attributes saved earlier
-				BodyArea = NewStrBufPlain(NULL, ptr - pBody);
-
-				if (pBody < ptr) {
-					src = strstr(pBody, "cid:");
-					if (src) {
-						cid_start = src + 4;
-						cid_end = cid_start;
-						while ((*cid_end != '"') && !isspace((unsigned char) *cid_end) && (cid_end < ptr))
-							cid_end++;
-
-						// copy tag and attributes up to src="cid:
-						StrBufAppendBufPlain(BodyArea, pBody, src - pBody, 0);
-
-						// add in /webcit/mimepart/<msgno>/CID/
-						// trailing / stops dumb URL filters getting excited
-						StrBufAppendPrintf(BodyArea, "/webcit/mimepart/%ld/", msgnum);
-						StrBufAppendBufPlain(BodyArea, cid_start, cid_end - cid_start, 0);
-
-						if (ptr - cid_end > 0)	// keep the delimiter that ended the cid, stop before *ptr
-							StrBufAppendBufPlain(BodyArea, cid_end, ptr - cid_end, 0);
-					}
-					else {
-						StrBufAppendBufPlain(BodyArea, pBody, ptr - pBody, 0);
-					}
-				}
-				*ptr = '>';
-			}
-			++ptr;
-			if ((ptr == NULL) || (ptr >= msgend))
-				break;
-			msgstart = ptr;
-		}
-
-		// Any of these tags cause everything including and following
-		// the tag to be removed.
-		if ((!strncasecmp(ptr, "/HTML", 5)) || (!strncasecmp(ptr, "/BODY", 5))) {
-			--ptr;
-			msgend = ptr;
-			strcpy(ptr, "");
-		}
-
-		++ptr;
-	}
-	if (msgstart > msg) {
-		memmove(msg, msgstart, strlen(msgstart) + 1);	// overlapping regions: strcpy() would be undefined
-	}
-
-	// Now go through the message, parsing tags as necessary.
-	converted_msg = NewStrBufPlain(NULL, content_length + 8192);
-
-	// Convert foreign character sets to UTF-8 if necessary
-	if ((strcasecmp(charset, "us-ascii"))
-	    && (strcasecmp(charset, "UTF-8"))
-	    && (strcasecmp(charset, ""))
-	    ) {
-		syslog(LOG_DEBUG, "Converting %s to UTF-8", charset);
-		ctdl_iconv_open("UTF-8", charset, &ic);
-		if (ic == (iconv_t) (-1)) {
-			syslog(LOG_WARNING, "%s:%d iconv_open() failed: %s", __FILE__, __LINE__, strerror(errno));
-		}
-	}
-	if (Source == NULL) {	// NOTE(review): Source was already handed to StrLength()/ChrPtr() above - confirm NULL can reach here
-		if (ic != (iconv_t) (-1)) {
-			ibuf = msg;
-			ibuflen = content_length;
-			obuflen = content_length + (content_length / 2);
-			obuf = (char *) malloc(obuflen);
-			osav = obuf;
-			iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
-			content_length = content_length + (content_length / 2) - obuflen;
-			osav[content_length] = 0;
-			free(msg);
-			msg = osav;
-			iconv_close(ic);
-		}
-	}
-	else {
-		if (ic != (iconv_t) (-1)) {
-			StrBuf *Buf = NewStrBufPlain(NULL, StrLength(Source) + 8096);
-			StrBufConvert(Source, Buf, &ic);
-			FreeStrBuf(&Buf);
-			iconv_close(ic);
-			msg = (char *) ChrPtr(Source);	// TODO: get rid of this.
-		}
-	}
-
-	// At this point, the message has been stripped down to
-	// only the content inside the <BODY></BODY> tags, and has
-	// been converted to UTF-8 if it was originally in a foreign
-	// character set.  The text is also guaranteed to be null
-	// terminated now.
-
-	if (converted_msg == NULL) {
-		StrBufAppendPrintf(Target, "Error %d: %s<br>%s:%d", errno, strerror(errno), __FILE__, __LINE__);
-		goto BAIL;
-	}
-
-	if (BodyArea != NULL) {	// Any attributes that were declared in the <body> tag
-		StrBufAppendBufPlain(converted_msg, HKEY("<div "), 0);	// are instead declared in this <div> tag
-		StrBufAppendBuf(converted_msg, BodyArea, 0);
-		StrBufAppendBufPlain(converted_msg, HKEY(">"), 0);
-	}
-	ptr = msg;
-	msgend = strchr(msg, 0);
-	while (ptr < msgend) {
-
-		// Try to sanitize the html of any rogue scripts
-		if (!strncasecmp(ptr, "<script", 7)) {
-			if (scriptlevel == 0) {
-				script_start_pos = StrLength(converted_msg);
-			}
-			++scriptlevel;
-		}
-		if ((scriptlevel > 0) && !strncasecmp(ptr, "</script", 8)) {	// a stray close tag must not push the level below zero
-			--scriptlevel;
-		}
-
-		// Change mailto: links to WebCit mail, by replacing the
-		// link with one that points back to our mail room.  Due to
-		// the way we parse URL's, it'll even handle mailto: links
-		// that have "?subject=" in them.
-		// FIXME change URL syntax for webcit-ng
-		if (!strncasecmp(ptr, "<a href=\"mailto:", 16)) {
-			content_length += 64;
-			StrBufAppendPrintf(converted_msg, "<a href=\"display_enter?force_room=_MAIL_?recp=");
-			ptr = &ptr[16];
-			++alevel;
-			++brak;
-		}
-
-		// Make external links open in a separate window
-		else if (!strncasecmp(ptr, "<a href=\"", 9)) {
-			++alevel;
-			++brak;
-			if (((strchr(ptr, ':') < strchr(ptr, '/'))) && ((strchr(ptr, '/') < strchr(ptr, '>')))) {
-				// open external links to new window
-				StrBufAppendPrintf(converted_msg, "%s", new_window);
-				ptr = &ptr[8];
-			}
-			else if ((treat_as_wiki)
-				   && (strncasecmp(ptr, "<a href=\"wiki?", 14))
-				   && (strncasecmp(ptr, "<a href=\"dotgoto?", 17))
-				   && (strncasecmp(ptr, "<a href=\"knrooms?", 17))
-			    ) {
-				content_length += 64;
-				StrBufAppendPrintf(converted_msg, "<a href=\"wiki?go=");
-				//StrBufUrlescAppend(converted_msg, "FIXME ROOM NAME", NULL);                   // FIXME make compatible with webcit-ng
-				StrBufAppendPrintf(converted_msg, "?page=");
-				ptr = &ptr[9];
-			}
-			else {
-				StrBufAppendPrintf(converted_msg, "<a href=\"");
-				ptr = &ptr[9];
-			}
-		}
-
-		// Fixup <img src="cid:... ...> to fetch the mime part
-		else if (!strncasecmp(ptr, "<img ", 5)) {
-			char *cid_start, *cid_end;
-			char *tag_end = strchr(ptr, '>');
-			char *src;
-			// An <img tag that is never closed (maybe someone opened an <img cid...
-			// and then ended the message) cannot be copied safely: drop the rest.
-			if (!tag_end) {
-				syslog(LOG_DEBUG, "unterminated <img tag, dropping %d remaining bytes", (int) (msgend - ptr));
-				break;
-			}
-
-			src = strstr(ptr, "src=\"cid:");
-			++brak;
-
-			if (src && isspace((unsigned char) *(src - 1))
-			    && tag_end && (cid_start = strchr(src, ':'))
-			    && (cid_end = strchr(cid_start, '"'))
-			    && (cid_end < tag_end)
-			    ) {
-				// copy tag and attributes up to src="cid:
-				StrBufAppendBufPlain(converted_msg, ptr, src - ptr, 0);
-				cid_start++;
-
-				// add in /webcit/mimepart/<msgnum>/CID/
-				// trailing / stops dumb URL filters getting excited
-				StrBufAppendPrintf(converted_msg, " src=\"/ctdl/r/");
-				StrBufXMLEscAppend(converted_msg, NULL, roomname, strlen(roomname), 0);
-				syslog(LOG_DEBUG, "room name is '%s'", roomname);
-				StrBufAppendPrintf(converted_msg, "/%ld/", msgnum);
-				StrBufAppendBufPlain(converted_msg, cid_start, cid_end - cid_start, 0);
-				StrBufAppendBufPlain(converted_msg, "\"", -1, 0);
-				ptr = cid_end + 1;
-			}
-			StrBufAppendBufPlain(converted_msg, ptr, tag_end - ptr, 0);
-			ptr = tag_end;
-		}
-
-		// Turn anything that looks like a URL into a real link, as long
-		// as it's not inside a tag already
-		else if ((brak == 0) && (alevel == 0) && ((!strncasecmp(ptr, "http://", 7)) || (!strncasecmp(ptr, "https://", 8)))) {
-			// Find the end of the link
-			int strlenptr;
-			linklen = 0;
-
-			strlenptr = strlen(ptr);
-			for (i = 0; i <= strlenptr; ++i) {
-				if ((ptr[i] == 0)
-				    || (isspace((unsigned char) ptr[i]))
-				    || (ptr[i] == 10)
-				    || (ptr[i] == 13)
-				    || (ptr[i] == '(')
-				    || (ptr[i] == ')')
-				    || (ptr[i] == '<')
-				    || (ptr[i] == '>')
-				    || (ptr[i] == '[')
-				    || (ptr[i] == ']')
-				    || (ptr[i] == '"')
-				    || (ptr[i] == '\'')
-				    )
-					linklen = i;
-				// entity tag?  (never peek past the terminator at ptr[strlenptr])
-				if (ptr[i] == '&') {
-					int room = strlenptr - i;
-					if (((room >= 2) && (ptr[i + 2] == ';')) || ((room >= 3) && (ptr[i + 3] == ';')) || ((room >= 5) && (ptr[i + 5] == ';'))
-					    || ((room >= 6) && (ptr[i + 6] == ';')) || ((room >= 7) && (ptr[i + 7] == ';')))
-						linklen = i;
-				}
-				if (linklen > 0)
-					break;
-			}
-			if (linklen > 0) {
-				char *ltreviewptr;
-				char *nbspreviewptr;
-				char linkedchar;
-				int len;
-
-				len = linklen;
-				linkedchar = ptr[len];
-				ptr[len] = '\0';
-				// spot for some subject strings tinymce tends to give us.
-				ltreviewptr = strchr(ptr, '<');
-				if (ltreviewptr != NULL) {
-					*ltreviewptr = '\0';
-					linklen = ltreviewptr - ptr;
-				}
-
-				nbspreviewptr = strstr(ptr, "&nbsp;");
-				if (nbspreviewptr != NULL) {
-					// nbspreviewptr = '\0';
-					linklen = nbspreviewptr - ptr;
-				}
-				if (ltreviewptr != 0)
-					*ltreviewptr = '<';
-
-				ptr[len] = linkedchar;
-
-				content_length += (32 + linklen);
-				StrBufAppendPrintf(converted_msg, "%s\"", new_window);
-				StrBufAppendBufPlain(converted_msg, ptr, linklen, 0);
-				StrBufAppendPrintf(converted_msg, "\">");
-				StrBufAppendBufPlain(converted_msg, ptr, linklen, 0);
-				ptr += linklen;
-				StrBufAppendPrintf(converted_msg, "</a>");
-			}
-		}
-		else {
-			StrBufAppendBufPlain(converted_msg, ptr, 1, 0);
-			ptr++;
-		}
-
-		if ((ptr >= msg) && (ptr <= msgend)) {
-			// We need to know when we're inside a tag,
-			// so we don't turn things that look like URL's into
-			// links, when they're already links - or image sources.
-			if ((ptr > msg) && (*(ptr - 1) == '<')) {
-				++brak;
-			}
-			if ((ptr > msg) && (*(ptr - 1) == '>')) {
-				--brak;
-				if ((scriptlevel == 0) && (script_start_pos >= 0)) {
-					StrBufCutRight(converted_msg, StrLength(converted_msg) - script_start_pos);
-					script_start_pos = (-1);
-				}
-			}
-			if (!strncasecmp(ptr, "</a>", 3))
-				--alevel;
-		}
-	}
-
-	if (script_start_pos >= 0) {	// a script block that was never closed must not leak into the output
-		StrBufCutRight(converted_msg, StrLength(converted_msg) - script_start_pos);
-	}
-
-	if (BodyArea != NULL) {	// Close the div where we declared attributes copied from the original <body> tag
-		StrBufAppendBufPlain(converted_msg, HKEY("</div>"), 0);
-	}
-
-	// Output our big pile of markup
-	StrBufAppendBuf(Target, converted_msg, 0);
-
-      BAIL:			// A little trailing vertical whitespace...
-	StrBufAppendPrintf(Target, "<br>\n");
-
-	// Now give back the memory
-	FreeStrBuf(&converted_msg);
-	FreeStrBuf(&BodyArea);	// released here so that the error path above does not leak it
-	if ((msg != NULL) && (Source == NULL))
-		free(msg);
-	return (Target);
-}
-
-
-// Look for URL's embedded in a buffer and make them linkable.  We use a
-// target window in order to keep the Citadel session in its own window.
-// If no URL is found, Source is appended to Target unchanged.
-//
-// Target	Receives the result; FlushStrBuf() is called on it first
-// Source	Text to scan; only the first http://, https:// or ftp:// URL is linked
-// WrkBuf	Scratch buffer, overwritten with the URL that was found
-void UrlizeText(StrBuf * Target, StrBuf * Source, StrBuf * WrkBuf) {
-	int len, UrlLen, Offset, TrailerLen;
-	const char *start, *end, *pos;
-
-	FlushStrBuf(Target);
-	start = NULL;
-	len = StrLength(Source);
-	end = ChrPtr(Source) + len;
-	// Find the first URL scheme we know how to link.
-	for (pos = ChrPtr(Source); (pos < end) && (start == NULL); ++pos) {
-		if (!strncasecmp(pos, "http://", 7))
-			start = pos;
-		else if (!strncasecmp(pos, "https://", 8))
-			start = pos;
-		else if (!strncasecmp(pos, "ftp://", 6))
-			start = pos;
-	}
-
-	// Nothing to link: pass the text through untouched.
-	if (start == NULL) {
-		StrBufAppendBuf(Target, Source, 0);
-		return;
-	}
-	FlushStrBuf(WrkBuf);
-
-	// Walk backwards so that "end" finishes on the delimiter nearest to the start of the URL.
-	// Quotes end the URL too (as in html2html); presumably QU is a quote, so the URL must not contain one.
-	for (pos = ChrPtr(Source) + len; pos > start; --pos) {
-		unsigned char c = (unsigned char) *pos;	// <ctype.h> functions need unsigned char values
-
-		if ((!isprint(c)) || (isspace(c)) || (strchr("{}|\\^[]`<>()\"'", c) != NULL)) {
-			end = pos;
-		}
-	}
-
-	UrlLen = end - start;
-	StrBufAppendBufPlain(WrkBuf, start, UrlLen, 0);
-
-	// Emit the text before the URL, the anchor itself, and then whatever follows the URL.
-	Offset = start - ChrPtr(Source);
-	if (Offset != 0)
-		StrBufAppendBufPlain(Target, ChrPtr(Source), Offset, 0);
-	StrBufAppendPrintf(Target, "%ca href=%c%s%c TARGET=%c%s%c%c%s%c/A%c",
-			   LB, QU, ChrPtr(WrkBuf), QU, QU, TARGET, QU, RB, ChrPtr(WrkBuf), LB, RB);
-
-	TrailerLen = StrLength(Source) - (end - ChrPtr(Source));
-	if (TrailerLen > 0)
-		StrBufAppendBufPlain(Target, end, TrailerLen, 0);
-}
-
-
-// Find the first http://, https:// or ftp:// URL in buf and rewrite buf in place so that
-// the URL becomes a link that opens in the TARGET window.  buf is left unchanged when it
-// holds no URL, or when the result would not fit in bufsize bytes or in the SIZ byte
-// scratch buffers (a warning is logged in that case).
-void url(char *buf, size_t bufsize) {
-	size_t len, UrlLen, Offset, TrailerLen, total;
-	int outpos;
-	char *start, *end, *pos;
-	char urlbuf[SIZ];
-	char outbuf[SIZ];
-
-	start = NULL;
-	len = strlen(buf);
-	if (len > bufsize) {
-		syslog(LOG_WARNING, "URL: content longer than buffer!");
-		return;
-	}
-	end = buf + len;
-	for (pos = buf; (pos < end) && (start == NULL); ++pos) {
-		if (!strncasecmp(pos, "http://", 7))
-			start = pos;
-		if (!strncasecmp(pos, "https://", 8))
-			start = pos;
-		if (!strncasecmp(pos, "ftp://", 6))
-			start = pos;
-	}
-
-	if (start == NULL)
-		return;
-
-	// Walk backwards so that "end" finishes on the delimiter nearest to the start of the URL.
-	// Quotes end the URL too (as in html2html); presumably QU is a quote, so the URL must not contain one.
-	for (pos = buf + len; pos > start; --pos) {
-		unsigned char c = (unsigned char) *pos;	// <ctype.h> functions need unsigned char values
-
-		if ((!isprint(c)) || (isspace(c)) || (strchr("{}|\\^[]`<>()\"'", c) != NULL)) {
-			end = pos;
-		}
-	}
-
-	UrlLen = (size_t) (end - start);
-	Offset = (size_t) (start - buf);
-	TrailerLen = len - (size_t) (end - buf);	// what follows the URL: measured from buf, not from start
-	if ((UrlLen >= sizeof(urlbuf)) || (Offset >= sizeof(outbuf))) {	// keep room for the terminator
-		syslog(LOG_WARNING, "URL: content longer than buffer!");
-		return;
-	}
-	memcpy(urlbuf, start, UrlLen);
-	urlbuf[UrlLen] = '\0';
-
-	if (Offset != 0)
-		memcpy(outbuf, buf, Offset);
-	outpos = snprintf(&outbuf[Offset], sizeof(outbuf) - Offset,
-			  "%ca href=%c%s%c TARGET=%c%s%c%c%s%c/A%c", LB, QU, urlbuf, QU, QU, TARGET, QU, RB, urlbuf, LB, RB);
-	if ((outpos < 0) || ((size_t) outpos >= sizeof(outbuf) - Offset)) {
-		syslog(LOG_WARNING, "URL: content longer than buffer!");
-		return;
-	}
-
-	// Check both destinations before copying the trailer; buf also needs room for its terminator.
-	total = Offset + (size_t) outpos + TrailerLen;
-	if ((total > sizeof(outbuf)) || (total >= bufsize)) {
-		syslog(LOG_WARNING, "URL: content longer than buffer!");
-		return;
-	}
-	if (TrailerLen > 0)
-		memcpy(outbuf + Offset + outpos, end, TrailerLen);
-	memcpy(buf, outbuf, total);
-	buf[total] = '\0';
-}