From a2401aea9ac970fa35337c5665f0b725bcdb4ccc Mon Sep 17 00:00:00 2001
From: =?utf8?q?Wilfried=20G=C3=B6esgens?= <willi@citadel.org>
Date: Sat, 24 May 2008 17:18:52 +0000
Subject: [PATCH] * use iconv in citserver too * DE-QP and IConv translate
 harvested mail addresses

---
 citadel/configure.ac          |  60 +++++++++
 citadel/internet_addressing.c | 241 ++++++++++++++++++++++++++++++++++
 2 files changed, 301 insertions(+)

diff --git a/citadel/configure.ac b/citadel/configure.ac
index cd32b4496..78929b3a6 100644
--- a/citadel/configure.ac
+++ b/citadel/configure.ac
@@ -129,6 +129,65 @@ AC_ARG_WITH(docdir,
 )
 
 
+dnl Probe for iconv (libc first, then an external libiconv); --disable-iconv skips both probes.
+AC_ARG_ENABLE(iconv,
+	[  --disable-iconv         do not use iconv charset conversion],
+	want_iconv="$enableval", want_iconv=yes)
+ok_iconv=no
+if test "$want_iconv" != no; then
+	AC_MSG_CHECKING(Checking to see if your system supports iconv)
+	AC_TRY_RUN([
+		#include <iconv.h>
+		int main(void) {
+			iconv_t ic = iconv_open("UTF-8", "us-ascii");
+			if (ic != (iconv_t)(-1)) iconv_close(ic);
+			return 0;
+		}
+	],
+		[
+		  ok_iconv=yes
+		  AC_MSG_RESULT([yes])
+		],
+		[
+		  ok_iconv=no
+		  AC_MSG_RESULT([no])
+		]
+	)
+fi
+dnl Check for iconv in external libiconv, only if it is wanted and libc did not provide it
+if test "$want_iconv" != no && test "$ok_iconv" = no; then
+	AC_MSG_CHECKING(Checking for an external libiconv)
+	OLD_LDFLAGS="$LDFLAGS"
+	LDFLAGS="$LDFLAGS -liconv"
+	AC_TRY_RUN([
+			#include <iconv.h>
+			int main(void) {
+				iconv_t ic = iconv_open("UTF-8", "us-ascii");
+				if (ic != (iconv_t)(-1)) iconv_close(ic);
+				return 0;
+			}
+		],
+			[
+			  ok_iconv=yes
+			  AC_MSG_RESULT([yes])
+			],
+			[
+			  ok_iconv=no
+			  LDFLAGS="$OLD_LDFLAGS"
+			  AC_MSG_RESULT([no])
+			]
+		)
+fi
+if test "$ok_iconv" != "no"; then
+	AC_MSG_RESULT(Citadel will be built with character set conversion.)
+	AC_DEFINE(HAVE_ICONV,[],[whether we have iconv for charset conversion])
+else
+	AC_MSG_RESULT(Citadel will be built without character set conversion.)
+fi
+
+AC_CHECK_LIB(intl, libintl_bindtextdomain, [LDFLAGS="$LDFLAGS -lintl"])
+
+
 
 AC_ARG_ENABLE(threaded-client, [  --disable-threaded-client
 			  disable multithreaded client])
@@ -908,6 +967,7 @@ fi
 echo ------------------------------------------------------------------------
 echo 'zlib compression:                ' $ok_zlib
 echo 'LDAP support:                    ' $ok_ldap
+echo 'Character set conversion support:' $ok_iconv
 echo 'DSpam Scanning support:          ' $ok_libdspam
 echo 
 echo 'Note: if you are not using Linux, make sure you are using GNU make'
diff --git a/citadel/internet_addressing.c b/citadel/internet_addressing.c
index 3b7e0cd90..53924ba80 100644
--- a/citadel/internet_addressing.c
+++ b/citadel/internet_addressing.c
@@ -50,6 +50,246 @@
 #endif
 
 
+#ifdef HAVE_ICONV
+#include <iconv.h>
+
+/*
+ * iconv_open() with a fallback for Microsoft style charset names.
+ * If the plain lookup fails and the source name is five characters long and
+ * begins with "MS" (any case), e.g. 'MS950', retry with "CP" in its place ('CP950').
+ *
+ * tocode	name of the encoding to convert to
+ * fromcode	name of the encoding to convert from
+ */
+iconv_t ctdl_iconv_open(const char *tocode, const char *fromcode)
+{
+	char cp_name[64];
+	iconv_t cd = iconv_open(tocode, fromcode);
+	if (cd != (iconv_t)(-1)) {
+		return cd;	/* the name was understood as given */
+	}
+	if ((strlen(fromcode) != 5) || (strncasecmp(fromcode, "MS", 2) != 0)) {
+		return cd;	/* not an MSnnn style name, hand back the failure */
+	}
+	safestrncpy(cp_name, fromcode, sizeof cp_name);
+	cp_name[0] = 'C';
+	cp_name[1] = 'P';
+	return iconv_open(tocode, cp_name);
+}
+
+
+
+/* Locate the "?=" closing the RFC2047 encoded-word that starts at bptr ("=?charset?X?text?="); NULL if none. */
+static inline char *FindNextEnd (char *bptr)
+{
+	char *sep;
+
+	/* callers pass a pointer to "=?"; refuse anything too short to skip over it */
+	if ((bptr[0] == '\0') || (bptr[1] == '\0')) return NULL;
+	/* the '?' that terminates the charset name */
+	sep = strchr(bptr + 2, '?');
+	if (sep == NULL) return NULL;
+	/* RFC2047 encoding letters are case-insensitive, so accept b/q as well as B/Q */
+	if ((sep[1] != '\0') && (strchr("BbQq", sep[1]) != NULL) && (sep[2] == '?')) {
+		return strstr(sep + 3, "?=");	/* well formed: search behind the "?X?" marker */
+	}
+	return strstr(bptr, "?=");	/* sort of half valid encoding: take the first "?=" there is */
+}
+
+/* Convert a header string to UTF-8 in place: text with bytes outside 32..126 is read as iso-8859-1, RFC2047
+ * words such as =?koi8-r?B?78bP0s3Mxc7JxSDXz9rE1dvO2c3JINvB0sHNySDP?= are decoded and passed through iconv.
+ * buf	NUL-terminated string, rewritten in place.  No capacity is passed in and the result can be
+ *	longer than the input - NOTE(review): confirm that every caller's buffer leaves room for that. */
+void utf8ify_rfc822_string(char *buf) {
+	char *start, *end, *next, *nextend, *ptr;
+	char newbuf[1024];
+	char charset[128];
+	char encoding[16];
+	char istr[1024];
+	iconv_t ic = (iconv_t)(-1) ;
+	char *ibuf;			/**< Buffer of characters to be converted */
+	char *obuf;			/**< Buffer for converted characters */
+	size_t ibuflen;			/**< Length of input buffer */
+	size_t obuflen;			/**< Length of output buffer */
+	char *isav;			/**< Saved pointer to input buffer */
+	char *osav;			/**< Saved pointer to output buffer */
+	int passes = 0;
+	int i, len, delta;
+	int illegal_non_rfc2047_encoding = 0;
+
+	/* Sometimes, badly formed messages contain strings which were simply
+	 *  written out directly in some foreign character set instead of
+	 *  using RFC2047 encoding.  This is illegal but we will attempt to
+	 *  handle it anyway by converting from a user-specified default
+	 *  charset to UTF-8 if we see any nonprintable characters.
+	 */
+	len = strlen(buf);
+	for (i=0; i<len; ++i) {
+		if ((buf[i] < 32) || (buf[i] > 126)) {
+			illegal_non_rfc2047_encoding = 1;
+			break;	/* one hit is enough */
+		}
+	}
+	if (illegal_non_rfc2047_encoding) {
+		const char *default_header_charset = "iso-8859-1";
+		if ( (strcasecmp(default_header_charset, "UTF-8")) && (strcasecmp(default_header_charset, "us-ascii")) ) {
+			ic = ctdl_iconv_open("UTF-8", default_header_charset);
+			if (ic != (iconv_t)(-1) ) {
+				isav = ibuf = malloc(1024);
+				osav = obuf = malloc(1024);
+				if ((isav != NULL) && (osav != NULL)) {	/* on allocation failure keep buf as it is */
+					safestrncpy(ibuf, buf, 1024);
+					ibuflen = strlen(ibuf);
+					obuflen = 1023;		/* keep one byte in reserve for the terminator */
+					iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
+					osav[1023-obuflen] = 0;
+					strcpy(buf, osav);
+				}
+				free(osav);
+				iconv_close(ic);
+				free(isav);
+			}
+		}
+	}
+
+	/* pre evaluate the first pair */
+	nextend = end = NULL;
+	len = strlen(buf);
+	start = strstr(buf, "=?");
+	if (start != NULL)
+		end = FindNextEnd (start);
+
+	while ((start != NULL) && (end != NULL))
+	{
+		next = strstr(end, "=?");
+		if (next != NULL)
+			nextend = FindNextEnd(next);
+		if (nextend == NULL)
+			next = NULL;
+
+		/* did we find two partitions */
+		if ((next != NULL) &&
+		    ((next - end) > 2))
+		{
+			ptr = end + 2;
+			while ((ptr < next) &&
+			       (isspace((unsigned char)*ptr) ||
+				(*ptr == '\r') ||
+				(*ptr == '\n') ||
+				(*ptr == '\t')))
+				ptr ++;
+			/* did we find a gap just filled with blanks? */
+			if (ptr == next)
+			{
+				memmove (end + 2,
+					 next,
+					 len - (next - buf));	/* the bytes from next up to the end of the string */
+
+				/* now terminate the string at its new, shorter length */
+				delta = (next - end) - 2;
+				len -= delta;
+				buf[len] = '\0';
+
+				/* move next to its new location. */
+				next -= delta;
+				nextend -= delta;
+			}
+		}
+		/* our next-pair is our new first pair now. */
+		start = next;
+		end = nextend;
+	}
+
+	/* Now we handle foreign character sets properly encoded
+	 * in RFC2047 format.
+	 */
+	while (start=strstr(buf, "=?"), end=((start != NULL) ? FindNextEnd(start) : NULL),
+		((start != NULL) && (end != NULL) && (end > start)) )
+	{
+		extract_token(charset, start, 1, '?', sizeof charset);
+		extract_token(encoding, start, 2, '?', sizeof encoding);
+		extract_token(istr, start, 3, '?', sizeof istr);
+
+		isav = ibuf = malloc(1024);
+		if (ibuf == NULL) return;		/* out of memory: leave the remainder undecoded */
+		if (!strcasecmp(encoding, "B")) {	/**< base64 */
+			ibuflen = CtdlDecodeBase64(ibuf, istr, strlen(istr));
+		}
+		else if (!strcasecmp(encoding, "Q")) {	/**< quoted-printable */
+			size_t qlen;
+			size_t pos;
+
+			qlen = strlen(istr);
+			pos = 0;
+			while (pos < qlen)
+			{
+				if (istr[pos] == '_') istr[pos] = ' ';	/* in the Q encoding '_' stands for a blank */
+				pos++;
+			}
+
+			ibuflen = CtdlDecodeQuotedPrintable(ibuf, istr, qlen);
+		}
+		else {
+			strcpy(ibuf, istr);		/**< unknown encoding */
+			ibuflen = strlen(istr);
+		}
+
+		ic = ctdl_iconv_open("UTF-8", charset);
+		osav = obuf = (ic != (iconv_t)(-1)) ? malloc(1024) : NULL;
+		if (osav != NULL) {
+			obuflen = 1023;			/* keep one byte in reserve for the terminator */
+			iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
+			osav[1023-obuflen] = 0;
+
+			end = start;
+			end++;
+			strcpy(start, "");
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			if (*end != '\0') memmove(end, end + 1, strlen(end));	/* drop the '=' left over from "?=" */
+
+			snprintf(newbuf, sizeof newbuf, "%s%s%s", buf, osav, end);
+			strcpy(buf, newbuf);
+			free(osav);
+			iconv_close(ic);
+		}
+		else {
+			if (ic != (iconv_t)(-1)) iconv_close(ic);	/* charset was fine, only the allocation failed */
+			end = start;
+			end++;
+			strcpy(start, "");
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			if (*end != '\0') memmove(end, end + 1, strlen(end));	/* drop the '=' left over from "?=" */
+
+			snprintf(newbuf, sizeof newbuf, "%s(unreadable)%s", buf, end);
+			strcpy(buf, newbuf);
+		}
+
+		free(isav);
+
+		/*
+		 * Since spammers will go to all sorts of absurd lengths to get their
+		 * messages through, there are LOTS of corrupt headers out there.
+		 * So, prevent a really badly formed RFC2047 header from throwing
+		 * this function into an infinite loop.
+		 */
+		++passes;
+		if (passes > 20) return;
+	}
+
+}
+#else
+void utf8ify_rfc822_string(char *a) { (void)a; }	/* built without iconv: the string is left untouched */
+
+#endif
+
+
+
 struct trynamebuf {
 	char buffer1[SIZ];
 	char buffer2[SIZ];
@@ -758,6 +998,7 @@ char *harvest_collected_addresses(struct CtdlMessage *msg) {
 		if (msg->cm_fields[field] != NULL) {
 			for (j=0; j<num_tokens(msg->cm_fields[field], ','); ++j) {
 				extract_token(addr, msg->cm_fields[field], j, ',', sizeof addr);
+				utf8ify_rfc822_string(addr);
 				process_rfc822_addr(addr, user, node, name);
 				h = CtdlHostAlias(node);
 				if ( (h != hostalias_localhost) && (h != hostalias_directory) ) {
-- 
2.30.2