From a2401aea9ac970fa35337c5665f0b725bcdb4ccc Mon Sep 17 00:00:00 2001
From: =?utf8?q?Wilfried=20G=C3=B6esgens?=
Date: Sat, 24 May 2008 17:18:52 +0000
Subject: [PATCH] * use iconv in citserver too * DE-QP and IConv translate harvested mail addresses

---
 citadel/configure.ac          |  65 +++++++++
 citadel/internet_addressing.c | 241 ++++++++++++++++++++++++++++++++++
 2 files changed, 306 insertions(+)

diff --git a/citadel/configure.ac b/citadel/configure.ac
index cd32b4496..78929b3a6 100644
--- a/citadel/configure.ac
+++ b/citadel/configure.ac
@@ -129,6 +129,70 @@ AC_ARG_WITH(docdir,
 )
 
 
+dnl Here is the check for a libc integrated iconv
+dnl enableval is "no" for --disable-iconv; only then are the checks skipped.
+AC_ARG_ENABLE(iconv,
+	[  --disable-iconv         do not use iconv charset conversion],
+	want_iconv="$enableval", want_iconv=yes)
+
+ok_iconv=no
+if test "$want_iconv" != no; then
+AC_MSG_CHECKING(Checking to see if your system supports iconv)
+AC_TRY_RUN([
+	#include <iconv.h>
+	int main(void) {
+		iconv_t ic = (iconv_t)(-1) ;
+		ic = iconv_open("UTF-8", "us-ascii");
+		iconv_close(ic);
+		return 0;
+	}
+ ],
+	[
+		ok_iconv=yes
+		AC_MSG_RESULT([yes])
+	],
+	[
+		ok_iconv=no
+		AC_MSG_RESULT([no])
+	]
+)
+fi
+
+dnl Check for iconv in external libiconv
+if test "$want_iconv" != no && test "$ok_iconv" = no; then
+	AC_MSG_CHECKING(Checking for an external libiconv)
+	OLD_LDFLAGS="$LDFLAGS"
+	LDFLAGS="$LDFLAGS -liconv"
+	AC_TRY_RUN([
+		#include <iconv.h>
+		int main(void) {
+			iconv_t ic = (iconv_t)(-1) ;
+			ic = iconv_open("UTF-8", "us-ascii");
+			iconv_close(ic);
+			return 0;
+		}
+	],
+	[
+		ok_iconv=yes
+		AC_MSG_RESULT([yes])
+	],
+	[
+		ok_iconv=no
+		LDFLAGS="$OLD_LDFLAGS"
+		AC_MSG_RESULT([no])
+	]
+	)
+fi
+if test "$ok_iconv" != "no"; then
+	AC_MSG_RESULT(Citadel will be built with character set conversion.)
+	AC_DEFINE(HAVE_ICONV,[],[whether we have iconv for charset conversion])
+else
+	AC_MSG_RESULT(Citadel will be built without character set conversion.)
+fi
+
+AC_CHECK_LIB(intl, libintl_bindtextdomain, [LDFLAGS="$LDFLAGS -lintl"])
+
+
 
 AC_ARG_ENABLE(threaded-client,
 	[ --disable-threaded-client disable multithreaded client])
@@ -908,6 +972,7 @@ fi
 echo ------------------------------------------------------------------------
 echo 'zlib compression: ' $ok_zlib
 echo 'LDAP support: ' $ok_ldap
+echo 'Character set conversion support:' $ok_iconv
 echo 'DSpam Scanning support: ' $ok_libdspam
 echo
 echo 'Note: if you are not using Linux, make sure you are using GNU make'
diff --git a/citadel/internet_addressing.c b/citadel/internet_addressing.c
index 3b7e0cd90..53924ba80 100644
--- a/citadel/internet_addressing.c
+++ b/citadel/internet_addressing.c
@@ -50,6 +50,246 @@
 #endif
 
 
+#ifdef HAVE_ICONV
+#include <iconv.h>
+
+/*
+ * Wrapper around iconv_open()
+ * Our version adds aliases for non-standard Microsoft charsets
+ * such as 'MS950', aliasing them to names like 'CP950'
+ *
+ * tocode	Target encoding
+ * fromcode	Source encoding
+ */
+iconv_t ctdl_iconv_open(const char *tocode, const char *fromcode)
+{
+	iconv_t ic = (iconv_t)(-1) ;
+	ic = iconv_open(tocode, fromcode);
+	if (ic == (iconv_t)(-1) ) {
+		char alias_fromcode[64];
+		if ( (strlen(fromcode) == 5) && (!strncasecmp(fromcode, "MS", 2)) ) {
+			safestrncpy(alias_fromcode, fromcode, sizeof alias_fromcode);
+			alias_fromcode[0] = 'C';
+			alias_fromcode[1] = 'P';
+			ic = iconv_open(tocode, alias_fromcode);
+		}
+	}
+	return(ic);
+}
+
+/* Given bptr pointing at the "=?" that opens an RFC2047 encoded-word, return a
+ * pointer to the "?=" that closes it, or NULL when no terminator is found. */
+static inline char *FindNextEnd (char *bptr)
+{
+	char * end;
+	/* Find the '?' that ends the charset name; bptr + 2 skips the leading "=?" */
+	end = strchr(bptr + 2, '?');
+	if (end == NULL) return NULL;
+	if ((*(end + 1) != '\0') && (strchr("BbQq", *(end + 1)) != NULL) &&
+	    (*(end + 2) == '?')) {
+		/* skip on to the end of the cluster, the next ?= */
+		end = strstr(end + 3, "?=");
+	}
+	else
+		/* sort of half valid encoding, try to find an end. */
+		end = strstr(bptr, "?=");
+	return end;
+}
+
+/*
+ * Convert a header string to UTF-8 in place, decoding RFC2047 words such as:
+ * =?koi8-r?B?78bP0s3Mxc7JxSDXz9rE1dvO2c3JINvB0sHNySDP?=
+ */
+void utf8ify_rfc822_string(char *buf) {
+	char *start, *end, *next, *nextend, *ptr;
+	char newbuf[1024];
+	char charset[128];
+	char encoding[16];
+	char istr[1024];
+	iconv_t ic = (iconv_t)(-1) ;
+	char *ibuf;			/**< Buffer of characters to be converted */
+	char *obuf;			/**< Buffer for converted characters */
+	size_t ibuflen;			/**< Length of input buffer */
+	size_t obuflen;			/**< Length of output buffer */
+	char *isav;			/**< Saved pointer to input buffer */
+	char *osav;			/**< Saved pointer to output buffer */
+	int passes = 0;
+	int i, len, delta;
+	int illegal_non_rfc2047_encoding = 0;
+
+	/* Sometimes, badly formed messages contain strings which were simply
+	 * written out directly in some foreign character set instead of
+	 * using RFC2047 encoding.  This is illegal but we will attempt to
+	 * handle it anyway by converting from a default charset (currently
+	 * hard-coded to iso-8859-1) to UTF-8 if we see any nonprintable characters.
+	 */
+	len = strlen(buf);
+	for (i=0; i<len; ++i) {
+		if ((buf[i] < 32) || (buf[i] > 126)) {
+			illegal_non_rfc2047_encoding = 1;
+			i = len; ///< take a shortcut, it won't be more than one.
+		}
+	}
+	if (illegal_non_rfc2047_encoding) {
+		const char *default_header_charset = "iso-8859-1";
+		if ( (strcasecmp(default_header_charset, "UTF-8")) && (strcasecmp(default_header_charset, "us-ascii")) ) {
+			ic = ctdl_iconv_open("UTF-8", default_header_charset);
+			if (ic != (iconv_t)(-1) ) {
+				isav = ibuf = malloc(1024);
+				obuflen = 1024;
+				osav = obuf = malloc(obuflen + 1);	/* +1: the NUL fits even if all 1024 bytes get used */
+				if ((isav != NULL) && (osav != NULL)) {
+					safestrncpy(ibuf, buf, 1024);
+					ibuflen = strlen(ibuf);
+					iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
+					osav[1024-obuflen] = 0;
+					strcpy(buf, osav);	/* NOTE(review): result may be longer than the input; capacity of buf is unknown here */
+				}
+				free(osav);
+				iconv_close(ic);
+				free(isav);
+			}
+		}
+	}
+
+	/* Drop whitespace between adjacent encoded-words; pre evaluate the first pair */
+	nextend = end = NULL;
+	len = strlen(buf);
+	start = strstr(buf, "=?");
+	if (start != NULL)
+		end = FindNextEnd (start);
+
+	while ((start != NULL) && (end != NULL))
+	{
+		next = strstr(end, "=?");
+		if (next != NULL)
+			nextend = FindNextEnd(next);
+		if (nextend == NULL)
+			next = NULL;
+
+		/* did we find two partitions */
+		if ((next != NULL) &&
+		    ((next - end) > 2))
+		{
+			ptr = end + 2;
+			while ((ptr < next) &&
+			       (isspace((unsigned char) *ptr) ||
+				(*ptr == '\r') ||
+				(*ptr == '\n') ||
+				(*ptr == '\t')))
+				ptr ++;
+			/* did we find a gap just filled with blanks? */
+			if (ptr == next)
+			{
+				memmove (end + 2,
+					 next,
+					 len - (next - buf));	/* bytes from next to the end of the string */
+
+				/* now terminate the shortened string at its new end */
+				delta = (next - end) - 2;
+				len -= delta;
+				buf[len] = '\0';
+
+				/* move next to its new location. */
+				next -= delta;
+				nextend -= delta;
+			}
+		}
+		/* our next-pair is our new first pair now. */
+		start = next;
+		end = nextend;
+	}
+
+	/* Now we handle foreign character sets properly encoded
+	 * in RFC2047 format.
+	 */
+	while (start=strstr(buf, "=?"), end=((start != NULL)? FindNextEnd(start) : NULL),
+		((start != NULL) && (end != NULL) && (end > start)) )
+	{
+		extract_token(charset, start, 1, '?', sizeof charset);
+		extract_token(encoding, start, 2, '?', sizeof encoding);
+		extract_token(istr, start, 3, '?', sizeof istr);
+
+		isav = ibuf = malloc(1024);
+		if (isav == NULL) return;		/* out of memory: leave the rest undecoded */
+		if (!strcasecmp(encoding, "B")) {	/**< base64 */
+			ibuflen = CtdlDecodeBase64(ibuf, istr, strlen(istr));
+		}
+		else if (!strcasecmp(encoding, "Q")) {	/**< quoted-printable */
+			size_t qlen;
+			size_t pos;
+
+			qlen = strlen(istr);
+			pos = 0;
+			while (pos < qlen)
+			{
+				if (istr[pos] == '_') istr[pos] = ' ';	/* RFC2047 "Q": '_' stands for a space */
+				pos++;
+			}
+
+			ibuflen = CtdlDecodeQuotedPrintable(ibuf, istr, qlen);
+		}
+		else {
+			strcpy(ibuf, istr);		/**< unknown encoding */
+			ibuflen = strlen(istr);
+		}
+
+		ic = ctdl_iconv_open("UTF-8", charset);
+		if (ic != (iconv_t)(-1) ) {
+			obuflen = 1024;
+			osav = obuf = malloc(obuflen + 1);	/* +1 so the terminator below always fits */
+			if (osav == NULL) { iconv_close(ic); free(isav); return; }
+			iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
+			osav[1024-obuflen] = 0;
+
+			end = start;
+			end++;
+			strcpy(start, "");
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			if (*end != '\0') memmove(end, end + 1, strlen(end));	/* drop the '=' of "?="; regions overlap */
+
+			snprintf(newbuf, sizeof newbuf, "%s%s%s", buf, osav, end);
+			strcpy(buf, newbuf);	/* NOTE(review): capacity of buf is unknown here */
+			free(osav);
+			iconv_close(ic);
+		}
+		else {
+			end = start;
+			end++;
+			strcpy(start, "");
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			remove_token(end, 0, '?');
+			if (*end != '\0') memmove(end, end + 1, strlen(end));	/* drop the '=' of "?="; regions overlap */
+
+			snprintf(newbuf, sizeof newbuf, "%s(unreadable)%s", buf, end);
+			strcpy(buf, newbuf);	/* NOTE(review): capacity of buf is unknown here */
+		}
+
+		free(isav);
+
+		/*
+		 * Since spammers will go to all sorts of absurd lengths to get their
+		 * messages through, there are LOTS of corrupt headers out there.
+		 * So, prevent a really badly formed RFC2047 header from throwing
+		 * this function into an infinite loop.
+		 */
+		++passes;
+		if (passes > 20) return;
+	}
+
+}
+#else
+void utf8ify_rfc822_string(char *a) { (void) a; }	/* built without iconv: leave the string untouched */
+
+#endif
+
+
+
 struct trynamebuf {
 	char buffer1[SIZ];
 	char buffer2[SIZ];
@@ -758,6 +998,7 @@ char *harvest_collected_addresses(struct CtdlMessage *msg) {
 		if (msg->cm_fields[field] != NULL) {
 			for (j=0; j<num_tokens(msg->cm_fields[field], ','); ++j) {
 				extract_token(addr, msg->cm_fields[field], j, ',', sizeof addr);
+				utf8ify_rfc822_string(addr);
 				process_rfc822_addr(addr, user, node, name);
 				h = CtdlHostAlias(node);
 				if ( (h != hostalias_localhost) && (h != hostalias_directory) ) {
-- 
2.30.2