* use iconv in citserver too
author Wilfried Göesgens <willi@citadel.org>
Sat, 24 May 2008 17:18:52 +0000 (17:18 +0000)
committer Wilfried Göesgens <willi@citadel.org>
Sat, 24 May 2008 17:18:52 +0000 (17:18 +0000)
* DE-QP and IConv translate harvested mail addresses

citadel/configure.ac
citadel/internet_addressing.c

index cd32b4496a2ae7ed921d5431471d5daa1a767c58..78929b3a696814b17331a2de8518474fa4f1d81b 100644 (file)
@@ -129,6 +129,65 @@ AC_ARG_WITH(docdir,
 )
 
 
+dnl Character set conversion (iconv) support.
+dnl --disable-iconv switches the feature off without probing.  Otherwise
+dnl iconv is looked for in libc first, then in an external libiconv.
+AC_ARG_ENABLE(iconv,
+       [  --disable-iconv         do not use iconv charset conversion],
+       [ok_iconv=$enableval], [ok_iconv=yes])
+
+if test "$ok_iconv" != no; then
+       dnl Here is the check for a libc integrated iconv.
+       dnl The probe fails when the conversion descriptor cannot be opened.
+       AC_MSG_CHECKING(to see if your system supports iconv)
+       AC_TRY_RUN([
+               #include <iconv.h>
+               int main(void) {
+                       iconv_t ic = iconv_open("UTF-8", "us-ascii");
+                       if (ic == (iconv_t)(-1)) return 1;
+                       iconv_close(ic);
+                       return 0;
+               }
+        ],
+               [
+                 ok_iconv=yes
+                 AC_MSG_RESULT([yes])
+               ],
+               [
+                 ok_iconv=no
+                 AC_MSG_RESULT([no])
+               ],
+               [
+                 ok_iconv=no
+                 AC_MSG_RESULT([no (cross compiling)])
+               ]
+       )
+
+       dnl Check for iconv in external libiconv
+       if test "$ok_iconv" = no; then
+               AC_MSG_CHECKING(for an external libiconv)
+               OLD_LDFLAGS="$LDFLAGS"
+               LDFLAGS="$LDFLAGS -liconv"
+               AC_TRY_RUN([
+                               #include <iconv.h>
+                               int main(void) {
+                                       iconv_t ic = iconv_open("UTF-8", "us-ascii");
+                                       if (ic == (iconv_t)(-1)) return 1;
+                                       iconv_close(ic);
+                                       return 0;
+                               }
+                       ],
+                               [
+                                 ok_iconv=yes
+                                 AC_MSG_RESULT([yes])
+                               ],
+                               [
+                                 ok_iconv=no
+                                 LDFLAGS="$OLD_LDFLAGS"
+                                 AC_MSG_RESULT([no])
+                               ],
+                               [
+                                 ok_iconv=no
+                                 LDFLAGS="$OLD_LDFLAGS"
+                                 AC_MSG_RESULT([no (cross compiling)])
+                               ]
+                       )
+       fi
+fi
+if test "$ok_iconv" != "no"; then
+       AC_MSG_RESULT(Citadel will be built with character set conversion.)
+       AC_DEFINE(HAVE_ICONV,[],[whether we have iconv for charset conversion])
+else
+       AC_MSG_RESULT(Citadel will be built without character set conversion.)
+fi
+
+dnl Link against libintl when it provides libintl_bindtextdomain.
+AC_CHECK_LIB(intl, libintl_bindtextdomain, [LDFLAGS="$LDFLAGS -lintl"])
+
+
 
 AC_ARG_ENABLE(threaded-client, [  --disable-threaded-client
                          disable multithreaded client])
@@ -908,6 +967,7 @@ fi
 echo ------------------------------------------------------------------------
 echo 'zlib compression:                ' $ok_zlib
 echo 'LDAP support:                    ' $ok_ldap
+echo 'Character set conversion support:' $ok_iconv
 echo 'DSpam Scanning support:          ' $ok_libdspam
 echo 
 echo 'Note: if you are not using Linux, make sure you are using GNU make'
index 3b7e0cd9039f38784d5b378c066853ec64e72964..53924ba809785c049db3cbe91dd5a4e961da2dee 100644 (file)
 #endif
 
 
+#ifdef HAVE_ICONV
+#include <iconv.h>
+
+/*
+ * Open an iconv conversion descriptor, tolerating Microsoft-style
+ * charset names.  If iconv_open() rejects the pair and fromcode is a
+ * five-character name starting with "MS" (in any case), such as 'MS950',
+ * the open is retried with those two characters replaced by "CP" ('CP950').
+ *
+ * tocode      Target encoding
+ * fromcode    Source encoding
+ *
+ * Returns the descriptor from iconv_open(), or (iconv_t)(-1) when no
+ * attempt succeeded.
+ */
+iconv_t ctdl_iconv_open(const char *tocode, const char *fromcode)
+{
+       char cp_name[64];
+       iconv_t cd;
+
+       cd = iconv_open(tocode, fromcode);
+       if (cd != (iconv_t)(-1) ) {
+               return(cd);
+       }
+
+       /* Only five-character names beginning with "MS" get a second try. */
+       if ( (strlen(fromcode) != 5) || (strncasecmp(fromcode, "MS", 2) != 0) ) {
+               return(cd);
+       }
+
+       safestrncpy(cp_name, fromcode, sizeof cp_name);
+       cp_name[0] = 'C';
+       cp_name[1] = 'P';
+       return(iconv_open(tocode, cp_name));
+}
+
+
+
+/*
+ * Locate the "?=" that closes the RFC2047 encoded word beginning at bptr.
+ *
+ * bptr        Points at the "=?" that introduces an encoded word
+ *
+ * The first two characters are skipped and the next '?' is expected to
+ * introduce the encoding marker ("?B?" or "?Q?", either case).  When it
+ * does, the search for "?=" starts after the marker, so a payload that
+ * begins with '=' is not mistaken for the terminator.  Otherwise the
+ * first "?=" found anywhere from bptr on is returned.
+ *
+ * Returns a pointer to the closing "?=", or NULL when bptr is NULL, is
+ * shorter than two characters, or no terminator is found.
+ *
+ * Declared static: a plain 'inline' definition emits no external symbol
+ * under C99 and later, which can fail at link time.
+ */
+static inline char *FindNextEnd (char *bptr)
+{
+       char * end;
+
+       /* bptr + 2 is only inside the string if two characters exist. */
+       if ((bptr == NULL) || (bptr[0] == '\0') || (bptr[1] == '\0'))
+               return NULL;
+
+       /* Find the next ?Q? */
+       end = strchr(bptr + 2, '?');
+       if (end == NULL) return NULL;
+       if (((end[1] == 'B') || (end[1] == 'b') ||
+            (end[1] == 'Q') || (end[1] == 'q')) &&
+           (end[2] == '?')) {
+               /* skip on to the end of the cluster, the next ?= */
+               end = strstr(end + 3, "?=");
+       }
+       else
+               /* sort of half valid encoding, try to find an end. */
+               end = strstr(bptr, "?=");
+       return end;
+}
+
+/*
+ * Handle subjects with RFC2047 encoding such as:
+ * =?koi8-r?B?78bP0s3Mxc7JxSDXz9rE1dvO2c3JINvB0sHNySDP?=
+ *
+ * buf is rewritten in place in three steps:
+ *  1. If it holds any byte outside printable ASCII (32..126), the whole
+ *     string is converted from the default header charset (iso-8859-1)
+ *     to UTF-8.
+ *  2. Gaps consisting only of whitespace between two adjacent encoded
+ *     words are removed.
+ *  3. Each encoded word is decoded ("B" = base64, "Q" = quoted-printable,
+ *     anything else is taken verbatim) and converted to UTF-8.  A word
+ *     whose charset cannot be opened is replaced by "(unreadable)".
+ *
+ * Step 3 stops after 21 passes, and the function returns early if a
+ * work buffer cannot be allocated.
+ *
+ * buf         NUL-terminated header string, modified in place
+ *
+ * NOTE(review): the rewritten text can be longer than the input and this
+ * function is not told how large buf is -- callers presumably must leave
+ * headroom; confirm at the call sites.
+ */
+void utf8ify_rfc822_string(char *buf) {
+       char *start, *end, *next, *nextend, *ptr;
+       char newbuf[1024];
+       char charset[128];
+       char encoding[16];
+       char istr[1024];
+       iconv_t ic = (iconv_t)(-1) ;
+       char *ibuf;                     /**< Buffer of characters to be converted */
+       char *obuf;                     /**< Buffer for converted characters */
+       size_t ibuflen;                 /**< Length of input buffer */
+       size_t obuflen;                 /**< Length of output buffer */
+       char *isav;                     /**< Saved pointer to input buffer */
+       char *osav;                     /**< Saved pointer to output buffer */
+       int passes = 0;
+       int i, len, delta;
+       int illegal_non_rfc2047_encoding = 0;
+
+       /* Sometimes, badly formed messages contain strings which were simply
+        *  written out directly in some foreign character set instead of
+        *  using RFC2047 encoding.  This is illegal but we will attempt to
+        *  handle it anyway by converting from a user-specified default
+        *  charset to UTF-8 if we see any nonprintable characters.
+        */
+       len = strlen(buf);
+       for (i=0; i<len; ++i) {
+               if ((buf[i] < 32) || (buf[i] > 126)) {
+                       illegal_non_rfc2047_encoding = 1;
+                       break;          /* one offending byte is enough */
+               }
+       }
+       if (illegal_non_rfc2047_encoding) {
+               const char *default_header_charset = "iso-8859-1";
+               if ( (strcasecmp(default_header_charset, "UTF-8")) && (strcasecmp(default_header_charset, "us-ascii")) ) {
+                       ic = ctdl_iconv_open("UTF-8", default_header_charset);
+                       if (ic != (iconv_t)(-1) ) {
+                               obuflen = 1024;
+                               isav = malloc(1024);
+                               /* one spare byte so the terminator fits even
+                                * when iconv fills the whole output buffer */
+                               osav = malloc(obuflen + 1);
+                               if ((isav != NULL) && (osav != NULL)) {
+                                       ibuf = isav;
+                                       obuf = osav;
+                                       safestrncpy(ibuf, buf, 1024);
+                                       ibuflen = strlen(ibuf);
+                                       iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
+                                       osav[1024-obuflen] = 0;
+                                       strcpy(buf, osav);
+                               }
+                               free(osav);
+                               iconv_close(ic);
+                               free(isav);
+                       }
+               }
+       }
+
+       /* pre evaluate the first pair */
+       nextend = end = NULL;
+       len = strlen(buf);
+       start = strstr(buf, "=?");
+       if (start != NULL) 
+               end = FindNextEnd (start);
+
+       while ((start != NULL) && (end != NULL))
+       {
+               next = strstr(end, "=?");
+               nextend = (next != NULL) ? FindNextEnd(next) : NULL;
+               if (nextend == NULL)
+                       next = NULL;
+
+               /* did we find two partitions */
+               if ((next != NULL) && 
+                   ((next - end) > 2))
+               {
+                       ptr = end + 2;
+                       while ((ptr < next) && isspace((unsigned char)*ptr))
+                               ptr ++;
+                       /* did we find a gap just filled with blanks? */
+                       if (ptr == next)
+                       {
+                               /* Move the tail, terminator included, over
+                                * the gap.  The tail length is measured from
+                                * the start of buf, not from the current
+                                * word, so nothing past the string is read.
+                                */
+                               memmove (end + 2,
+                                        next,
+                                        len - (next - buf) + 1);
+
+                               /* now terminate the gap at the end */
+                               delta = (next - end) - 2;
+                               len -= delta;
+                               buf[len] = '\0';
+
+                               /* move next to its new location. */
+                               next -= delta;
+                               nextend -= delta;
+                       }
+               }
+               /* our next-pair is our new first pair now. */
+               start = next;
+               end = nextend;
+       }
+
+       /* Now we handle foreign character sets properly encoded
+        * in RFC2047 format.
+        */
+       for (;;)
+       {
+               const char *replacement = "(unreadable)";
+
+               start = strstr(buf, "=?");
+               if (start == NULL) break;
+               end = FindNextEnd(start);
+               if ((end == NULL) || (end <= start)) break;
+
+               extract_token(charset, start, 1, '?', sizeof charset);
+               extract_token(encoding, start, 2, '?', sizeof encoding);
+               extract_token(istr, start, 3, '?', sizeof istr);
+
+               ibuf = malloc(1024);
+               if (ibuf == NULL) return;
+               isav = ibuf;
+               if (!strcasecmp(encoding, "B")) {       /**< base64 */
+                       ibuflen = CtdlDecodeBase64(ibuf, istr, strlen(istr));
+               }
+               else if (!strcasecmp(encoding, "Q")) {  /**< quoted-printable */
+                       size_t qplen = strlen(istr);
+                       size_t pos;
+
+                       /* in a Q-encoded word '_' stands for a space */
+                       for (pos = 0; pos < qplen; pos++) {
+                               if (istr[pos] == '_') istr[pos] = ' ';
+                       }
+
+                       ibuflen = CtdlDecodeQuotedPrintable(ibuf, istr, qplen);
+               }
+               else {
+                       strcpy(ibuf, istr);             /**< unknown encoding */
+                       ibuflen = strlen(istr);
+               }
+
+               osav = NULL;
+               ic = ctdl_iconv_open("UTF-8", charset);
+               if (ic != (iconv_t)(-1) ) {
+                       obuflen = 1024;
+                       /* one spare byte for the terminator, see above */
+                       obuf = malloc(obuflen + 1);
+                       if (obuf == NULL) {
+                               iconv_close(ic);
+                               free(isav);
+                               return;
+                       }
+                       osav = obuf;
+                       iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
+                       osav[1024-obuflen] = 0;
+                       iconv_close(ic);
+                       replacement = osav;
+               }
+
+               /* Cut buf at the start of the encoded word, then strip the
+                * word's four '?'-delimited pieces from the remainder.
+                */
+               end = start + 1;
+               *start = '\0';
+               remove_token(end, 0, '?');
+               remove_token(end, 0, '?');
+               remove_token(end, 0, '?');
+               remove_token(end, 0, '?');
+               /* Drop the one character left in front of the tail (the '='
+                * of the closing "?=").  memmove, because the regions overlap.
+                */
+               if (*end != '\0')
+                       memmove(end, end + 1, strlen(end + 1) + 1);
+
+               snprintf(newbuf, sizeof newbuf, "%s%s%s", buf, replacement, end);
+               strcpy(buf, newbuf);
+
+               free(osav);
+               free(isav);
+
+               /*
+                * Since spammers will go to all sorts of absurd lengths to get their
+                * messages through, there are LOTS of corrupt headers out there.
+                * So, prevent a really badly formed RFC2047 header from throwing
+                * this function into an infinite loop.
+                */
+               ++passes;
+               if (passes > 20) return;
+       }
+
+}
+#else
+/* Built without iconv: the header string is left untouched. */
+void utf8ify_rfc822_string(char *a) { (void)a; }
+
+#endif
+
+
+
 struct trynamebuf {
        char buffer1[SIZ];
        char buffer2[SIZ];
@@ -758,6 +998,7 @@ char *harvest_collected_addresses(struct CtdlMessage *msg) {
                if (msg->cm_fields[field] != NULL) {
                        for (j=0; j<num_tokens(msg->cm_fields[field], ','); ++j) {
                                extract_token(addr, msg->cm_fields[field], j, ',', sizeof addr);
+                               utf8ify_rfc822_string(addr);
                                process_rfc822_addr(addr, user, node, name);
                                h = CtdlHostAlias(node);
                                if ( (h != hostalias_localhost) && (h != hostalias_directory) ) {