fixing html2ascii
author Wilfried Goesgens <dothebart@citadel.org>
Mon, 21 Feb 2011 22:13:33 +0000 (23:13 +0100)
committer Wilfried Goesgens <dothebart@citadel.org>
Mon, 21 Feb 2011 22:51:01 +0000 (23:51 +0100)
  - add possibility to scan 4 digit entities
  - don't scan over the end
  - add tests

libcitadel/lib/html_to_ascii.c
libcitadel/tests/stringbuf_conversion.c
libcitadel/tests/testdata/html/entitystrings.txt [new file with mode: 0644]

index b80272aecd0d7f8517801062fbcf8a1ac3b80cab..9a898cb70c5ed98853f16771305e3232e917f91b 100644 (file)
@@ -470,8 +470,12 @@ char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int do_ci
                        }
 
                        /* two-digit decimal equivalents */
-                       else if ((!strncmp(&outbuf[i], "&#", 2))
-                             && (outbuf[i+4] == ';') ) {
+                       else if (outbuf[i] == '&'       &&
+                                outbuf[i + 1] == '#'   &&
+                                isdigit(outbuf[i + 2]) && 
+                                isdigit(outbuf[i + 3]) &&
+                                (outbuf[i+4] == ';') ) 
+                       {
                                scanch = 0;
                                sscanf(&outbuf[i+2], "%02d", &scanch);
                                outbuf[i] = scanch;
@@ -479,14 +483,34 @@ char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int do_ci
                        }
 
                        /* three-digit decimal equivalents */
-                       else if ((!strncmp(&outbuf[i], "&#", 2))
-                             && (outbuf[i+5] == ';') ) {
+                       else if (outbuf[i] == '&'       &&
+                                outbuf[i + 1] == '#'   &&
+                                isdigit(outbuf[i + 2]) && 
+                                isdigit(outbuf[i + 3]) && 
+                                isdigit(outbuf[i + 4]) &&
+                                (outbuf[i + 5] == ';') ) 
+                       {
                                scanch = 0;
                                sscanf(&outbuf[i+2], "%03d", &scanch);
                                outbuf[i] = scanch;
                                strcpy(&outbuf[i+1], &outbuf[i+6]);
                        }
 
+                       /* four-digit decimal equivalents */
+                       else if (outbuf[i] == '&'       &&
+                                outbuf[i + 1] == '#'   &&
+                                isdigit(outbuf[i + 2]) && 
+                                isdigit(outbuf[i + 3]) && 
+                                isdigit(outbuf[i + 4]) &&
+                                isdigit(outbuf[i + 5]) &&
+                                (outbuf[i + 6] == ';') ) 
+                       {
+                               scanch = 0;
+                               sscanf(&outbuf[i+2], "%04d", &scanch);
+                               outbuf[i] = scanch;
+                               strcpy(&outbuf[i+1], &outbuf[i+6]);
+                       }
+
                }
 
                /* Make sure the output buffer is big enough */
index e2841624219376af3479fca21e4a607ff2beb6fc..a554c82618f5858fe2285e819c66d4c8fe4911d3 100644 (file)
@@ -30,6 +30,7 @@
 
 int fromstdin = 0;
 int parse_email = 0;
+int parse_html = 0;
 static void TestRevalidateStrBuf(StrBuf *Buf)
 {
        CU_ASSERT(strlen(ChrPtr(Buf)) == StrLength(Buf));
@@ -202,6 +203,29 @@ static void TestEncodeEmailSTDIN(void)
 }
 
 
+static void TestHTML2ASCII_line(void)
+{
+       int fdin = 0;// STDIN
+       const char *Err;
+       StrBuf *Source;
+       char *Target;
+
+       Source = NewStrBuf();
+
+       while (fdin == 0) {
+               
+               StrBufTCP_read_line(Source, &fdin, 0, &Err);
+               printf("the source:>%s<\n", ChrPtr(Source));
+               Target = html_to_ascii(ChrPtr(Source), StrLength(Source), 80, 0);
+               
+               printf("the target:>%s<\n", Target);
+               FlushStrBuf(Source);
+               free(Target);
+       }
+
+       FreeStrBuf(&Source);
+}
+
 
 static void AddStrBufSimlpeTests(void)
 {
@@ -209,22 +233,25 @@ static void AddStrBufSimlpeTests(void)
        CU_pTest pTest = NULL;
 
        pGroup = CU_add_suite("TestStringBufConversions", NULL, NULL);
-       if (!parse_email) {
+       if (parse_email) {
                if (!fromstdin) {
-                       pTest = CU_add_test(pGroup, "testRFC822Decode", TestRFC822Decode);
-                       pTest = CU_add_test(pGroup, "testRFC822Decode1", TestRFC822Decode);
-                       pTest = CU_add_test(pGroup, "testRFC822Decode2", TestRFC822Decode);
-                       pTest = CU_add_test(pGroup, "testRFC822Decode3", TestRFC822Decode);
+                       pTest = CU_add_test(pGroup, "TestParseEmailSTDIN", TestEncodeEmail);
                }
                else
-                       pTest = CU_add_test(pGroup, "testRFC822DecodeSTDIN", TestRFC822DecodeStdin);
+                       pTest = CU_add_test(pGroup, "TestParseEmailSTDIN", TestEncodeEmailSTDIN);
+       }
+       else if (parse_html) {
+                       pTest = CU_add_test(pGroup, "TestParseHTMLSTDIN", TestHTML2ASCII_line);
        }
        else {
                if (!fromstdin) {
-                       pTest = CU_add_test(pGroup, "TestParseEmailSTDIN", TestEncodeEmail);
+                       pTest = CU_add_test(pGroup, "testRFC822Decode", TestRFC822Decode);
+                       pTest = CU_add_test(pGroup, "testRFC822Decode1", TestRFC822Decode);
+                       pTest = CU_add_test(pGroup, "testRFC822Decode2", TestRFC822Decode);
+                       pTest = CU_add_test(pGroup, "testRFC822Decode3", TestRFC822Decode);
                }
                else
-                       pTest = CU_add_test(pGroup, "TestParseEmailSTDIN", TestEncodeEmailSTDIN);
+                       pTest = CU_add_test(pGroup, "testRFC822DecodeSTDIN", TestRFC822DecodeStdin);
        }
 
 }
@@ -234,8 +261,11 @@ int main(int argc, char* argv[])
 {
        int a;
 
-       while ((a = getopt(argc, argv, "@i")) != EOF)
+       while ((a = getopt(argc, argv, "@ih")) != EOF)
                switch (a) {
+               case 'h':
+                       parse_html = 1;
+                       break;
                case '@':
                        parse_email = 1;
                        break;
diff --git a/libcitadel/tests/testdata/html/entitystrings.txt b/libcitadel/tests/testdata/html/entitystrings.txt
new file mode 100644 (file)
index 0000000..f34ebbd
--- /dev/null
@@ -0,0 +1,2 @@
+TRASH Messenger Bags &#8211; Hip Pack.
+Abandon &#8216;Share The Road&#8217;