Suppress output of inline HTML styles when converting to plain text
[citadel.git] / libcitadel / lib / html_to_ascii.c
index 86cbdedeed4574c0fb0f8e67de99354e06644f5e..45beb2cbb079b97eeaa2041df905e8676ce3145a 100644 (file)
@@ -1,22 +1,20 @@
 /*
- * $Id: html.c 6014 2008-02-04 18:38:35Z ajc $
- *
  * Functions which handle translation between HTML and plain text
- * Copyright (c) 2000-2010 by the citadel.org team
+ * Copyright (c) 2000-2018 by the citadel.org team
  *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 3 of the License, or
- *  (at your option) any later version.
+ * This program is open source software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
  *
- *  This program is distributed in the hope that it will be useful,
- *  but WITHOUT ANY WARRANTY; without even the implied warranty of
- *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- *  GNU General Public License for more details.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
  *
- *  You should have received a copy of the GNU General Public License
- *  along with this program; if not, write to the Free Software
- *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
 #include <stdlib.h>
  *
  * inputmsg      = pointer to raw HTML message
  * screenwidth   = desired output screenwidth
- * do_citaformat = set to 1 to indent newlines with spaces
  */
-char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
+char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth) {
        char inbuf[SIZ];
        int inbuf_len = 0;
        char outbuf[SIZ];
        char tag[1024];
        int done_reading = 0;
-       char *inptr;
+       const char *inptr;
        char *outptr;
        size_t outptr_buffer_size;
        size_t output_len = 0;
@@ -69,6 +66,7 @@ char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaform
        int bytes_processed = 0;
        char nl[128];
 
+       tag[0] = '\0';
        strcpy(nl, "\n");
        inptr = inputmsg;
        strcpy(inbuf, "");
@@ -199,6 +197,9 @@ char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaform
                                        strcat(outbuf, nl);
                                }
 
+#if 0
+       These seemed like a good idea at the time, but they just make a mess.
+
                                else if (
                                        (!strcasecmp(tag, "B"))
                                        || (!strcasecmp(tag, "/B"))
@@ -206,7 +207,6 @@ char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaform
                                        || (!strcasecmp(tag, "/STRONG"))
                                ) {
                                        strcat(outbuf, "*");
-                                       
                                }
 
                                else if (
@@ -216,7 +216,6 @@ char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaform
                                        || (!strcasecmp(tag, "/EM"))
                                ) {
                                        strcat(outbuf, "/");
-                                       
                                }
 
                                else if (
@@ -224,8 +223,8 @@ char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaform
                                        || (!strcasecmp(tag, "/U"))
                                ) {
                                        strcat(outbuf, "_");
-                                       
                                }
+#endif
 
                                else if (!strcasecmp(tag, "BR")) {
                                        strcat(outbuf, nl);
@@ -275,7 +274,7 @@ char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaform
                                tag[strlen(tag)] = ch;
                        }
                                
-                       else if (!nest) {
+                       else if ((!nest) && (styletag == 0)) {
                                outbuf[strlen(outbuf)+1] = 0;
                                outbuf[strlen(outbuf)] = ch;
                        }
@@ -471,9 +470,23 @@ char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaform
                                strcpy(&outbuf[i+1], &outbuf[i+7]);
                        }
 
+                       else if (!strncasecmp(&outbuf[i], "&#8217;", 7)) {
+                               outbuf[i] = '\'';
+                               strcpy(&outbuf[i+1], &outbuf[i+7]);
+                       }
+
+                       else if (!strncasecmp(&outbuf[i], "&#8211;", 7)) {
+                               outbuf[i] = '-';
+                               strcpy(&outbuf[i+1], &outbuf[i+7]);
+                       }
+
                        /* two-digit decimal equivalents */
-                       else if ((!strncmp(&outbuf[i], "&#", 2))
-                             && (outbuf[i+4] == ';') ) {
+                       else if (outbuf[i] == '&'       &&
+                                outbuf[i + 1] == '#'   &&
+                                isdigit(outbuf[i + 2]) && 
+                                isdigit(outbuf[i + 3]) &&
+                                (outbuf[i+4] == ';') ) 
+                       {
                                scanch = 0;
                                sscanf(&outbuf[i+2], "%02d", &scanch);
                                outbuf[i] = scanch;
@@ -481,14 +494,34 @@ char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaform
                        }
 
                        /* three-digit decimal equivalents */
-                       else if ((!strncmp(&outbuf[i], "&#", 2))
-                             && (outbuf[i+5] == ';') ) {
+                       else if (outbuf[i] == '&'       &&
+                                outbuf[i + 1] == '#'   &&
+                                isdigit(outbuf[i + 2]) && 
+                                isdigit(outbuf[i + 3]) && 
+                                isdigit(outbuf[i + 4]) &&
+                                (outbuf[i + 5] == ';') ) 
+                       {
                                scanch = 0;
                                sscanf(&outbuf[i+2], "%03d", &scanch);
                                outbuf[i] = scanch;
                                strcpy(&outbuf[i+1], &outbuf[i+6]);
                        }
 
+                       /* four-digit decimal equivalents */
+                       else if (outbuf[i] == '&'       &&
+                                outbuf[i + 1] == '#'   &&
+                                isdigit(outbuf[i + 2]) && 
+                                isdigit(outbuf[i + 3]) && 
+                                isdigit(outbuf[i + 4]) &&
+                                isdigit(outbuf[i + 5]) &&
+                                (outbuf[i + 6] == ';') ) 
+                       {
+                               scanch = 0;
+                               sscanf(&outbuf[i+2], "%04d", &scanch);
+                               outbuf[i] = scanch;
+                               strcpy(&outbuf[i+1], &outbuf[i+7]);
+                       }
+
                }
 
                /* Make sure the output buffer is big enough */
@@ -510,11 +543,6 @@ char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaform
                                        strncpy(&outptr[output_len], outbuf, i+1);
                                        output_len += (i+1);
 
-                                       if (do_citaformat) {
-                                               strcpy(&outptr[output_len], " ");
-                                               ++output_len;
-                                       }
-
                                        strcpy(outbuf, &outbuf[i+1]);
                                        i = 0;
                                        did_out = 1;
@@ -534,10 +562,6 @@ char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaform
                                output_len += rb;
                                strcpy(&outptr[output_len], nl);
                                output_len += strlen(nl);
-                               if (do_citaformat) {
-                                       strcpy(&outptr[output_len], " ");
-                                       ++output_len;
-                               }
                                strcpy(outbuf, &outbuf[rb+1]);
                        } else {
                                strncpy(&outptr[output_len], outbuf,
@@ -545,10 +569,6 @@ char *html_to_ascii(char *inputmsg, int msglen, int screenwidth, int do_citaform
                                output_len += (screenwidth-2);
                                strcpy(&outptr[output_len], nl);
                                output_len += strlen(nl);
-                               if (do_citaformat) {
-                                       strcpy(&outptr[output_len], " ");
-                                       ++output_len;
-                               }
                                strcpy(outbuf, &outbuf[screenwidth-2]);
                        }
                }