]> code.citadel.org Git - citadel.git/blobdiff - libcitadel/lib/html_to_ascii.c
work on sixel support
[citadel.git] / libcitadel / lib / html_to_ascii.c
index 3c1012f00bf601a9aa9ff0efa85d8a4679891a8d..47e0f0c8b7866dc7881b2cfd5fa48446eb4639fb 100644 (file)
@@ -1,8 +1,7 @@
-// Functions which handle translation between HTML and plain text
-// Copyright (c) 2000-2022 by the citadel.org team
+// This is an HTML to plain text converter.
+// Copyright (c) 2000-2024 by the citadel.org team (Art Cancro et al.)
 //
-// This program is open source software.  Use, duplication, or disclosure
-// is subject to the terms of the GNU General Public License, version 3.
+// This program is open source software.  Use, duplication, or disclosure is subject to the GNU General Public License version 3.
 
 #include <stdlib.h>
 #include <unistd.h>
 #include <sys/stat.h>
 #include <errno.h>
 #include <limits.h>
+#include <time.h>
+#include "libcitadel.h"
 
-#if TIME_WITH_SYS_TIME
-# include <sys/time.h>
-# include <time.h>
-#else
-# if HAVE_SYS_TIME_H
-#  include <sys/time.h>
-# else
-#  include <time.h>
-# endif
-#endif
// Encode a single Unicode code point as UTF-8.
//
// dest = output buffer; the caller must provide room for up to 4 bytes.  No NUL terminator is written.
// ch   = the code point to encode
//
// Returns the number of bytes stored in dest (1 to 4), or 0 if ch cannot be represented in
// well-formed UTF-8: a UTF-16 surrogate (U+D800 through U+DFFF) or a value above U+10FFFF.
// Nothing is written to dest when 0 is returned.
int u8_wc_toutf8(char *dest, u_int32_t ch) {
        if (ch < 0x80) {                        // 7 bits: plain ASCII, one byte
                dest[0] = (char)ch;
                return 1;
        }
        if (ch < 0x800) {                       // up to 11 bits: 110xxxxx 10xxxxxx
                dest[0] = (char)((ch >> 6) | 0xC0);
                dest[1] = (char)((ch & 0x3F) | 0x80);
                return 2;
        }
        if (ch >= 0xD800 && ch <= 0xDFFF) {     // surrogates are not characters; encoding them yields invalid UTF-8
                return 0;
        }
        if (ch < 0x10000) {                     // up to 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
                dest[0] = (char)((ch >> 12) | 0xE0);
                dest[1] = (char)(((ch >> 6) & 0x3F) | 0x80);
                dest[2] = (char)((ch & 0x3F) | 0x80);
                return 3;
        }
        if (ch < 0x110000) {                    // up to 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                dest[0] = (char)((ch >> 18) | 0xF0);
                dest[1] = (char)(((ch >> 12) & 0x3F) | 0x80);
                dest[2] = (char)(((ch >> 6) & 0x3F) | 0x80);
                dest[3] = (char)((ch & 0x3F) | 0x80);
                return 4;
        }
        return 0;                               // beyond the Unicode range
}
+
+
+// Try to embed an image in the display stream.
+// out                 = the StrBuf to which we are writing the display stream
+// url                 = the URL of the image (warning: it might be a data: URL)
+// display_protocol    = currently only H2A_SIXEL is supported
+void h2a_embed_image(StrBuf *out, char *url, int display_protocol) {
+
+       char buf[4096];
+       snprintf(buf, sizeof(buf), "curl -s '%s' | img2sixel -", url);
+
+       FILE *cmd = popen(buf, "r");
+       if (!cmd) {
+               return;
+       }
+
+       size_t bytes;
+       while (bytes = fread(buf, 1, sizeof(buf), cmd), bytes>0) {
+               StrBufAppendBufPlain(out, buf, bytes, 0);
+       }
+       pclose(cmd);
+}
 
-#include "libcitadel.h"
 
 // Convert HTML to plain text.
 //
-// inputmsg    = pointer to raw HTML message
-// msglen      = stop reading after this many bytes
-// screenwidth = desired output screenwidth
-// ansi                = if nonzero, assume output is to a terminal that supports ANSI escape codes
+// inputmsg     = pointer to raw HTML message
+// msglen       = stop reading after this many bytes
+// screenwidth  = desired output screenwidth
+// flags        = Flags that can be set:
+//              H2A_ANSI       = Output ANSI terminal escape sequences
+//              H2A_SIXEL      = Output Sixel graphics for IMG tags (experimental)
 //
-char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int ansi) {
-       char inbuf[SIZ];
+char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, unsigned int flags) {
+       char *inbuf = NULL;
        int inbuf_len = 0;
-       char outbuf[SIZ];
        char tag[1024];
-       int done_reading = 0;
-       const char *inptr;
+       char *tag_start = NULL;
+       char *tag_end = NULL;
+       StrBuf *out;
        char *outptr;
-       size_t outptr_buffer_size;
-       size_t output_len = 0;
-       int i, j, ch, did_out, rb, scanch;
-       int nest = 0;                           // Bracket nesting level
+       int j;
+       char ch;
+       int tag_nesting_level = 0;              // angle bracket nesting level
        int blockquote = 0;                     // BLOCKQUOTE nesting level
        int styletag = 0;                       // STYLE tag nesting level
-       int styletag_start = 0;
-       int bytes_processed = 0;
-       char nl[128];
+       char nl[128];                           // The current value of what a "newline" looks like (changes during blockquotes)
+
+       int ansi = (flags & H2A_ANSI) ? 1 : 0;          // Output to a terminal that can accept ANSI escape sequences
+       int sixel = (flags & H2A_SIXEL) ? 1 : 0;        // Output to a terminal that can accept Sixel graphics
+
+       out = NewStrBuf();
+       if (!out) {
+               return(NULL);
+       }
 
        tag[0] = '\0';
        strcpy(nl, "\n");
-       inptr = inputmsg;
-       strcpy(inbuf, "");
-       strcpy(outbuf, "");
-       if (msglen == 0) msglen = strlen(inputmsg);
-
-       outptr_buffer_size = strlen(inptr) + SIZ;
-       outptr = malloc(outptr_buffer_size);
-       if (outptr == NULL) return NULL;
-       strcpy(outptr, "");
-       output_len = 0;
-
-       do {
-               // Fill the input buffer
-               inbuf_len = strlen(inbuf);
-               if ( (done_reading == 0) && (inbuf_len < (SIZ-128)) ) {
-
-                       ch = *inptr++;
-                       if (ch != 0) {
-                               inbuf[inbuf_len++] = ch;
-                               inbuf[inbuf_len] = 0;
-                       } 
-                       else {
-                               done_reading = 1;
-                       }
+       if (msglen == 0) {
+               msglen = strlen(inputmsg);
+       }
 
-                       ++bytes_processed;
-                       if (bytes_processed > msglen) {
-                               done_reading = 1;
-                       }
+       inbuf = strdup(inputmsg);
+       if (!inbuf) {
+               return NULL;
+       }
+
+       // "inbuf" ingests the unparsed HTML while we work with it.
+       inbuf_len = strlen(inbuf);
+       if (inbuf_len > msglen) {
+               inbuf[msglen] = 0;
+               inbuf_len = msglen;
+       }
 
+       // Do some parsing
+       if (!IsEmptyStr(inbuf)) {
+
+               // Convert newlines, carriage returns, and tabs to spaces
+               char *sp;
+               while ( (sp = strchr(inbuf, '\r'))
+                       || (sp = strchr(inbuf, '\n'))
+                       || (sp = strchr(inbuf, '\t'))
+               ) {
+                       *sp = ' ';
                }
 
-               // Do some parsing
-               if (!IsEmptyStr(inbuf)) {
-
-                   // Fold in all the spacing
-                   for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
-                       if (inbuf[i]==10) inbuf[i]=32;
-                       if (inbuf[i]==13) inbuf[i]=32;
-                       if (inbuf[i]==9) inbuf[i]=32;
-                   }
-                   for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
-                       while ((inbuf[i]==32)&&(inbuf[i+1]==32)) {
-                               strcpy(&inbuf[i], &inbuf[i+1]);
-                       }
-                   }
+               // Convert multiple spaces to a single space.
+               while (sp = strstr(inbuf, "  "), sp!=NULL) {
+                       strcpy(sp, sp+1);
+               }
 
-                   for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
+               // Run through the markup performing the conversion.
+               char *inptr = inbuf;
+               int linelen = 0;
+               while (ch = inptr[0], ch != 0) {
 
-                       ch = inbuf[i];
+                       // Keep track of how many angle brackets were found in case someone is sloppy with them
+                       // or tries to nest tags.  If tag_nesting_level is 0 then we are within text; if it is
+                       // nonzero then we are within a tag.
 
-                       if (ch == '<') {
-                               ++nest;
+                       if (ch == '<') {                // We have hit the beginning of a tag.
+                               ++tag_nesting_level;
+                               tag_start = inptr + 1;
                                strcpy(tag, "");
                        }
 
-                       else if (ch == '>') {   // We have a tag.
-                               if (nest > 0) --nest;
-
-                               // Unqualify the tag (truncate at first space)
-                               if (strchr(tag, ' ') != NULL) {
-                                       strcpy(strchr(tag, ' '), "");
-                               }
-                               
-                               if (!strcasecmp(tag, "P")) {
-                                       strcat(outbuf, nl);
-                                       strcat(outbuf, nl);
+                       else if (ch == '>') {           // We have hit the end of a tag.
+                               if (tag_nesting_level > 0) {
+                                       --tag_nesting_level;
                                }
+                               if (tag_nesting_level == 0) {
+                                       tag_end = inptr;
 
-                               if (!strcasecmp(tag, "/DIV")) {
-                                       strcat(outbuf, nl);
-                                       strcat(outbuf, nl);
-                               }
-
-                               if (!strcasecmp(tag, "LI")) {
-                                       strcat(outbuf, nl);
-                                       strcat(outbuf, " * ");
-                               }
-
-                               else if (!strcasecmp(tag, "/UL")) {
-                                       strcat(outbuf, nl);
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "H1")) {
-                                       strcat(outbuf, nl);
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "H2")) {
-                                       strcat(outbuf, nl);
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "H3")) {
-                                       strcat(outbuf, nl);
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "H4")) {
-                                       strcat(outbuf, nl);
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "/H1")) {
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "/H2")) {
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "/H3")) {
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "/H4")) {
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "HR")) {
-                                       strcat(outbuf, nl);
-                                       strcat(outbuf, " ");
-                                       for (j=0; j<screenwidth-2; ++j)
-                                               strcat(outbuf, "-");
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (
-                                       (!strcasecmp(tag, "B"))
-                                       || (!strcasecmp(tag, "STRONG"))
-                               ) {
-                                       if (ansi) {
-                                               strcat(outbuf, "\033[1m");
+                                       size_t tag_len = tag_end - tag_start;
+                                       if (tag_len >= sizeof(tag)) {
+                                               tag_len = sizeof(tag);
                                        }
-                               }
-                               else if (
-                                       (!strcasecmp(tag, "/B"))
-                                       || (!strcasecmp(tag, "/STRONG"))
-                               ) {
-                                       if (ansi) {
-                                               strcat(outbuf, "\033[22m");
-                                       }
-                               }
+                                       strncpy(tag, tag_start, tag_len);
+                                       tag[tag_len] = 0;
 
-                               else if (
-                                       (!strcasecmp(tag, "I"))
-                                       || (!strcasecmp(tag, "EM"))
-                               ) {
-                                       if (ansi) {
-                                               strcat(outbuf, "\033[3m");
+                                       // Unqualify the tag (truncate at first space)
+                                       char *tagsp = strchr(tag, ' ');
+                                       if (tagsp) {
+                                               *tagsp = 0;
                                        }
-                               }
 
-                               else if (
-                                       (!strcasecmp(tag, "/I"))
-                                       || (!strcasecmp(tag, "/EM"))
-                               ) {
-                                       if (ansi) {
-                                               strcat(outbuf, "\033[23m");
+                                       // IMG tag on sixel terminals -- try to display the image
+                                       if ( (!strcasecmp(tag, "img")) && sixel) {
+                                               char *q1, *q2;
+
+                                               // look for src attribute
+                                               char *src = bmstrcasestr(tag_start, "src=");
+                                               q1 = q2 = NULL;
+                                               if (src && src<tag_end) {
+                                                       if (q1 = strchr(src, '"'), q1 && q1<tag_end) {          // in double quotes
+                                                               ++q1;
+                                                               q2 = strchr(q1, '"');
+                                                       }
+                                                       else if (q1 = strchr(src, '\''), q1 && q1<tag_end) {    // in single quotes
+                                                               ++q1;
+                                                               q2 = strchr(q1, '\'');
+                                                       }
+                                                       if (q1 && q1<q2 && q2<tag_end) {
+                                                               char url[SIZ];
+                                                               memcpy(url, q1, q2-q1);
+                                                               url[q2-q1] = 0;
+                                                               h2a_embed_image(out, url, H2A_SIXEL);           // try to display
+                                                               linelen = 0;
+                                                       }
+                                               }
                                        }
-                               }
 
-                               else if (!strcasecmp(tag, "U")) {
-                                       if (ansi) {
-                                               strcat(outbuf, "\033[4m");
+                                       // IMG tag on non-sixel terminals -- we can display the alt text
+                                       if ( (!strcasecmp(tag, "img")) && !sixel) {
+                                               char *q1, *q2;
+
+                                               // look for alt text
+                                               char *alt = bmstrcasestr(tag_start, "alt=");
+                                               q1 = q2 = NULL;
+                                               if (alt && alt<tag_end) {
+                                                       if (q1 = strchr(alt, '"'), q1 && q1<tag_end) {          // in double quotes
+                                                               ++q1;
+                                                               q2 = strchr(q1, '"');
+                                                       }
+                                                       else if (q1 = strchr(alt, '\''), q1 && q1<tag_end) {    // in single quotes
+                                                               ++q1;
+                                                               q2 = strchr(q1, '\'');
+                                                       }
+                                                       if (q1 && q1<q2 && q2<tag_end) {
+                                                               StrBufAppendBufPlain(out, q1, (long)(q2-q1), 0);
+                                                       }
+                                               }
                                        }
-                               }
 
-                               else if (!strcasecmp(tag, "/U")) {
-                                       if (ansi) {
-                                               strcat(outbuf, "\033[24m");
+                                       if (!strcasecmp(tag, "P")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
                                        }
-                               }
 
-                               else if (!strcasecmp(tag, "BR")) {
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "TR")) {
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "/TABLE")) {
-                                       strcat(outbuf, nl);
-                               }
-
-                               else if (!strcasecmp(tag, "BLOCKQUOTE")) {
-                                       ++blockquote;
-                                       strcpy(nl, "\n");
-                                       if ( (blockquote == 1) && (ansi) ) {
-                                               strcat(nl, "\033[2m\033[3m");
+                                       if (!strcasecmp(tag, "/DIV")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
                                        }
-                                       for (j=0; j<blockquote; ++j) strcat(nl, ">");
-                                       strcat(outbuf, nl);
-                               }
 
-                               else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
-                                       strcat(outbuf, "\n");
-                                       --blockquote;
-                                       if ( (blockquote == 0) && (ansi) ) {
-                                               strcat(outbuf, "\033[22m\033[23m");
+                                       if (!strcasecmp(tag, "LI")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               StrBufAppendBufPlain(out, HKEY(" * "), 0);
+                                               linelen = 3;
                                        }
-                                       strcpy(nl, "\n");
-                                       for (j=0; j<blockquote; ++j) strcat(nl, ">");
-                                       strcat(outbuf, nl);
-                               }
 
-                               else if (!strcasecmp(tag, "STYLE")) {
-                                       ++styletag;
-                                       if (styletag == 1) {
-                                               styletag_start = strlen(outbuf);
+                                       else if (!strcasecmp(tag, "/UL")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
                                        }
-                               }
 
-                               else if (!strcasecmp(tag, "/STYLE")) {
-                                       --styletag;
-                                       if (styletag == 0) {
-                                               outbuf[styletag_start] = 0;
+                                       else if (!strcasecmp(tag, "H1")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
                                        }
-                               }
-
-                       }
-
-                       else if ((nest > 0) && (strlen(tag)<(sizeof(tag)-1))) {
-                               tag[strlen(tag)+1] = 0;
-                               tag[strlen(tag)] = ch;
-                       }
-                               
-                       else if ((!nest) && (styletag == 0)) {
-                               outbuf[strlen(outbuf)+1] = 0;
-                               outbuf[strlen(outbuf)] = ch;
-                       }
-                   }
-                   strcpy(inbuf, &inbuf[i]);
-               }
-
-               // Convert &; tags to the forbidden characters
-               if (!IsEmptyStr(outbuf)) for (i=0; !IsEmptyStr(&outbuf[i]); ++i) {
-
-                       // Character entity references
-                       if (!strncasecmp(&outbuf[i], "&nbsp;", 6)) {
-                               outbuf[i] = ' ';
-                               strcpy(&outbuf[i+1], &outbuf[i+6]);
-                       }
-
-                       if (!strncasecmp(&outbuf[i], "&ensp;", 6)) {
-                               outbuf[i] = ' ';
-                               strcpy(&outbuf[i+1], &outbuf[i+6]);
-                       }
-
-                       if (!strncasecmp(&outbuf[i], "&emsp;", 6)) {
-                               outbuf[i] = ' ';
-                               strcpy(&outbuf[i+1], &outbuf[i+6]);
-                       }
 
-                       if (!strncasecmp(&outbuf[i], "&thinsp;", 8)) {
-                               outbuf[i] = ' ';
-                               strcpy(&outbuf[i+1], &outbuf[i+8]);
-                       }
-
-                       else if (!strncasecmp(&outbuf[i], "&lt;", 4)) {
-                               outbuf[i] = '<';
-                               strcpy(&outbuf[i+1], &outbuf[i+4]);
-                       }
-
-                       else if (!strncasecmp(&outbuf[i], "&gt;", 4)) {
-                               outbuf[i] = '>';
-                               strcpy(&outbuf[i+1], &outbuf[i+4]);
-                       }
-
-                       else if (!strncasecmp(&outbuf[i], "&amp;", 5)) {
-                               strcpy(&outbuf[i+1], &outbuf[i+5]);
-                       }
-
-                       else if (!strncasecmp(&outbuf[i], "&quot;", 6)) {
-                               outbuf[i] = '\"';
-                               strcpy(&outbuf[i+1], &outbuf[i+6]);
-                       }
-
-                       else if (!strncasecmp(&outbuf[i], "&lsquo;", 7)) {
-                               outbuf[i] = '`';
-                               strcpy(&outbuf[i+1], &outbuf[i+7]);
-                       }
-
-                       else if (!strncasecmp(&outbuf[i], "&rsquo;", 7)) {
-                               outbuf[i] = '\'';
-                               strcpy(&outbuf[i+1], &outbuf[i+7]);
-                       }
-
-                       else if (!strncasecmp(&outbuf[i], "&copy;", 6)) {
-                               outbuf[i] = '(';
-                               outbuf[i+1] = 'c';
-                               outbuf[i+2] = ')';
-                               strcpy(&outbuf[i+3], &outbuf[i+6]);
-                       }
-
-                       else if (!strncasecmp(&outbuf[i], "&bull;", 6)) {
-                               outbuf[i] = ' ';
-                               outbuf[i+1] = '*';
-                               outbuf[i+2] = ' ';
-                               strcpy(&outbuf[i+3], &outbuf[i+6]);
-                       }
-
-                       else if (!strncasecmp(&outbuf[i], "&hellip;", 8)) {
-                               outbuf[i] = '.';
-                               outbuf[i+1] = '.';
-                               outbuf[i+2] = '.';
-                               strcpy(&outbuf[i+3], &outbuf[i+8]);
-                       }
+                                       else if (!strcasecmp(tag, "H2")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&trade;", 7)) {
-                               outbuf[i] = '(';
-                               outbuf[i+1] = 't';
-                               outbuf[i+2] = 'm';
-                               outbuf[i+3] = ')';
-                               strcpy(&outbuf[i+4], &outbuf[i+7]);
-                       }
+                                       else if (!strcasecmp(tag, "H3")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&reg;", 5)) {
-                               outbuf[i] = '(';
-                               outbuf[i+1] = 'r';
-                               outbuf[i+2] = ')';
-                               strcpy(&outbuf[i+3], &outbuf[i+5]);
-                       }
+                                       else if (!strcasecmp(tag, "H4")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&frac14;", 8)) {
-                               outbuf[i] = '1';
-                               outbuf[i+1] = '/';
-                               outbuf[i+2] = '4';
-                               strcpy(&outbuf[i+3], &outbuf[i+8]);
-                       }
+                                       else if (!strcasecmp(tag, "/H1")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&frac12;", 8)) {
-                               outbuf[i] = '1';
-                               outbuf[i+1] = '/';
-                               outbuf[i+2] = '2';
-                               strcpy(&outbuf[i+3], &outbuf[i+8]);
-                       }
+                                       else if (!strcasecmp(tag, "/H2")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&frac34;", 8)) {
-                               outbuf[i] = '3';
-                               outbuf[i+1] = '/';
-                               outbuf[i+2] = '4';
-                               strcpy(&outbuf[i+3], &outbuf[i+8]);
-                       }
+                                       else if (!strcasecmp(tag, "/H3")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&ndash;", 7)) {
-                               outbuf[i] = '-';
-                               outbuf[i+1] = '-';
-                               strcpy(&outbuf[i+2], &outbuf[i+7]);
-                       }
+                                       else if (!strcasecmp(tag, "/H4")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&mdash;", 7)) {
-                               outbuf[i] = '-';
-                               outbuf[i+1] = '-';
-                               outbuf[i+2] = '-';
-                               strcpy(&outbuf[i+3], &outbuf[i+7]);
-                       }
+                                       else if (!strcasecmp(tag, "HR")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               StrBufAppendBufPlain(out, HKEY(" "), 0);
+                                               for (j = 0; j < screenwidth - 2; ++j) {
+                                                       StrBufAppendBufPlain(out, HKEY("-"), 0);
+                                               }
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncmp(&outbuf[i], "&Ccedil;", 8)) {
-                               outbuf[i] = 'C';
-                               strcpy(&outbuf[i+1], &outbuf[i+8]);
-                       }
+                                       else if (       (!strcasecmp(tag, "B"))
+                                                       || (!strcasecmp(tag, "STRONG"))
+                                       ) {
+                                               if (ansi) {
+                                                       StrBufAppendBufPlain(out, HKEY("\033[1m"), 0);
+                                               }
+                                       }
+                                       else if (       (!strcasecmp(tag, "/B"))
+                                                       || (!strcasecmp(tag, "/STRONG"))
+                                       ) {
+                                               if (ansi) {
+                                                       StrBufAppendBufPlain(out, HKEY("\033[22m"), 0);
+                                               }
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&ccedil;", 8)) {
-                               outbuf[i] = 'c';
-                               strcpy(&outbuf[i+1], &outbuf[i+8]);
-                       }
+                                       else if (       (!strcasecmp(tag, "I"))
+                                                       || (!strcasecmp(tag, "EM"))
+                                       ) {
+                                               if (ansi) {
+                                                       StrBufAppendBufPlain(out, HKEY("\033[3m"), 0);
+                                               }
+                                       }
 
-                       else if (!strncmp(&outbuf[i], "&Egrave;", 8)) {
-                               outbuf[i] = 'E';
-                               strcpy(&outbuf[i+1], &outbuf[i+8]);
-                       }
+                                       else if (       (!strcasecmp(tag, "/I"))
+                                                       || (!strcasecmp(tag, "/EM"))
+                                       ) {
+                                               if (ansi) {
+                                                       StrBufAppendBufPlain(out, HKEY("\033[23m"), 0);
+                                               }
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&egrave;", 8)) {
-                               outbuf[i] = 'e';
-                               strcpy(&outbuf[i+1], &outbuf[i+8]);
-                       }
+                                       else if (!strcasecmp(tag, "U")) {
+                                               if (ansi) {
+                                                       StrBufAppendBufPlain(out, HKEY("\033[4m"), 0);
+                                               }
+                                       }
 
-                       else if (!strncmp(&outbuf[i], "&Ecirc;", 7)) {
-                               outbuf[i] = 'E';
-                               strcpy(&outbuf[i+1], &outbuf[i+7]);
-                       }
+                                       else if (!strcasecmp(tag, "/U")) {
+                                               if (ansi) {
+                                                       StrBufAppendBufPlain(out, HKEY("\033[24m"), 0);
+                                               }
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&ecirc;", 7)) {
-                               outbuf[i] = 'e';
-                               strcpy(&outbuf[i+1], &outbuf[i+7]);
-                       }
+                                       else if (!strcasecmp(tag, "BR")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncmp(&outbuf[i], "&Eacute;", 8)) {
-                               outbuf[i] = 'E';
-                               strcpy(&outbuf[i+1], &outbuf[i+8]);
-                       }
+                                       else if (!strcasecmp(tag, "TR")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&eacute;", 8)) {
-                               outbuf[i] = 'e';
-                               strcpy(&outbuf[i+1], &outbuf[i+8]);
-                       }
+                                       else if (!strcasecmp(tag, "/TABLE")) {
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncmp(&outbuf[i], "&Agrave;", 8)) {
-                               outbuf[i] = 'A';
-                               strcpy(&outbuf[i+1], &outbuf[i+8]);
-                       }
+                                       else if (!strcasecmp(tag, "BLOCKQUOTE")) {
+                                               ++blockquote;
+                                               strcpy(nl, "\n");
+                                               if ((blockquote == 1) && (ansi)) {
+                                                       strcat(nl, "\033[2m\033[2m");
+                                               }
+                                               for (j = 0; j < blockquote; ++j) {
+                                                       strcat(nl, ">");
+                                               }
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&agrave;", 8)) {
-                               outbuf[i] = 'a';
-                               strcpy(&outbuf[i+1], &outbuf[i+8]);
-                       }
+                                       else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
+                                               StrBufAppendBufPlain(out, HKEY("\n"), 0);
+                                               --blockquote;
+                                               if ((blockquote == 0) && (ansi)) {
+                                                       StrBufAppendBufPlain(out, HKEY("\033[22m\033[22m"), 0);
+                                               }
+                                               strcpy(nl, "\n");
+                                               for (j = 0; j < blockquote; ++j) {
+                                                       strcat(nl, ">");
+                                               }
+                                               StrBufAppendBufPlain(out, nl, -1, 0);
+                                               linelen = 0;
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&ldquo;", 7)) {
-                               outbuf[i] = '\"';
-                               strcpy(&outbuf[i+1], &outbuf[i+7]);
-                       }
+                                       else if (!strcasecmp(tag, "STYLE")) {
+                                               ++styletag;
+                                       }
 
-                       else if (!strncasecmp(&outbuf[i], "&rdquo;", 7)) {
-                               outbuf[i] = '\"';
-                               strcpy(&outbuf[i+1], &outbuf[i+7]);
-                       }
+                                       else if (!strcasecmp(tag, "/STYLE")) {
+                                               --styletag;
+                                       }
+                               }
 
-                       else if (!strncasecmp(&outbuf[i], "&acute;", 7)) {
-                               outbuf[i] = '\'';
-                               strcpy(&outbuf[i+1], &outbuf[i+7]);
                        }
 
-                       else if (!strncasecmp(&outbuf[i], "&#8217;", 7)) {
-                               outbuf[i] = '\'';
-                               strcpy(&outbuf[i+1], &outbuf[i+7]);
+                       // copy non-tag text to the output buffer
+                       else if ((!tag_nesting_level) && (styletag == 0)) {
+                               StrBufAppendBufPlain(out, &ch, 1, 0);
+                               ++linelen;
                        }
 
-                       else if (!strncasecmp(&outbuf[i], "&#8211;", 7)) {
-                               outbuf[i] = '-';
-                               strcpy(&outbuf[i+1], &outbuf[i+7]);
-                       }
+                       // Handle numeric entities
+                       if (ch == ';') {
 
-                       // two-digit decimal equivalents
-                       else if (outbuf[i] == '&'       &&
-                                outbuf[i + 1] == '#'   &&
-                                isdigit(outbuf[i + 2]) && 
-                                isdigit(outbuf[i + 3]) &&
-                                (outbuf[i+4] == ';') ) 
-                       {
-                               scanch = 0;
-                               sscanf(&outbuf[i+2], "%02d", &scanch);
-                               outbuf[i] = scanch;
-                               strcpy(&outbuf[i+1], &outbuf[i+5]);
-                       }
+                               u_int32_t scanch = 0;
+                               int elen = 0;
 
-                       // three-digit decimal equivalents
-                       else if (outbuf[i] == '&'       &&
-                                outbuf[i + 1] == '#'   &&
-                                isdigit(outbuf[i + 2]) && 
-                                isdigit(outbuf[i + 3]) && 
-                                isdigit(outbuf[i + 4]) &&
-                                (outbuf[i + 5] == ';') ) 
-                       {
-                               scanch = 0;
-                               sscanf(&outbuf[i+2], "%03d", &scanch);
-                               outbuf[i] = scanch;
-                               strcpy(&outbuf[i+1], &outbuf[i+6]);
-                       }
+                               if ( (linelen >= 5) && (*(inptr-4) == '&') && (*(inptr-3) == '#') ) {
+                                       sscanf(inptr-2, "%02d", &scanch);
+                                       elen = 5;
+                               }
+                               else if ( (linelen >= 6) && (*(inptr-5) == '&') && (*(inptr-4) == '#') ) {
+                                       sscanf(inptr-3, "%03d", &scanch);
+                                       elen = 6;
+                               }
+                               else if ( (linelen >= 7) && (*(inptr-6) == '&') && (*(inptr-5) == '#') ) {
+                                       sscanf(inptr-3, "%04d", &scanch);
+                                       elen = 7;
+                               }
+                               else if ( (linelen >= 8) && (*(inptr-7) == '&') && (*(inptr-6) == '#') ) {
+                                       sscanf(inptr-4, "%05d", &scanch);
+                                       elen = 8;
+                               }
 
-                       // four-digit decimal equivalents
-                       else if (outbuf[i] == '&'       &&
-                                outbuf[i + 1] == '#'   &&
-                                isdigit(outbuf[i + 2]) && 
-                                isdigit(outbuf[i + 3]) && 
-                                isdigit(outbuf[i + 4]) &&
-                                isdigit(outbuf[i + 5]) &&
-                                (outbuf[i + 6] == ';') ) 
-                       {
-                               scanch = 0;
-                               sscanf(&outbuf[i+2], "%04d", &scanch);
-                               outbuf[i] = scanch;
-                               strcpy(&outbuf[i+1], &outbuf[i+7]);
-                       }
+                               if (scanch) {
+                                       StrBufCutRight(out, elen);
+                                       linelen -= elen;
 
-               }
+                                       char utf[5];
+                                       int ulen = u8_wc_toutf8(utf, scanch);
+                                       utf[ulen] = 0;
+                                       StrBufAppendBufPlain(out, utf, ulen, 0);
+                                       linelen += elen;
+                               }
 
-               // Make sure the output buffer is big enough
-               if ((output_len + strlen(outbuf) + SIZ) > outptr_buffer_size) {
-                       outptr_buffer_size += SIZ;
-                       outptr = realloc(outptr, outptr_buffer_size);
-                       if (outptr == NULL) {
-                               abort();
                        }
-               }
-
-               // Output any lines terminated with hard line breaks
-               do {
-                       did_out = 0;
-                       if (strlen(outbuf) > 0) {
-                           for (i = 0; i<strlen(outbuf); ++i) {
-                               if ( (i<(screenwidth-2)) && (outbuf[i]=='\n')) {
-
-                                       strncpy(&outptr[output_len], outbuf, i+1);
-                                       output_len += (i+1);
 
-                                       strcpy(outbuf, &outbuf[i+1]);
-                                       i = 0;
-                                       did_out = 1;
+                       // Add soft line breaks when necessary
+                       if (linelen > (screenwidth - 8)) {
+                               char *ptr = (char *)ChrPtr(out) + StrLength(out) - linelen;
+                               char *rightmost_space = strrchr(ptr, ' ');
+                               if (rightmost_space && rightmost_space > ptr) {
+                                       int space_pos = rightmost_space - ChrPtr(out);
+                                       StrBufReplaceToken(out, (long)space_pos, 1, nl, strlen(nl));
+                                       linelen = strlen(rightmost_space) - 1;
                                }
                        }
-                   }
-               } while (did_out);
-
-               // Add soft line breaks
-               if (strlen(outbuf) > (screenwidth - 2 )) {
-                       rb = (-1);
-                       for (i=0; i<(screenwidth-2); ++i) {
-                               if (outbuf[i]==32) rb = i;
-                       }
-                       if (rb>=0) {
-                               strncpy(&outptr[output_len], outbuf, rb);
-                               output_len += rb;
-                               strcpy(&outptr[output_len], nl);
-                               output_len += strlen(nl);
-                               strcpy(outbuf, &outbuf[rb+1]);
-                       }
-                       else {
-                               strncpy(&outptr[output_len], outbuf, screenwidth-2);
-                               output_len += (screenwidth-2);
-                               strcpy(&outptr[output_len], nl);
-                               output_len += strlen(nl);
-                               strcpy(outbuf, &outbuf[screenwidth-2]);
-                       }
-               }
-
-       } while (done_reading == 0);
 
-       strcpy(&outptr[output_len], outbuf);
-       output_len += strlen(outbuf);
-
-       // Strip leading/trailing whitespace.
+                       // Advance to the next byte of input.
+                       inptr++;
+               }
+       }
+       free(inbuf);
+
+       // Convert entity tags to printable characters
+       StrBufReplaceAllOccurrences(out, "&nbsp;", " ");
+       StrBufReplaceAllOccurrences(out, "&ensp;", " ");
+       StrBufReplaceAllOccurrences(out, "&emsp;", " ");
+       StrBufReplaceAllOccurrences(out, "&thinsp;", " ");
+       StrBufReplaceAllOccurrences(out, "&lt;", "<");
+       StrBufReplaceAllOccurrences(out, "&gt;", ">");
+       StrBufReplaceAllOccurrences(out, "&amp;", "&");
+       StrBufReplaceAllOccurrences(out, "&quot;", "\"");
+       StrBufReplaceAllOccurrences(out, "&lsquo;", "`");
+       StrBufReplaceAllOccurrences(out, "&rsquo;", "'");
+       StrBufReplaceAllOccurrences(out, "&bull;", " * ");
+       StrBufReplaceAllOccurrences(out, "&hellip;", "…");
+       StrBufReplaceAllOccurrences(out, "&copy;", "©");
+       StrBufReplaceAllOccurrences(out, "&trade;", "™");
+       StrBufReplaceAllOccurrences(out, "&reg;", "®");
+       StrBufReplaceAllOccurrences(out, "&frac14;", "¼");
+       StrBufReplaceAllOccurrences(out, "&frac12;", "½");
+       StrBufReplaceAllOccurrences(out, "&frac34;", "¾");
+       StrBufReplaceAllOccurrences(out, "&ndash;", "–");
+       StrBufReplaceAllOccurrences(out, "&mdash;", "—");
+       StrBufReplaceAllOccurrences(out, "&Ccedil;", "Ç");
+       StrBufReplaceAllOccurrences(out, "&ccedil;", "ç");
+       StrBufReplaceAllOccurrences(out, "&Egrave;", "È");
+       StrBufReplaceAllOccurrences(out, "&egrave;", "è");
+       StrBufReplaceAllOccurrences(out, "&Ecirc;", "Ê");
+       StrBufReplaceAllOccurrences(out, "&ecirc;", "ê");
+       StrBufReplaceAllOccurrences(out, "&Eacute;", "É");
+       StrBufReplaceAllOccurrences(out, "&eacute;", "é");
+       StrBufReplaceAllOccurrences(out, "&Agrave;", "À");
+       StrBufReplaceAllOccurrences(out, "&agrave;", "à");
+       StrBufReplaceAllOccurrences(out, "&ldquo;", "\"");
+       StrBufReplaceAllOccurrences(out, "&rdquo;", "\"");
+       StrBufReplaceAllOccurrences(out, "&acute;", "'");
+       StrBufReplaceAllOccurrences(out, "&#8217;", "'");
+       StrBufReplaceAllOccurrences(out, "&#8211;", "-");
+
+       // Convert from a StrBuf to a plain C string
+       int output_len = StrLength(out);
+       outptr = SmashStrBuf(&out);
+
+       // Strip leading whitespace
        while ((output_len > 0) && (isspace(outptr[0]))) {
                strcpy(outptr, &outptr[1]);
                --output_len;
        }
-       while ((output_len > 0) && (isspace(outptr[output_len-1]))) {
-               outptr[output_len-1] = 0;
+
+       // Strip trailing whitespace
+       while ((output_len > 0) && (isspace(outptr[output_len - 1]))) {
+               outptr[output_len - 1] = 0;
                --output_len;
        }
 
        // Make sure the final line ends with a newline character.
-       if ((output_len > 0) && (outptr[output_len-1] != '\n')) {
+       if ((output_len > 0) && (outptr[output_len - 1] != '\n')) {
                strcat(outptr, "\n");
                ++output_len;
        }
 
        return outptr;
-
 }