X-Git-Url: https://code.citadel.org/?a=blobdiff_plain;f=libcitadel%2Flib%2Fhtml_to_ascii.c;h=ca2de8df8d9e7b0eb756ca1a7186a16d5f571641;hb=HEAD;hp=faa343710c1c19556b67cabd9be578e65dc38f78;hpb=bca06b89514d8d91c1442735272ab10ea0e19f9a;p=citadel.git

diff --git a/libcitadel/lib/html_to_ascii.c b/libcitadel/lib/html_to_ascii.c
index faa343710..8f3c9eca0 100644
--- a/libcitadel/lib/html_to_ascii.c
+++ b/libcitadel/lib/html_to_ascii.c
@@ -1,21 +1,7 @@
-/*
- * Functions which handle translation between HTML and plain text
- * Copyright (c) 2000-2010 by the citadel.org team
- *
- * This program is open source software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
+// This is an HTML to plain text converter.
+// Copyright (c) 2000-2024 by the citadel.org team (Art Cancro et al.)
+//
+// This program is open source software.  Use, duplication, or disclosure is subject to the GNU General Public License version 3.
 
 #include <stdlib.h>
 #include <unistd.h>
@@ -27,576 +13,499 @@
 #include <sys/stat.h>
 #include <errno.h>
 #include <limits.h>
-
-#if TIME_WITH_SYS_TIME
-# include <sys/time.h>
-# include <time.h>
-#else
-# if HAVE_SYS_TIME_H
-#  include <sys/time.h>
-# else
-#  include <time.h>
-# endif
-#endif
-
+#include <time.h>
 #include "libcitadel.h"
- 
-
-/*
- * Convert HTML to plain text.
- *
- * inputmsg      = pointer to raw HTML message
- * screenwidth   = desired output screenwidth
- * do_citaformat = set to 1 to indent newlines with spaces
- */
-char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int do_citaformat) {
-	char inbuf[SIZ];
-	int inbuf_len = 0;
-	char outbuf[SIZ];
-	char tag[1024];
-	int done_reading = 0;
-	const char *inptr;
-	char *outptr;
-	size_t outptr_buffer_size;
-	size_t output_len = 0;
-	int i, j, ch, did_out, rb, scanch;
-	int nest = 0;		/* Bracket nesting level */
-	int blockquote = 0;	/* BLOCKQUOTE nesting level */
-	int styletag = 0;	/* STYLE tag nesting level */
-	int styletag_start = 0;
-	int bytes_processed = 0;
-	char nl[128];
-
-	strcpy(nl, "\n");
-	inptr = inputmsg;
-	strcpy(inbuf, "");
-	strcpy(outbuf, "");
-	if (msglen == 0) msglen = strlen(inputmsg);
-
-	outptr_buffer_size = strlen(inptr) + SIZ;
-	outptr = malloc(outptr_buffer_size);
-	if (outptr == NULL) return NULL;
-	strcpy(outptr, "");
-	output_len = 0;
-
-	do {
-		/* Fill the input buffer */
-		inbuf_len = strlen(inbuf);
-		if ( (done_reading == 0) && (inbuf_len < (SIZ-128)) ) {
-
-			ch = *inptr++;
-			if (ch != 0) {
-				inbuf[inbuf_len++] = ch;
-				inbuf[inbuf_len] = 0;
-			} 
-			else {
-				done_reading = 1;
-			}
-
-			++bytes_processed;
-			if (bytes_processed > msglen) {
-				done_reading = 1;
-			}
-
-		}
-
-		/* Do some parsing */
-		if (!IsEmptyStr(inbuf)) {
-
-
-		    /* Fold in all the spacing */
-		    for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
-			if (inbuf[i]==10) inbuf[i]=32;
-			if (inbuf[i]==13) inbuf[i]=32;
-			if (inbuf[i]==9) inbuf[i]=32;
-		    }
-		    for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
-			while ((inbuf[i]==32)&&(inbuf[i+1]==32)) {
-				strcpy(&inbuf[i], &inbuf[i+1]);
-			}
-		    }
-
-		    for (i=0; !IsEmptyStr(&inbuf[i]); ++i) {
-
-			ch = inbuf[i];
-
-			if (ch == '<') {
-				++nest;
-				strcpy(tag, "");
-			}
 
-			else if (ch == '>') {	/* We have a tag. */
-				if (nest > 0) --nest;
-
-				/* Unqualify the tag (truncate at first space) */
-				if (strchr(tag, ' ') != NULL) {
-					strcpy(strchr(tag, ' '), "");
-				}
-				
-				if (!strcasecmp(tag, "P")) {
-					strcat(outbuf, nl);
-					strcat(outbuf, nl);
-				}
-
-				if (!strcasecmp(tag, "/DIV")) {
-					strcat(outbuf, nl);
-					strcat(outbuf, nl);
-				}
-
-				if (!strcasecmp(tag, "LI")) {
-					strcat(outbuf, nl);
-					strcat(outbuf, " * ");
-				}
-
-				else if (!strcasecmp(tag, "/UL")) {
-					strcat(outbuf, nl);
-					strcat(outbuf, nl);
-				}
+// Encode the Unicode code point "ch" as UTF-8.
+// dest		= output buffer; up to 4 bytes are written and no null terminator is added
+// ch		= the code point to encode
+// Returns the number of bytes written, or 0 if "ch" cannot be encoded (a UTF-16 surrogate, or beyond U+10FFFF).
+int u8_wc_toutf8(char *dest, u_int32_t ch) {
+	if (ch < 0x80) {
+		dest[0] = (char)ch;
+		return 1;
+	}
+	if (ch < 0x800) {
+		dest[0] = (char)((ch>>6) | 0xC0);
+		dest[1] = (char)((ch & 0x3F) | 0x80);
+		return 2;
+	}
+	if ( (ch >= 0x110000) || ((ch >= 0xD800) && (ch <= 0xDFFF)) ) {
+		return 0;		// surrogates are reserved for UTF-16 and must never appear in UTF-8
+	}
+	// Three and four byte sequences differ only in the lead byte and the number of six-bit continuation bytes.
+	int len = (ch < 0x10000) ? 3 : 4;
+	dest[0] = (char)((ch >> (6 * (len - 1))) | ((len == 3) ? 0xE0 : 0xF0));
+	for (int i = 1; i < len; ++i) {
+		dest[i] = (char)(((ch >> (6 * (len - 1 - i))) & 0x3F) | 0x80);
+	}
+	return len;
+}
 
-				else if (!strcasecmp(tag, "H1")) {
-					strcat(outbuf, nl);
-					strcat(outbuf, nl);
-				}
 
-				else if (!strcasecmp(tag, "H2")) {
-					strcat(outbuf, nl);
-					strcat(outbuf, nl);
-				}
+// Try to embed an image in the display stream.
+// out			= the StrBuf to which we are writing the display stream
+// url			= the URL of the image (warning: it might be a data: URL, and it comes from untrusted HTML)
+// display_protocol	= currently only H2A_SIXEL is supported
+void h2a_embed_image(StrBuf *out, char *url, int display_protocol) {
 
-				else if (!strcasecmp(tag, "H3")) {
-					strcat(outbuf, nl);
-					strcat(outbuf, nl);
-				}
+	char buf[4096];		// holds the shell command first, then is reused as the read buffer for its output
+	int cmdlen = snprintf(buf, sizeof(buf), "curl -s '%s' | img2sixel - | fold", url);
 
-				else if (!strcasecmp(tag, "H4")) {
-					strcat(outbuf, nl);
-					strcat(outbuf, nl);
-				}
+	FILE *cmd = (strchr(url, '\'') || (cmdlen < 0) || (cmdlen >= (int)sizeof(buf))) ? NULL : popen(buf, "r");	// a ' in the URL would escape the shell quoting; a truncated command is garbage
+	if (!cmd) {
+		return;
+	}
 
-				else if (!strcasecmp(tag, "/H1")) {
-					strcat(outbuf, nl);
-				}
+	size_t bytes;
+	while (bytes = fread(buf, 1, sizeof(buf), cmd), bytes>0) {
+		StrBufAppendBufPlain(out, buf, bytes, 0);
+	}
+	pclose(cmd);
+}
 
-				else if (!strcasecmp(tag, "/H2")) {
-					strcat(outbuf, nl);
-				}
 
-				else if (!strcasecmp(tag, "/H3")) {
-					strcat(outbuf, nl);
-				}
+// Convert HTML to plain text.
+//
+// inputmsg     = pointer to raw HTML message (null terminated)
+// msglen       = stop reading after this many bytes (0 = read up to the null terminator)
+// screenwidth  = desired output screenwidth
+// flags        = Flags that can be set:
+//              H2A_ANSI	= Output ANSI terminal escape sequences
+//              H2A_SIXEL	= Output Sixel graphics for IMG tags (rendered through h2a_embed_image)
+// Returns the converted text as a C string, or NULL if a working buffer could not be allocated.
+char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, unsigned int flags) {
+	char *inbuf = NULL;			// writable copy of the input message
+	int inbuf_len = 0;
+	char tag[1024];				// name of the tag being processed (attributes are cut off at the first space)
+	char *tag_start = NULL;			// first byte after the '<' of the current tag
+	char *tag_end = NULL;			// the '>' that closes the current tag
+	StrBuf *out;				// the plain text output being accumulated
+	char *outptr;
+	int j;
+	char ch;
+	int tag_nesting_level = 0;		// angle bracket nesting level
+	int blockquote = 0;			// BLOCKQUOTE nesting level
+	int styletag = 0;			// STYLE tag nesting level
+	char nl[128];				// The current value of what a "newline" looks like (changes during blockquotes)
+
+	int ansi = (flags & H2A_ANSI) ? 1 : 0;		// Output to a terminal that can accept ANSI escape sequences
+	int sixel = (flags & H2A_SIXEL) ? 1 : 0;	// Output to a terminal that can accept Sixel graphics
+
+	out = NewStrBuf();
+	if (!out) {
+		return(NULL);
+	}
 
-				else if (!strcasecmp(tag, "/H4")) {
-					strcat(outbuf, nl);
-				}
+	tag[0] = '\0';
+	strcpy(nl, "\n");
+	if (msglen == 0) {
+		msglen = strlen(inputmsg);
+	}
 
-				else if (!strcasecmp(tag, "HR")) {
-					strcat(outbuf, nl);
-					strcat(outbuf, " ");
-					for (j=0; j<screenwidth-2; ++j)
-						strcat(outbuf, "-");
-					strcat(outbuf, nl);
-				}
+	if (inbuf = strdup(inputmsg), !inbuf) {		// work on a private, writable copy of the message
+		FreeStrBuf(&out);			// the output buffer is already allocated; don't leak it
+		return NULL;
+	}
 
-				else if (
-					(!strcasecmp(tag, "B"))
-					|| (!strcasecmp(tag, "/B"))
-					|| (!strcasecmp(tag, "STRONG"))
-					|| (!strcasecmp(tag, "/STRONG"))
-				) {
-					strcat(outbuf, "*");
-					
-				}
+	// "inbuf" ingests the unparsed HTML while we work with it.
+	inbuf_len = strlen(inbuf);
+	if (inbuf_len > msglen) {
+		inbuf[msglen] = 0;
+		inbuf_len = msglen;
+	}
 
-				else if (
-					(!strcasecmp(tag, "I"))
-					|| (!strcasecmp(tag, "/I"))
-					|| (!strcasecmp(tag, "EM"))
-					|| (!strcasecmp(tag, "/EM"))
-				) {
-					strcat(outbuf, "/");
-					
-				}
+	// Do some parsing
+	if (!IsEmptyStr(inbuf)) {
 
-				else if (
-					(!strcasecmp(tag, "U"))
-					|| (!strcasecmp(tag, "/U"))
-				) {
-					strcat(outbuf, "_");
-					
-				}
+		// Convert newlines, carriage returns, and tabs to spaces.
+		// A single pass is enough; searching again from the start after every replacement is quadratic on large messages.
+		char *sp;
+		for (sp = inbuf; *sp != 0; ++sp) {
+			if ((*sp == '\r') || (*sp == '\n') || (*sp == '\t')) {
+				*sp = ' ';
+			}
+		}
 
-				else if (!strcasecmp(tag, "BR")) {
-					strcat(outbuf, nl);
-				}
+		// Convert multiple spaces to a single space.  Source and destination overlap, so this must be memmove() and not strcpy().
+		for (sp = inbuf; sp = strstr(sp, "  "), sp != NULL; ) {
+			memmove(sp, sp+1, strlen(sp));		// strlen(sp) bytes starting at sp+1 includes the null terminator
+		}
 
-				else if (!strcasecmp(tag, "TR")) {
-					strcat(outbuf, nl);
-				}
+		// Run through the markup performing the conversion.
+		char *inptr = inbuf;
+		int linelen = 0;
+		while (ch = inptr[0], ch != 0) {
 
-				else if (!strcasecmp(tag, "/TABLE")) {
-					strcat(outbuf, nl);
-				}
+			// Keep track of how many angle brackets were found in case someone is sloppy with them
+			// or tries to nest tags.  If nest is 0 then we are within text; if it is nonzero then we
+			// are within a tag.
 
-				else if (!strcasecmp(tag, "BLOCKQUOTE")) {
-					++blockquote;
-					strcpy(nl, "\n");
-					for (j=0; j<blockquote; ++j) strcat(nl, ">");
-					strcat(outbuf, nl);
-				}
+			if (ch == '<') {		// We have hit the beginning of a tag.
+				++tag_nesting_level;
+				tag_start = inptr + 1;
+				strcpy(tag, "");
+			}
 
-				else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
-					strcat(outbuf, "\n");
-					--blockquote;
-					strcpy(nl, "\n");
-					for (j=0; j<blockquote; ++j) strcat(nl, ">");
-					strcat(outbuf, nl);
+			else if ((ch == '>') && (tag_nesting_level > 0)) {	// End of a tag.  A stray '>' outside any tag is ordinary text: tag_start would be NULL or stale.
+				if (tag_nesting_level > 0) {
+					--tag_nesting_level;
 				}
+				if (tag_nesting_level == 0) {
+					tag_end = inptr;
 
-				else if (!strcasecmp(tag, "STYLE")) {
-					++styletag;
-					if (styletag == 1) {
-						styletag_start = strlen(outbuf);
+					size_t tag_len = tag_end - tag_start;
+					if (tag_len >= sizeof(tag)) {
+						tag_len = sizeof(tag) - 1;	// leave room for the null terminator stored at tag[tag_len]
 					}
-				}
+					strncpy(tag, tag_start, tag_len);
+					tag[tag_len] = 0;
 
-				else if (!strcasecmp(tag, "/STYLE")) {
-					--styletag;
-					if (styletag == 0) {
-						outbuf[styletag_start] = 0;
+					// Unqualify the tag (truncate at first space)
+					char *tagsp = strchr(tag, ' ');
+					if (tagsp) {
+						*tagsp = 0;
 					}
-				}
-
-			}
-
-			else if ((nest > 0) && (strlen(tag)<(sizeof(tag)-1))) {
-				tag[strlen(tag)+1] = 0;
-				tag[strlen(tag)] = ch;
-			}
-				
-			else if (!nest) {
-				outbuf[strlen(outbuf)+1] = 0;
-				outbuf[strlen(outbuf)] = ch;
-			}
-		    }
-		    strcpy(inbuf, &inbuf[i]);
-		}
-
-		/* Convert &; tags to the forbidden characters */
-		if (!IsEmptyStr(outbuf)) for (i=0; !IsEmptyStr(&outbuf[i]); ++i) {
-
-			/* Character entity references */
-			if (!strncasecmp(&outbuf[i], "&nbsp;", 6)) {
-				outbuf[i] = ' ';
-				strcpy(&outbuf[i+1], &outbuf[i+6]);
-			}
 
-			if (!strncasecmp(&outbuf[i], "&ensp;", 6)) {
-				outbuf[i] = ' ';
-				strcpy(&outbuf[i+1], &outbuf[i+6]);
-			}
-
-			if (!strncasecmp(&outbuf[i], "&emsp;", 6)) {
-				outbuf[i] = ' ';
-				strcpy(&outbuf[i+1], &outbuf[i+6]);
-			}
-
-			if (!strncasecmp(&outbuf[i], "&thinsp;", 8)) {
-				outbuf[i] = ' ';
-				strcpy(&outbuf[i+1], &outbuf[i+8]);
-			}
+					// IMG tag on sixel terminals -- try to display the image
+					if ( (!strcasecmp(tag, "img")) && sixel) {
+						char *q1, *q2;
+						char url[SIZ];
+						// look for src attribute
+						char *src = bmstrcasestr(tag_start, "src=");
+						q1 = q2 = NULL;
+						if (src && src<tag_end) {
+							if (q1 = strchr(src, '"'), q1 && q1<tag_end) {		// in double quotes
+								++q1;
+								q2 = strchr(q1, '"');
+							}
+							else if (q1 = strchr(src, '\''), q1 && q1<tag_end) {	// in single quotes
+								++q1;
+								q2 = strchr(q1, '\'');
+							}
+							// Image URLs (data: URLs especially) can be longer than url[], so check the length before copying.
+							if (q1 && q1<q2 && q2<tag_end && ((size_t)(q2-q1) < sizeof(url))) {
+								memcpy(url, q1, q2-q1);
+								url[q2-q1] = 0;
+								h2a_embed_image(out, url, H2A_SIXEL);		// try to display
+								linelen = 0;
+							}
+						}
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&lt;", 4)) {
-				outbuf[i] = '<';
-				strcpy(&outbuf[i+1], &outbuf[i+4]);
-			}
+					// IMG tag on non-sixel terminals -- we can display the alt text
+					if ( (!strcasecmp(tag, "img")) && !sixel) {
+						char *q1, *q2;
+						// look for alt text
+						char *alt = bmstrcasestr(tag_start, "alt=");
+						q1 = q2 = NULL;
+						if (alt && alt<tag_end) {
+							if (q1 = strchr(alt, '"'), q1 && q1<tag_end) {		// in double quotes
+								++q1;
+								q2 = strchr(q1, '"');
+							}
+							else if (q1 = strchr(alt, '\''), q1 && q1<tag_end) {	// in single quotes
+								++q1;
+								q2 = strchr(q1, '\'');
+							}
+							if (q1 && q1<q2 && q2<tag_end) {
+								StrBufAppendBufPlain(out, q1, (long)(q2-q1), 0);
+								linelen += (int)(q2-q1);	// the alt text is part of the current line, so the soft wrap must see it
+							}
+						}
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&gt;", 4)) {
-				outbuf[i] = '>';
-				strcpy(&outbuf[i+1], &outbuf[i+4]);
-			}
+					if (!strcasecmp(tag, "P")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&amp;", 5)) {
-				strcpy(&outbuf[i+1], &outbuf[i+5]);
-			}
+					if (!strcasecmp(tag, "/DIV")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&quot;", 6)) {
-				outbuf[i] = '\"';
-				strcpy(&outbuf[i+1], &outbuf[i+6]);
-			}
+					if (!strcasecmp(tag, "LI")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						StrBufAppendBufPlain(out, HKEY(" * "), 0);
+						linelen = 3;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&lsquo;", 7)) {
-				outbuf[i] = '`';
-				strcpy(&outbuf[i+1], &outbuf[i+7]);
-			}
+					else if (!strcasecmp(tag, "/UL")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&rsquo;", 7)) {
-				outbuf[i] = '\'';
-				strcpy(&outbuf[i+1], &outbuf[i+7]);
-			}
+					else if (!strcasecmp(tag, "H1")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&copy;", 6)) {
-				outbuf[i] = '(';
-				outbuf[i+1] = 'c';
-				outbuf[i+2] = ')';
-				strcpy(&outbuf[i+3], &outbuf[i+6]);
-			}
+					else if (!strcasecmp(tag, "H2")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&bull;", 6)) {
-				outbuf[i] = ' ';
-				outbuf[i+1] = '*';
-				outbuf[i+2] = ' ';
-				strcpy(&outbuf[i+3], &outbuf[i+6]);
-			}
+					else if (!strcasecmp(tag, "H3")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&hellip;", 8)) {
-				outbuf[i] = '.';
-				outbuf[i+1] = '.';
-				outbuf[i+2] = '.';
-				strcpy(&outbuf[i+3], &outbuf[i+8]);
-			}
+					else if (!strcasecmp(tag, "H4")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&trade;", 7)) {
-				outbuf[i] = '(';
-				outbuf[i+1] = 't';
-				outbuf[i+2] = 'm';
-				outbuf[i+3] = ')';
-				strcpy(&outbuf[i+4], &outbuf[i+7]);
-			}
+					else if (!strcasecmp(tag, "/H1")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&reg;", 5)) {
-				outbuf[i] = '(';
-				outbuf[i+1] = 'r';
-				outbuf[i+2] = ')';
-				strcpy(&outbuf[i+3], &outbuf[i+5]);
-			}
+					else if (!strcasecmp(tag, "/H2")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&frac14;", 8)) {
-				outbuf[i] = '1';
-				outbuf[i+1] = '/';
-				outbuf[i+2] = '4';
-				strcpy(&outbuf[i+3], &outbuf[i+8]);
-			}
+					else if (!strcasecmp(tag, "/H3")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&frac12;", 8)) {
-				outbuf[i] = '1';
-				outbuf[i+1] = '/';
-				outbuf[i+2] = '2';
-				strcpy(&outbuf[i+3], &outbuf[i+8]);
-			}
+					else if (!strcasecmp(tag, "/H4")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&frac34;", 8)) {
-				outbuf[i] = '3';
-				outbuf[i+1] = '/';
-				outbuf[i+2] = '4';
-				strcpy(&outbuf[i+3], &outbuf[i+8]);
-			}
+					else if (!strcasecmp(tag, "HR")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						StrBufAppendBufPlain(out, HKEY(" "), 0);
+						for (j = 0; j < screenwidth - 2; ++j) {
+							StrBufAppendBufPlain(out, HKEY("-"), 0);
+						}
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&ndash;", 7)) {
-				outbuf[i] = '-';
-				outbuf[i+1] = '-';
-				strcpy(&outbuf[i+2], &outbuf[i+7]);
-			}
+					else if (	(!strcasecmp(tag, "B"))
+							|| (!strcasecmp(tag, "STRONG"))
+					) {
+						if (ansi) {
+							StrBufAppendBufPlain(out, HKEY("\033[1m"), 0);
+						}
+					}
+					else if (	(!strcasecmp(tag, "/B"))
+							|| (!strcasecmp(tag, "/STRONG"))
+					) {
+						if (ansi) {
+							StrBufAppendBufPlain(out, HKEY("\033[22m"), 0);
+						}
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&mdash;", 7)) {
-				outbuf[i] = '-';
-				outbuf[i+1] = '-';
-				outbuf[i+2] = '-';
-				strcpy(&outbuf[i+3], &outbuf[i+7]);
-			}
+					else if (	(!strcasecmp(tag, "I"))
+							|| (!strcasecmp(tag, "EM"))
+					) {
+						if (ansi) {
+							StrBufAppendBufPlain(out, HKEY("\033[3m"), 0);
+						}
+					}
 
-			else if (!strncmp(&outbuf[i], "&Ccedil;", 8)) {
-				outbuf[i] = 'C';
-				strcpy(&outbuf[i+1], &outbuf[i+8]);
-			}
+					else if (	(!strcasecmp(tag, "/I"))
+							|| (!strcasecmp(tag, "/EM"))
+					) {
+						if (ansi) {
+							StrBufAppendBufPlain(out, HKEY("\033[23m"), 0);
+						}
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&ccedil;", 8)) {
-				outbuf[i] = 'c';
-				strcpy(&outbuf[i+1], &outbuf[i+8]);
-			}
+					else if (!strcasecmp(tag, "U")) {
+						if (ansi) {
+							StrBufAppendBufPlain(out, HKEY("\033[4m"), 0);
+						}
+					}
 
-			else if (!strncmp(&outbuf[i], "&Egrave;", 8)) {
-				outbuf[i] = 'E';
-				strcpy(&outbuf[i+1], &outbuf[i+8]);
-			}
+					else if (!strcasecmp(tag, "/U")) {
+						if (ansi) {
+							StrBufAppendBufPlain(out, HKEY("\033[24m"), 0);
+						}
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&egrave;", 8)) {
-				outbuf[i] = 'e';
-				strcpy(&outbuf[i+1], &outbuf[i+8]);
-			}
+					else if (!strcasecmp(tag, "BR")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncmp(&outbuf[i], "&Ecirc;", 7)) {
-				outbuf[i] = 'E';
-				strcpy(&outbuf[i+1], &outbuf[i+7]);
-			}
+					else if (!strcasecmp(tag, "TR")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&ecirc;", 7)) {
-				outbuf[i] = 'e';
-				strcpy(&outbuf[i+1], &outbuf[i+7]);
-			}
+					else if (!strcasecmp(tag, "/TABLE")) {
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncmp(&outbuf[i], "&Eacute;", 8)) {
-				outbuf[i] = 'E';
-				strcpy(&outbuf[i+1], &outbuf[i+8]);
-			}
+					else if (!strcasecmp(tag, "BLOCKQUOTE")) {
+						++blockquote;
+						strcpy(nl, "\n");
+						if ((blockquote == 1) && (ansi)) {
+							strcat(nl, "\033[2m\033[2m");
+						}
+						for (j = 0; (j < blockquote) && (strlen(nl) < sizeof(nl) - 1); ++j) {	// deep nesting must not overflow nl[]
+							strcat(nl, ">");
+						}
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&eacute;", 8)) {
-				outbuf[i] = 'e';
-				strcpy(&outbuf[i+1], &outbuf[i+8]);
-			}
+					else if (!strcasecmp(tag, "/BLOCKQUOTE")) {
+						StrBufAppendBufPlain(out, HKEY("\n"), 0);
+						// Step out one level (a stray closing tag must not drive the level negative); reset the dim attribute at level 0.
+						if ((blockquote > 0) && (--blockquote == 0) && (ansi)) {
+							StrBufAppendBufPlain(out, HKEY("\033[22m\033[22m"), 0);
+						}
+						strcpy(nl, "\n");
+						for (j = 0; (j < blockquote) && (strlen(nl) < sizeof(nl) - 1); ++j) {	// deep nesting must not overflow nl[]
+							strcat(nl, ">");
+						}
+						StrBufAppendBufPlain(out, nl, -1, 0);
+						linelen = 0;
+					}
 
-			else if (!strncmp(&outbuf[i], "&Agrave;", 8)) {
-				outbuf[i] = 'A';
-				strcpy(&outbuf[i+1], &outbuf[i+8]);
-			}
+					else if (!strcasecmp(tag, "STYLE")) {
+						++styletag;
+					}
 
-			else if (!strncasecmp(&outbuf[i], "&agrave;", 8)) {
-				outbuf[i] = 'a';
-				strcpy(&outbuf[i+1], &outbuf[i+8]);
-			}
+					else if ((!strcasecmp(tag, "/STYLE")) && (styletag > 0)) {	// a stray </style> must not make the level negative: text is only copied while it is 0
+						--styletag;
+					}
+				}
 
-			else if (!strncasecmp(&outbuf[i], "&ldquo;", 7)) {
-				outbuf[i] = '\"';
-				strcpy(&outbuf[i+1], &outbuf[i+7]);
 			}
 
-			else if (!strncasecmp(&outbuf[i], "&rdquo;", 7)) {
-				outbuf[i] = '\"';
-				strcpy(&outbuf[i+1], &outbuf[i+7]);
+			// copy non-tag text to the output buffer
+			else if ((!tag_nesting_level) && (styletag == 0)) {
+				StrBufAppendBufPlain(out, &ch, 1, 0);
+				++linelen;
 			}
 
-			else if (!strncasecmp(&outbuf[i], "&acute;", 7)) {
-				outbuf[i] = '\'';
-				strcpy(&outbuf[i+1], &outbuf[i+7]);
-			}
+			// Handle numeric entities, but only in text that was copied to the output (never inside a tag or a STYLE element)
+			if ((ch == ';') && (!tag_nesting_level) && (styletag == 0)) {
 
-			/* two-digit decimal equivalents */
-			else if (outbuf[i] == '&'       &&
-				 outbuf[i + 1] == '#'   &&
-				 isdigit(outbuf[i + 2]) && 
-				 isdigit(outbuf[i + 3]) &&
-				 (outbuf[i+4] == ';') ) 
-			{
-				scanch = 0;
-				sscanf(&outbuf[i+2], "%02d", &scanch);
-				outbuf[i] = scanch;
-				strcpy(&outbuf[i+1], &outbuf[i+5]);
-			}
+				u_int32_t scanch = 0;
+				int elen = 0;
 
-			/* three-digit decimal equivalents */
-			else if (outbuf[i] == '&'       &&
-				 outbuf[i + 1] == '#'   &&
-				 isdigit(outbuf[i + 2]) && 
-				 isdigit(outbuf[i + 3]) && 
-				 isdigit(outbuf[i + 4]) &&
-				 (outbuf[i + 5] == ';') ) 
-			{
-				scanch = 0;
-				sscanf(&outbuf[i+2], "%03d", &scanch);
-				outbuf[i] = scanch;
-				strcpy(&outbuf[i+1], &outbuf[i+6]);
-			}
+				if ( (linelen >= 5) && (*(inptr-4) == '&') && (*(inptr-3) == '#') ) {		// &#NN;   (%u because scanch is unsigned)
+					sscanf(inptr-2, "%02u", &scanch);
+					elen = 5;
+				}
+				else if ( (linelen >= 6) && (*(inptr-5) == '&') && (*(inptr-4) == '#') ) {	// &#NNN;
+					sscanf(inptr-3, "%03u", &scanch);
+					elen = 6;
+				}
+				else if ( (linelen >= 7) && (*(inptr-6) == '&') && (*(inptr-5) == '#') ) {	// &#NNNN;
+					sscanf(inptr-4, "%04u", &scanch);	// the digits begin right after the '#', four bytes before the ';'
+					elen = 7;
+				}
+				else if ( (linelen >= 8) && (*(inptr-7) == '&') && (*(inptr-6) == '#') ) {	// &#NNNNN;
+					sscanf(inptr-5, "%05u", &scanch);	// the digits begin right after the '#', five bytes before the ';'
+					elen = 8;
+				}
 
-			/* four-digit decimal equivalents */
-			else if (outbuf[i] == '&'       &&
-				 outbuf[i + 1] == '#'   &&
-				 isdigit(outbuf[i + 2]) && 
-				 isdigit(outbuf[i + 3]) && 
-				 isdigit(outbuf[i + 4]) &&
-				 isdigit(outbuf[i + 5]) &&
-				 (outbuf[i + 6] == ';') ) 
-			{
-				scanch = 0;
-				sscanf(&outbuf[i+2], "%04d", &scanch);
-				outbuf[i] = scanch;
-				strcpy(&outbuf[i+1], &outbuf[i+6]);
-			}
+				if (scanch) {
+					StrBufCutRight(out, elen);
+					linelen -= elen;
 
-		}
+					char utf[5];
+					int ulen = u8_wc_toutf8(utf, scanch);
+					utf[ulen] = 0;
+					StrBufAppendBufPlain(out, utf, ulen, 0);
+					linelen += ulen;	// count the bytes actually appended; adding elen back would overstate the line and misplace the soft wrap pointer
+				}
 
-		/* Make sure the output buffer is big enough */
-		if ((output_len + strlen(outbuf) + SIZ) > outptr_buffer_size) {
-			outptr_buffer_size += SIZ;
-			outptr = realloc(outptr, outptr_buffer_size);
-			if (outptr == NULL) {
-				abort();
 			}
-		}
 
-		/* Output any lines terminated with hard line breaks */
-		do {
-			did_out = 0;
-			if (strlen(outbuf) > 0) {
-			    for (i = 0; i<strlen(outbuf); ++i) {
-				if ( (i<(screenwidth-2)) && (outbuf[i]=='\n')) {
-
-					strncpy(&outptr[output_len], outbuf, i+1);
-					output_len += (i+1);
-
-					if (do_citaformat) {
-						strcpy(&outptr[output_len], " ");
-						++output_len;
-					}
-
-					strcpy(outbuf, &outbuf[i+1]);
-					i = 0;
-					did_out = 1;
-				}
-			}
-		    }
-		} while (did_out);
-
-		/* Add soft line breaks */
-		if (strlen(outbuf) > (screenwidth - 2 )) {
-			rb = (-1);
-			for (i=0; i<(screenwidth-2); ++i) {
-				if (outbuf[i]==32) rb = i;
-			}
-			if (rb>=0) {
-				strncpy(&outptr[output_len], outbuf, rb);
-				output_len += rb;
-				strcpy(&outptr[output_len], nl);
-				output_len += strlen(nl);
-				if (do_citaformat) {
-					strcpy(&outptr[output_len], " ");
-					++output_len;
-				}
-				strcpy(outbuf, &outbuf[rb+1]);
-			} else {
-				strncpy(&outptr[output_len], outbuf,
-					screenwidth-2);
-				output_len += (screenwidth-2);
-				strcpy(&outptr[output_len], nl);
-				output_len += strlen(nl);
-				if (do_citaformat) {
-					strcpy(&outptr[output_len], " ");
-					++output_len;
+			// Add soft line breaks when necessary
+			if (linelen > (screenwidth - 8)) {
+				char *ptr = (char *)ChrPtr(out) + StrLength(out) - linelen;
+				char *rightmost_space = strrchr(ptr, ' ');
+				if (rightmost_space && rightmost_space > ptr) {
+					int space_pos = rightmost_space - ChrPtr(out);
+					StrBufReplaceToken(out, (long)space_pos, 1, nl, strlen(nl));
+					linelen = strlen(ChrPtr(out) + space_pos) - 1;	// the replacement can grow (and so move) the buffer: re-fetch it, don't reuse rightmost_space
 				}
-				strcpy(outbuf, &outbuf[screenwidth-2]);
 			}
-		}
-
-	} while (done_reading == 0);
 
-	strcpy(&outptr[output_len], outbuf);
-	output_len += strlen(outbuf);
-
-	/* Strip leading/trailing whitespace.  We can't do this with
-	 * striplt() because it uses too many strlen()'s
-	 */
+			// Advance to the next byte of input.
+			inptr++;
+		}
+	}
+	free(inbuf);
+
+	// Convert entity tags to printable characters (the replacement text is UTF-8)
+	StrBufReplaceAllOccurrences(out, "&nbsp;", " ");
+	StrBufReplaceAllOccurrences(out, "&ensp;", " ");
+	StrBufReplaceAllOccurrences(out, "&emsp;", " ");
+	StrBufReplaceAllOccurrences(out, "&thinsp;", " ");
+	StrBufReplaceAllOccurrences(out, "&lt;", "<");
+	StrBufReplaceAllOccurrences(out, "&gt;", ">");
+	StrBufReplaceAllOccurrences(out, "&quot;", "\"");
+	StrBufReplaceAllOccurrences(out, "&lsquo;", "`");
+	StrBufReplaceAllOccurrences(out, "&rsquo;", "'");
+	StrBufReplaceAllOccurrences(out, "&bull;", " * ");
+	StrBufReplaceAllOccurrences(out, "&hellip;", "…");
+	StrBufReplaceAllOccurrences(out, "&copy;", "©");
+	StrBufReplaceAllOccurrences(out, "&trade;", "™");
+	StrBufReplaceAllOccurrences(out, "&reg;", "®");
+	StrBufReplaceAllOccurrences(out, "&frac14;", "¼");
+	StrBufReplaceAllOccurrences(out, "&frac12;", "½");
+	StrBufReplaceAllOccurrences(out, "&frac34;", "¾");
+	StrBufReplaceAllOccurrences(out, "&ndash;", "–");
+	StrBufReplaceAllOccurrences(out, "&mdash;", "—");
+	StrBufReplaceAllOccurrences(out, "&Ccedil;", "Ç");
+	StrBufReplaceAllOccurrences(out, "&ccedil;", "ç");
+	StrBufReplaceAllOccurrences(out, "&Egrave;", "È");
+	StrBufReplaceAllOccurrences(out, "&egrave;", "è");
+	StrBufReplaceAllOccurrences(out, "&Ecirc;", "Ê");
+	StrBufReplaceAllOccurrences(out, "&ecirc;", "ê");
+	StrBufReplaceAllOccurrences(out, "&Eacute;", "É");
+	StrBufReplaceAllOccurrences(out, "&eacute;", "é");
+	StrBufReplaceAllOccurrences(out, "&Agrave;", "À");
+	StrBufReplaceAllOccurrences(out, "&agrave;", "à");
+	StrBufReplaceAllOccurrences(out, "&ldquo;", "\"");
+	StrBufReplaceAllOccurrences(out, "&rdquo;", "\"");
+	StrBufReplaceAllOccurrences(out, "&acute;", "'");
+	StrBufReplaceAllOccurrences(out, "&#8217;", "'");
+	StrBufReplaceAllOccurrences(out, "&#8211;", "-");
+	StrBufReplaceAllOccurrences(out, "&amp;", "&");		// must come last, or "&amp;quot;" would be decoded twice
+
+	// Convert from a StrBuf to a plain C string
+	int output_len = StrLength(out);
+	outptr = SmashStrBuf(&out);
+
+	// Strip leading whitespace
 	while ((output_len > 0) && (isspace(outptr[0]))) {
 		strcpy(outptr, &outptr[1]);
 		--output_len;
 	}
-	while ((output_len > 0) && (isspace(outptr[output_len-1]))) {
-		outptr[output_len-1] = 0;
+
+	// Strip trailing whitespace
+	while ((output_len > 0) && (isspace(outptr[output_len - 1]))) {
+		outptr[output_len - 1] = 0;
 		--output_len;
 	}
 
-	if ((output_len > 0) && (outptr[output_len-1] != '\n')) {
+	// Make sure the final line ends with a newline character.
+	if ((output_len > 0) && (outptr[output_len - 1] != '\n')) {
 		strcat(outptr, "\n");
 		++output_len;
 	}
 
 	return outptr;
-
 }