X-Git-Url: https://code.citadel.org/?a=blobdiff_plain;f=libcitadel%2Flib%2Fhtml_to_ascii.c;h=ca2de8df8d9e7b0eb756ca1a7186a16d5f571641;hb=HEAD;hp=faa343710c1c19556b67cabd9be578e65dc38f78;hpb=bca06b89514d8d91c1442735272ab10ea0e19f9a;p=citadel.git diff --git a/libcitadel/lib/html_to_ascii.c b/libcitadel/lib/html_to_ascii.c index faa343710..8f3c9eca0 100644 --- a/libcitadel/lib/html_to_ascii.c +++ b/libcitadel/lib/html_to_ascii.c @@ -1,21 +1,7 @@ -/* - * Functions which handle translation between HTML and plain text - * Copyright (c) 2000-2010 by the citadel.org team - * - * This program is open source software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ +// This is an HTML to plain text converter. +// Copyright (c) 2000-2024 by the citadel.org team (Art Cancro et al.) +// +// This program is open source software. Use, duplication, or disclosure is subject to the GNU General Public License version 3. #include #include @@ -27,576 +13,499 @@ #include #include #include - -#if TIME_WITH_SYS_TIME -# include -# include -#else -# if HAVE_SYS_TIME_H -# include -# else -# include -# endif -#endif - +#include #include "libcitadel.h" - - -/* - * Convert HTML to plain text. - * - * inputmsg = pointer to raw HTML message - * screenwidth = desired output screenwidth - * do_citaformat = set to 1 to indent newlines with spaces - */ -char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, int do_citaformat) { - char inbuf[SIZ]; - int inbuf_len = 0; - char outbuf[SIZ]; - char tag[1024]; - int done_reading = 0; - const char *inptr; - char *outptr; - size_t outptr_buffer_size; - size_t output_len = 0; - int i, j, ch, did_out, rb, scanch; - int nest = 0; /* Bracket nesting level */ - int blockquote = 0; /* BLOCKQUOTE nesting level */ - int styletag = 0; /* STYLE tag nesting level */ - int styletag_start = 0; - int bytes_processed = 0; - char nl[128]; - - strcpy(nl, "\n"); - inptr = inputmsg; - strcpy(inbuf, ""); - strcpy(outbuf, ""); - if (msglen == 0) msglen = strlen(inputmsg); - - outptr_buffer_size = strlen(inptr) + SIZ; - outptr = malloc(outptr_buffer_size); - if (outptr == NULL) return NULL; - strcpy(outptr, ""); - output_len = 0; - - do { - /* Fill the input buffer */ - inbuf_len = strlen(inbuf); - if ( (done_reading == 0) && (inbuf_len < (SIZ-128)) ) { - - ch = *inptr++; - if (ch != 0) { - inbuf[inbuf_len++] = ch; - inbuf[inbuf_len] = 0; - } - else { - done_reading = 1; - } - - ++bytes_processed; - if (bytes_processed > msglen) { - done_reading = 1; - } - - } - - /* Do some parsing */ - if (!IsEmptyStr(inbuf)) { - - - /* Fold in all the spacing */ - for (i=0; !IsEmptyStr(&inbuf[i]); ++i) { - if (inbuf[i]==10) inbuf[i]=32; - if (inbuf[i]==13) inbuf[i]=32; - if (inbuf[i]==9) inbuf[i]=32; - } - for (i=0; !IsEmptyStr(&inbuf[i]); ++i) { - while ((inbuf[i]==32)&&(inbuf[i+1]==32)) { - strcpy(&inbuf[i], &inbuf[i+1]); - } - } - - for (i=0; !IsEmptyStr(&inbuf[i]); ++i) { - - ch = inbuf[i]; - - if (ch == '<') { - ++nest; - strcpy(tag, ""); - } - else if (ch == '>') { /* We have a tag. */ - if (nest > 0) --nest; - - /* Unqualify the tag (truncate at first space) */ - if (strchr(tag, ' ') != NULL) { - strcpy(strchr(tag, ' '), ""); - } - - if (!strcasecmp(tag, "P")) { - strcat(outbuf, nl); - strcat(outbuf, nl); - } - - if (!strcasecmp(tag, "/DIV")) { - strcat(outbuf, nl); - strcat(outbuf, nl); - } - - if (!strcasecmp(tag, "LI")) { - strcat(outbuf, nl); - strcat(outbuf, " * "); - } - - else if (!strcasecmp(tag, "/UL")) { - strcat(outbuf, nl); - strcat(outbuf, nl); - } +int u8_wc_toutf8(char *dest, u_int32_t ch) { + if (ch < 0x80) { + dest[0] = (char)ch; + return 1; + } + if (ch < 0x800) { + dest[0] = (ch>>6) | 0xC0; + dest[1] = (ch & 0x3F) | 0x80; + return 2; + } + if (ch < 0x10000) { + dest[0] = (ch>>12) | 0xE0; + dest[1] = ((ch>>6) & 0x3F) | 0x80; + dest[2] = (ch & 0x3F) | 0x80; + return 3; + } + if (ch < 0x110000) { + dest[0] = (ch>>18) | 0xF0; + dest[1] = ((ch>>12) & 0x3F) | 0x80; + dest[2] = ((ch>>6) & 0x3F) | 0x80; + dest[3] = (ch & 0x3F) | 0x80; + return 4; + } + return 0; +} - else if (!strcasecmp(tag, "H1")) { - strcat(outbuf, nl); - strcat(outbuf, nl); - } - else if (!strcasecmp(tag, "H2")) { - strcat(outbuf, nl); - strcat(outbuf, nl); - } +// Try to embed an image in the display stream. +// out = the StrBuf to which we are writing the display stream +// url = the URL of the image (warning: it might be a data: URL) +// display_protocol = currently only H2A_SIXEL is supported +void h2a_embed_image(StrBuf *out, char *url, int display_protocol) { - else if (!strcasecmp(tag, "H3")) { - strcat(outbuf, nl); - strcat(outbuf, nl); - } + char buf[4096]; + snprintf(buf, sizeof(buf), "curl -s '%s' | img2sixel - | fold", url); - else if (!strcasecmp(tag, "H4")) { - strcat(outbuf, nl); - strcat(outbuf, nl); - } + FILE *cmd = popen(buf, "r"); + if (!cmd) { + return; + } - else if (!strcasecmp(tag, "/H1")) { - strcat(outbuf, nl); - } + size_t bytes; + while (bytes = fread(buf, 1, sizeof(buf), cmd), bytes>0) { + StrBufAppendBufPlain(out, buf, bytes, 0); + } + pclose(cmd); +} - else if (!strcasecmp(tag, "/H2")) { - strcat(outbuf, nl); - } - else if (!strcasecmp(tag, "/H3")) { - strcat(outbuf, nl); - } +// Convert HTML to plain text. +// +// inputmsg = pointer to raw HTML message +// msglen = stop reading after this many bytes +// screenwidth = desired output screenwidth +// flags = Flags that can be set: +// H2A_ANSI = Output ANSI terminal escape sequences +// H2A_SIXEL = Output Sixel graphics (not yet implemented) +// +char *html_to_ascii(const char *inputmsg, int msglen, int screenwidth, unsigned int flags) { + char *inbuf = NULL; + int inbuf_len = 0; + char tag[1024]; + char *tag_start = NULL; + char *tag_end = NULL; + StrBuf *out; + char *outptr; + int j; + char ch; + int tag_nesting_level = 0; // angle bracket nesting level + int blockquote = 0; // BLOCKQUOTE nesting level + int styletag = 0; // STYLE tag nesting level + char nl[128]; // The current value of what a "newline" looks like (changes during blockquotes) + + int ansi = (flags & H2A_ANSI) ? 1 : 0; // Output to a terminal that can accept ANSI escape sequences + int sixel = (flags & H2A_SIXEL) ? 1 : 0; // Output to a terminal that can accept Sixel graphics + + out = NewStrBuf(); + if (!out) { + return(NULL); + } - else if (!strcasecmp(tag, "/H4")) { - strcat(outbuf, nl); - } + tag[0] = '\0'; + strcpy(nl, "\n"); + if (msglen == 0) { + msglen = strlen(inputmsg); + } - else if (!strcasecmp(tag, "HR")) { - strcat(outbuf, nl); - strcat(outbuf, " "); - for (j=0; j msglen) { + inbuf[msglen] = 0; + inbuf_len = msglen; + } - else if ( - (!strcasecmp(tag, "I")) - || (!strcasecmp(tag, "/I")) - || (!strcasecmp(tag, "EM")) - || (!strcasecmp(tag, "/EM")) - ) { - strcat(outbuf, "/"); - - } + // Do some parsing + if (!IsEmptyStr(inbuf)) { - else if ( - (!strcasecmp(tag, "U")) - || (!strcasecmp(tag, "/U")) - ) { - strcat(outbuf, "_"); - - } + // Convert newlines, carriage returns, and tabs to spaces + char *sp; + while ( (sp = strchr(inbuf, '\r')) + || (sp = strchr(inbuf, '\n')) + || (sp = strchr(inbuf, '\t')) + ) { + *sp = ' '; + } - else if (!strcasecmp(tag, "BR")) { - strcat(outbuf, nl); - } + // Convert multiple spaces to a single space. + while (sp = strstr(inbuf, " "), sp!=NULL) { + strcpy(sp, sp+1); + } - else if (!strcasecmp(tag, "TR")) { - strcat(outbuf, nl); - } + // Run through the markup performing the conversion. + char *inptr = inbuf; + int linelen = 0; + while (ch = inptr[0], ch != 0) { - else if (!strcasecmp(tag, "/TABLE")) { - strcat(outbuf, nl); - } + // Keep track of how many angle brackets were found in case someone is sloppy with them + // or tries to nest tags. If nest is 0 then we are within text; if it is nonzero then we + // are within a tag. - else if (!strcasecmp(tag, "BLOCKQUOTE")) { - ++blockquote; - strcpy(nl, "\n"); - for (j=0; j"); - strcat(outbuf, nl); - } + if (ch == '<') { // We have hit the beginning of a tag. + ++tag_nesting_level; + tag_start = inptr + 1; + strcpy(tag, ""); + } - else if (!strcasecmp(tag, "/BLOCKQUOTE")) { - strcat(outbuf, "\n"); - --blockquote; - strcpy(nl, "\n"); - for (j=0; j"); - strcat(outbuf, nl); + else if (ch == '>') { // We have hit the end of a tag. + if (tag_nesting_level > 0) { + --tag_nesting_level; } + if (tag_nesting_level == 0) { + tag_end = inptr; - else if (!strcasecmp(tag, "STYLE")) { - ++styletag; - if (styletag == 1) { - styletag_start = strlen(outbuf); + size_t tag_len = tag_end - tag_start; + if (tag_len >= sizeof(tag)) { + tag_len = sizeof(tag); } - } + strncpy(tag, tag_start, tag_len); + tag[tag_len] = 0; - else if (!strcasecmp(tag, "/STYLE")) { - --styletag; - if (styletag == 0) { - outbuf[styletag_start] = 0; + // Unqualify the tag (truncate at first space) + char *tagsp = strchr(tag, ' '); + if (tagsp) { + *tagsp = 0; } - } - - } - - else if ((nest > 0) && (strlen(tag)<(sizeof(tag)-1))) { - tag[strlen(tag)+1] = 0; - tag[strlen(tag)] = ch; - } - - else if (!nest) { - outbuf[strlen(outbuf)+1] = 0; - outbuf[strlen(outbuf)] = ch; - } - } - strcpy(inbuf, &inbuf[i]); - } - - /* Convert &; tags to the forbidden characters */ - if (!IsEmptyStr(outbuf)) for (i=0; !IsEmptyStr(&outbuf[i]); ++i) { - - /* Character entity references */ - if (!strncasecmp(&outbuf[i], " ", 6)) { - outbuf[i] = ' '; - strcpy(&outbuf[i+1], &outbuf[i+6]); - } - if (!strncasecmp(&outbuf[i], " ", 6)) { - outbuf[i] = ' '; - strcpy(&outbuf[i+1], &outbuf[i+6]); - } - - if (!strncasecmp(&outbuf[i], " ", 6)) { - outbuf[i] = ' '; - strcpy(&outbuf[i+1], &outbuf[i+6]); - } - - if (!strncasecmp(&outbuf[i], " ", 8)) { - outbuf[i] = ' '; - strcpy(&outbuf[i+1], &outbuf[i+8]); - } + // IMG tag on sixel terminals -- try to display the image + if ( (!strcasecmp(tag, "img")) && sixel) { + char *q1, *q2; + + // look for src attribute + char *src = bmstrcasestr(tag_start, "src="); + q1 = q2 = NULL; + if (src && src"); + } + StrBufAppendBufPlain(out, nl, -1, 0); + linelen = 0; + } - else if (!strncasecmp(&outbuf[i], "é", 8)) { - outbuf[i] = 'e'; - strcpy(&outbuf[i+1], &outbuf[i+8]); - } + else if (!strcasecmp(tag, "/BLOCKQUOTE")) { + StrBufAppendBufPlain(out, HKEY("\n"), 0); + --blockquote; + if ((blockquote == 0) && (ansi)) { + StrBufAppendBufPlain(out, HKEY("\033[22m\033[22m"), 0); + } + strcpy(nl, "\n"); + for (j = 0; j < blockquote; ++j) { + strcat(nl, ">"); + } + StrBufAppendBufPlain(out, nl, -1, 0); + linelen = 0; + } - else if (!strncmp(&outbuf[i], "À", 8)) { - outbuf[i] = 'A'; - strcpy(&outbuf[i+1], &outbuf[i+8]); - } + else if (!strcasecmp(tag, "STYLE")) { + ++styletag; + } - else if (!strncasecmp(&outbuf[i], "à", 8)) { - outbuf[i] = 'a'; - strcpy(&outbuf[i+1], &outbuf[i+8]); - } + else if (!strcasecmp(tag, "/STYLE")) { + --styletag; + } + } - else if (!strncasecmp(&outbuf[i], "“", 7)) { - outbuf[i] = '\"'; - strcpy(&outbuf[i+1], &outbuf[i+7]); } - else if (!strncasecmp(&outbuf[i], "”", 7)) { - outbuf[i] = '\"'; - strcpy(&outbuf[i+1], &outbuf[i+7]); + // copy non-tag text to the output buffer + else if ((!tag_nesting_level) && (styletag == 0)) { + StrBufAppendBufPlain(out, &ch, 1, 0); + ++linelen; } - else if (!strncasecmp(&outbuf[i], "´", 7)) { - outbuf[i] = '\''; - strcpy(&outbuf[i+1], &outbuf[i+7]); - } + // Handle numeric entities + if (ch == ';') { - /* two-digit decimal equivalents */ - else if (outbuf[i] == '&' && - outbuf[i + 1] == '#' && - isdigit(outbuf[i + 2]) && - isdigit(outbuf[i + 3]) && - (outbuf[i+4] == ';') ) - { - scanch = 0; - sscanf(&outbuf[i+2], "%02d", &scanch); - outbuf[i] = scanch; - strcpy(&outbuf[i+1], &outbuf[i+5]); - } + u_int32_t scanch = 0; + int elen = 0; - /* three-digit decimal equivalents */ - else if (outbuf[i] == '&' && - outbuf[i + 1] == '#' && - isdigit(outbuf[i + 2]) && - isdigit(outbuf[i + 3]) && - isdigit(outbuf[i + 4]) && - (outbuf[i + 5] == ';') ) - { - scanch = 0; - sscanf(&outbuf[i+2], "%03d", &scanch); - outbuf[i] = scanch; - strcpy(&outbuf[i+1], &outbuf[i+6]); - } + if ( (linelen >= 5) && (*(inptr-4) == '&') && (*(inptr-3) == '#') ) { + sscanf(inptr-2, "%02d", &scanch); + elen = 5; + } + else if ( (linelen >= 6) && (*(inptr-5) == '&') && (*(inptr-4) == '#') ) { + sscanf(inptr-3, "%03d", &scanch); + elen = 6; + } + else if ( (linelen >= 7) && (*(inptr-6) == '&') && (*(inptr-5) == '#') ) { + sscanf(inptr-3, "%04d", &scanch); + elen = 7; + } + else if ( (linelen >= 8) && (*(inptr-7) == '&') && (*(inptr-6) == '#') ) { + sscanf(inptr-4, "%05d", &scanch); + elen = 8; + } - /* four-digit decimal equivalents */ - else if (outbuf[i] == '&' && - outbuf[i + 1] == '#' && - isdigit(outbuf[i + 2]) && - isdigit(outbuf[i + 3]) && - isdigit(outbuf[i + 4]) && - isdigit(outbuf[i + 5]) && - (outbuf[i + 6] == ';') ) - { - scanch = 0; - sscanf(&outbuf[i+2], "%04d", &scanch); - outbuf[i] = scanch; - strcpy(&outbuf[i+1], &outbuf[i+6]); - } + if (scanch) { + StrBufCutRight(out, elen); + linelen -= elen; - } + char utf[5]; + int ulen = u8_wc_toutf8(utf, scanch); + utf[ulen] = 0; + StrBufAppendBufPlain(out, utf, ulen, 0); + linelen += elen; + } - /* Make sure the output buffer is big enough */ - if ((output_len + strlen(outbuf) + SIZ) > outptr_buffer_size) { - outptr_buffer_size += SIZ; - outptr = realloc(outptr, outptr_buffer_size); - if (outptr == NULL) { - abort(); } - } - /* Output any lines terminated with hard line breaks */ - do { - did_out = 0; - if (strlen(outbuf) > 0) { - for (i = 0; i (screenwidth - 2 )) { - rb = (-1); - for (i=0; i<(screenwidth-2); ++i) { - if (outbuf[i]==32) rb = i; - } - if (rb>=0) { - strncpy(&outptr[output_len], outbuf, rb); - output_len += rb; - strcpy(&outptr[output_len], nl); - output_len += strlen(nl); - if (do_citaformat) { - strcpy(&outptr[output_len], " "); - ++output_len; - } - strcpy(outbuf, &outbuf[rb+1]); - } else { - strncpy(&outptr[output_len], outbuf, - screenwidth-2); - output_len += (screenwidth-2); - strcpy(&outptr[output_len], nl); - output_len += strlen(nl); - if (do_citaformat) { - strcpy(&outptr[output_len], " "); - ++output_len; + // Add soft line breaks when necessary + if (linelen > (screenwidth - 8)) { + char *ptr = (char *)ChrPtr(out) + StrLength(out) - linelen; + char *rightmost_space = strrchr(ptr, ' '); + if (rightmost_space && rightmost_space > ptr) { + int space_pos = rightmost_space - ChrPtr(out); + StrBufReplaceToken(out, (long)space_pos, 1, nl, strlen(nl)); + linelen = strlen(rightmost_space) - 1; } - strcpy(outbuf, &outbuf[screenwidth-2]); } - } - - } while (done_reading == 0); - strcpy(&outptr[output_len], outbuf); - output_len += strlen(outbuf); - - /* Strip leading/trailing whitespace. We can't do this with - * striplt() because it uses too many strlen()'s - */ + // Advance to the next byte of input. + inptr++; + } + } + free(inbuf); + + // Convert entity tags to printable characters + StrBufReplaceAllOccurrences(out, " ", " "); + StrBufReplaceAllOccurrences(out, " ", " "); + StrBufReplaceAllOccurrences(out, " ", " "); + StrBufReplaceAllOccurrences(out, " ", " "); + StrBufReplaceAllOccurrences(out, "<", "<"); + StrBufReplaceAllOccurrences(out, ">", ">"); + StrBufReplaceAllOccurrences(out, "&", "&"); + StrBufReplaceAllOccurrences(out, """, "\""); + StrBufReplaceAllOccurrences(out, "‘", "`"); + StrBufReplaceAllOccurrences(out, "’", "'"); + StrBufReplaceAllOccurrences(out, "•", " * "); + StrBufReplaceAllOccurrences(out, "…", "…"); + StrBufReplaceAllOccurrences(out, "©", "©"); + StrBufReplaceAllOccurrences(out, "™", "™"); + StrBufReplaceAllOccurrences(out, "®", "®"); + StrBufReplaceAllOccurrences(out, "¼", "¼"); + StrBufReplaceAllOccurrences(out, "½", "½"); + StrBufReplaceAllOccurrences(out, "¾", "¾"); + StrBufReplaceAllOccurrences(out, "–", "–"); + StrBufReplaceAllOccurrences(out, "—", "—"); + StrBufReplaceAllOccurrences(out, "Ç", "Ç"); + StrBufReplaceAllOccurrences(out, "ç", "ç"); + StrBufReplaceAllOccurrences(out, "È", "È"); + StrBufReplaceAllOccurrences(out, "è", "è"); + StrBufReplaceAllOccurrences(out, "Ê", "Ê"); + StrBufReplaceAllOccurrences(out, "ê", "ê"); + StrBufReplaceAllOccurrences(out, "É", "É"); + StrBufReplaceAllOccurrences(out, "é", "é"); + StrBufReplaceAllOccurrences(out, "À", "À"); + StrBufReplaceAllOccurrences(out, "à", "à"); + StrBufReplaceAllOccurrences(out, "“", "\""); + StrBufReplaceAllOccurrences(out, "”", "\""); + StrBufReplaceAllOccurrences(out, "´", "'"); + StrBufReplaceAllOccurrences(out, "’", "'"); + StrBufReplaceAllOccurrences(out, "–", "-"); + + // Convert from a StrBuf to a plain C string + int output_len = StrLength(out); + outptr = SmashStrBuf(&out); + + // Strip leading whitespace while ((output_len > 0) && (isspace(outptr[0]))) { strcpy(outptr, &outptr[1]); --output_len; } - while ((output_len > 0) && (isspace(outptr[output_len-1]))) { - outptr[output_len-1] = 0; + + // Strip trailing whitespace + while ((output_len > 0) && (isspace(outptr[output_len - 1]))) { + outptr[output_len - 1] = 0; --output_len; } - if ((output_len > 0) && (outptr[output_len-1] != '\n')) { + // Make sure the final line ends with a newline character. + if ((output_len > 0) && (outptr[output_len - 1] != '\n')) { strcat(outptr, "\n"); ++output_len; } return outptr; - }