From 2da33e9a303d108d625aecdb627a96b5b14dd2cd Mon Sep 17 00:00:00 2001 From: Wilfried Goesgens Date: Thu, 18 Aug 2011 17:53:18 +0000 Subject: [PATCH] Fix StrHtmlEcmaEscAppend() more edge cases in utf8 handling - Ctdl_GetUtf8SequenceLength(): return 0 for broken, 1 for non-utf8 sequences - StrHtmlEcmaEscAppend(): another attempt to fix json appending: - use switch instead of Rumpelstilskin if sequence - use HKEY for our static sequences - don't check for buffer expansion again while utf8-handling; we have 11 bytes reserved, utf8 will at most become 6 bytes - Simply handle ASCII chars in the utf8 loop - since aptr is moved implicitely, we mustn't move it for the last byte - add missing handling for some escapers - let unicode escapers pass Conflicts: libcitadel/lib/stringbuf.c --- libcitadel/lib/stringbuf.c | 150 ++++++++++++++++++++++++------------- 1 file changed, 98 insertions(+), 52 deletions(-) diff --git a/libcitadel/lib/stringbuf.c b/libcitadel/lib/stringbuf.c index 1914190bb..18d9870b5 100644 --- a/libcitadel/lib/stringbuf.c +++ b/libcitadel/lib/stringbuf.c @@ -2222,83 +2222,126 @@ long StrHtmlEcmaEscAppend(StrBuf *Target, const StrBuf *Source, const char *Plai eptr = Target->buf + Target->BufSize - 11; /* our biggest unit to put in... */ bptr = Target->buf + Target->BufUsed; } - if (*aptr == '<') { - memcpy(bptr, "<", 4); + switch (*aptr) { + case '<': + memcpy(bptr, HKEY("<")); bptr += 4; Target->BufUsed += 4; - } - else if (*aptr == '>') { - memcpy(bptr, ">", 4); + break; + case '>': + memcpy(bptr, HKEY(">")); bptr += 4; Target->BufUsed += 4; - } - else if (*aptr == '&') { - memcpy(bptr, "&", 5); + break; + case '&': + memcpy(bptr, HKEY("&")); bptr += 5; Target->BufUsed += 5; - } - else if (*aptr == LB) { + break; + case LB: *bptr = '<'; bptr ++; Target->BufUsed ++; - } - else if (*aptr == RB) { + break; + case RB: *bptr = '>'; bptr ++; Target->BufUsed ++; - } - else if ((*aptr == 32) && (nbsp == 1)) { - memcpy(bptr, " ", 6); + break; + case 32: +//) && (nbsp == 1)) { + memcpy(bptr, HKEY(" ")); bptr += 6; Target->BufUsed += 6; - } - else if ((*aptr == '\n') && (nolinebreaks == 1)) { - *bptr='\0'; /* nothing */ - } - else if ((*aptr == '\n') && (nolinebreaks == 2)) { - memcpy(bptr, "<br/>", 11); - bptr += 11; - Target->BufUsed += 11; - } - - else if ((*aptr == '\r') && (nolinebreaks != 0)) { - *bptr='\0'; /* nothing */ - } - - else if ((*aptr == '"') || (*aptr == QU)) { + break; + case '\n': + switch (nolinebreaks) { + case 1: + *bptr='\0'; /* nothing */ + break; + case 2: + memcpy(bptr, HKEY("<br/>")); + bptr += 11; + Target->BufUsed += 11; + break; + default: + memcpy(bptr, HKEY("\\n")); + bptr += 2; + Target->BufUsed += 2; + } + break; + case '\r': + switch (nolinebreaks) { + case 1: + case 2: + *bptr='\0'; /* nothing */ + break; + default: + memcpy(bptr, HKEY("\\r")); + bptr += 2; + Target->BufUsed += 2; + break; + } + break; + case '"': + case QU: *bptr = '\\'; bptr ++; *bptr = '"'; bptr ++; Target->BufUsed += 2; - } else if (*aptr == '\\') { + break; + case '\\': + if ((*(aptr + 1) == 'u') && + isxdigit(*(aptr + 2)) && + isxdigit(*(aptr + 3)) && + isxdigit(*(aptr + 4)) && + isxdigit(*(aptr + 5))) + { /* oh, a unicode escaper. let it pass through. */ + memcpy(bptr, aptr, 6); + aptr += 5; + bptr +=6; + Target->BufUsed += 6; + } + else + { + *bptr = '\\'; + bptr ++; + *bptr = '\\'; + bptr ++; + Target->BufUsed += 2; + } + break; + case '\b': *bptr = '\\'; bptr ++; + *bptr = 'b'; + bptr ++; + Target->BufUsed += 2; + break; + case '\f': *bptr = '\\'; bptr ++; + *bptr = 'f'; + bptr ++; Target->BufUsed += 2; - } - else { - if (((unsigned char)*aptr) >= 0x20) - { - IsUtf8Sequence = Ctdl_GetUtf8SequenceLength(aptr, eiptr); - + break; + case '\t': + *bptr = '\\'; + bptr ++; + *bptr = 't'; + bptr ++; + Target->BufUsed += 2; + break; + default: + IsUtf8Sequence = Ctdl_GetUtf8SequenceLength(aptr, eiptr); + while (IsUtf8Sequence > 0){ *bptr = *aptr; Target->BufUsed ++; - while (IsUtf8Sequence > 1){ - if(bptr + IsUtf8Sequence >= eptr) { - IncreaseBuf(Target, 1, -1); - eptr = Target->buf + Target->BufSize - 11; /* our biggest unit to put in... */ - bptr = Target->buf + Target->BufUsed - 1; - } - bptr++; aptr++; - IsUtf8Sequence --; - *bptr = *aptr; - Target->BufUsed ++; - } + if (--IsUtf8Sequence) + aptr++; bptr++; } - } aptr ++; } @@ -3212,19 +3255,22 @@ void StrBuf_RFC822_2_Utf8(StrBuf *Target, * @ingroup StrBuf * @brief evaluate the length of an utf8 special character sequence * @param Char the character to examine - * @returns width of utf8 chars in bytes + * @returns width of utf8 chars in bytes; if the sequence is broken 0 is returned; 1 if its simply ASCII. */ static inline int Ctdl_GetUtf8SequenceLength(const char *CharS, const char *CharE) { int n = 1; char test = (1<<7); - + + if ((*CharS & 0xC0) == 0) + return 1; + while ((n < 8) && ((test & *CharS) != 0)) { test = test << 1; n ++; } if ((n > 6) || ((CharE - CharS) < n)) - n = 1; + n = 0; return n; } -- 2.30.2