COMMON: Add string encoding API with more detailed error behavior.

This commit is contained in:
elasota 2022-06-18 21:57:23 -04:00 committed by Eugene Sandulenko
parent 52cee62a64
commit 2cad62a6ec
4 changed files with 104 additions and 54 deletions

View file

@ -388,7 +388,9 @@ void U32String::decodeJohab(const char *src, uint32 len) {
} }
void String::encodeWindows932(const U32String &src) { StringEncodingResult String::encodeWindows932(const U32String &src, char errorChar) {
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
ensureCapacity(src.size() * 2, false); ensureCapacity(src.size() * 2, false);
if (!cjk_tables_loaded) if (!cjk_tables_loaded)
@ -432,12 +434,14 @@ void String::encodeWindows932(const U32String &src) {
} }
if (point > 0x10000) { if (point > 0x10000) {
operator+=('?'); operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue; continue;
} }
if (!windows932ReverseConversionTable) { if (!windows932ReverseConversionTable) {
operator+=('?'); operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue; continue;
} }
@ -450,12 +454,17 @@ void String::encodeWindows932(const U32String &src) {
// This codepage contains cyrillic, so no need to transliterate // This codepage contains cyrillic, so no need to transliterate
operator+=('?'); operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue; continue;
} }
return encodingResult;
} }
void String::encodeWindows949(const U32String &src) { StringEncodingResult String::encodeWindows949(const U32String &src, char errorChar) {
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
ensureCapacity(src.size() * 2, false); ensureCapacity(src.size() * 2, false);
if (!cjk_tables_loaded) if (!cjk_tables_loaded)
@ -493,20 +502,24 @@ void String::encodeWindows949(const U32String &src) {
} }
if (point > 0x10000 || !windows949ReverseConversionTable) { if (point > 0x10000 || !windows949ReverseConversionTable) {
operator+=('?'); operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue; continue;
} }
uint16 rev = windows949ReverseConversionTable[point]; uint16 rev = windows949ReverseConversionTable[point];
if (rev == 0) { if (rev == 0) {
// This codepage contains cyrillic, so no need to transliterate // This codepage contains cyrillic, so no need to transliterate
operator+=('?'); operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue; continue;
} }
operator+=(rev >> 8); operator+=(rev >> 8);
operator+=(rev & 0xff); operator+=(rev & 0xff);
} }
return encodingResult;
} }
static const char g_cyrillicTransliterationTable[] = { static const char g_cyrillicTransliterationTable[] = {
@ -518,31 +531,34 @@ static const char g_cyrillicTransliterationTable[] = {
'e', 'e', 'd', 'g', 'e', 'z', 'i', 'i', 'j', 'l', 'n', 'c', 'k', 'i', 'u', 'd', 'e', 'e', 'd', 'g', 'e', 'z', 'i', 'i', 'j', 'l', 'n', 'c', 'k', 'i', 'u', 'd',
}; };
// Appends a one-byte stand-in for the code point `point` to this string.
// Each line below shows the pre-commit text followed by the post-commit text.
//
// Handled inputs: 0xa0 -> ' ', 0xad -> '-', 0x2116 -> 'N', and 0x401..0x45f via
// g_cyrillicTransliterationTable[point - 0x400].
//
// Post-commit, every handled case returns kStringEncodingResultSucceeded; any
// other code point appends errorChar (previously a hard-coded '?') and returns
// kStringEncodingResultHasErrors.
void String::translitChar(U32String::value_type point) { StringEncodingResult String::translitChar(U32String::value_type point, char errorChar) {
if (point == 0xa0) { if (point == 0xa0) {
operator+=(' '); operator+=(' ');
return; return kStringEncodingResultSucceeded;
} }
if (point == 0xad) { if (point == 0xad) {
operator+=('-'); operator+=('-');
return; return kStringEncodingResultSucceeded;
} }
if (point == 0x2116) { if (point == 0x2116) {
operator+=('N'); operator+=('N');
return; return kStringEncodingResultSucceeded;
} }
// Table lookup is offset by 0x400, so index 1 corresponds to code point 0x401.
if (point >= 0x401 && point <= 0x45f) { if (point >= 0x401 && point <= 0x45f) {
operator+=(g_cyrillicTransliterationTable[point - 0x400]); operator+=(g_cyrillicTransliterationTable[point - 0x400]);
return; return kStringEncodingResultSucceeded;
} }
// No transliteration available: emit the caller-chosen replacement and report it.
operator+=('?'); operator+=(errorChar);
return kStringEncodingResultHasErrors;
} }
void String::encodeWindows950(const U32String &src, bool transliterate) { StringEncodingResult String::encodeWindows950(const U32String &src, bool transliterate, char errorChar) {
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
ensureCapacity(src.size() * 2, false); ensureCapacity(src.size() * 2, false);
if (!cjk_tables_loaded) if (!cjk_tables_loaded)
@ -578,7 +594,8 @@ void String::encodeWindows950(const U32String &src, bool transliterate) {
} }
if (point > 0x10000) { if (point > 0x10000) {
operator+=('?'); operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue; continue;
} }
@ -589,7 +606,8 @@ void String::encodeWindows950(const U32String &src, bool transliterate) {
} }
if (!windows950ReverseConversionTable) { if (!windows950ReverseConversionTable) {
operator+=('?'); operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue; continue;
} }
@ -629,16 +647,23 @@ void String::encodeWindows950(const U32String &src, bool transliterate) {
} }
if (transliterate) { if (transliterate) {
translitChar(point); StringEncodingResult translitResult = translitChar(point, errorChar);
if (translitResult != kStringEncodingResultSucceeded)
encodingResult = translitResult;
continue; continue;
} }
operator+=('?'); operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue; continue;
} }
return encodingResult;
} }
void String::encodeJohab(const U32String &src) { StringEncodingResult String::encodeJohab(const U32String &src, char errorChar) {
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
ensureCapacity(src.size() * 2, false); ensureCapacity(src.size() * 2, false);
if (!cjk_tables_loaded) if (!cjk_tables_loaded)
@ -671,19 +696,23 @@ void String::encodeJohab(const U32String &src) {
} }
if (point > 0x10000 || !johabReverseConversionTable) { if (point > 0x10000 || !johabReverseConversionTable) {
operator+=('?'); operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue; continue;
} }
uint16 rev = johabReverseConversionTable[point]; uint16 rev = johabReverseConversionTable[point];
if (rev == 0) { if (rev == 0) {
operator+=('?'); operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
continue; continue;
} }
operator+=(rev >> 8); operator+=(rev >> 8);
operator+=(rev & 0xff); operator+=(rev & 0xff);
} }
return encodingResult;
} }
// //TODO: This is a quick and dirty converter. Refactoring needed: // //TODO: This is a quick and dirty converter. Refactoring needed:
@ -693,7 +722,7 @@ void String::encodeJohab(const U32String &src) {
// character does not fit in 4 bytes & does not inform caller on any errors // character does not fit in 4 bytes & does not inform caller on any errors
// //
// More comprehensive one lives in wintermute/utils/convert_utf.cpp // More comprehensive one lives in wintermute/utils/convert_utf.cpp
void String::encodeUTF8(const U32String &src) { StringEncodingResult String::encodeUTF8(const U32String &src, char errorChar) {
ensureCapacity(src.size(), false); ensureCapacity(src.size(), false);
static const uint8 firstByteMark[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 }; static const uint8 firstByteMark[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
char writingBytes[5] = {0x00, 0x00, 0x00, 0x00, 0x00}; char writingBytes[5] = {0x00, 0x00, 0x00, 0x00, 0x00};
@ -742,6 +771,8 @@ void String::encodeUTF8(const U32String &src) {
operator+=(pBytes); operator+=(pBytes);
} }
return kStringEncodingResultSucceeded;
} }
#define decodeUTF16Template(suffix, read) \ #define decodeUTF16Template(suffix, read) \
@ -916,7 +947,9 @@ void U32String::decodeOneByte(const char *src, uint32 len, CodePage page) {
} }
} }
void String::encodeOneByte(const U32String &src, CodePage page, bool transliterate) { StringEncodingResult String::encodeOneByte(const U32String &src, CodePage page, bool transliterate, char errorChar) {
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
const ReverseTablePrefixTreeLevel1 *conversionTable = const ReverseTablePrefixTreeLevel1 *conversionTable =
getReverseConversionTable(page); getReverseConversionTable(page);
@ -931,11 +964,15 @@ void String::encodeOneByte(const U32String &src, CodePage page, bool translitera
} }
if (transliterate) { if (transliterate) {
translitChar(c); StringEncodingResult translitResult = translitChar(c, errorChar);
} else if (translitResult != kStringEncodingResultSucceeded)
operator+=('?'); encodingResult = translitResult;
} else {
operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
}
} }
return; return encodingResult;
} }
for (uint i = 0; i < src.size(); ++i) { for (uint i = 0; i < src.size(); ++i) {
@ -955,32 +992,32 @@ void String::encodeOneByte(const U32String &src, CodePage page, bool translitera
} }
if (transliterate) { if (transliterate) {
translitChar(c); StringEncodingResult translitResult = translitChar(c, errorChar);
} else if (translitResult != kStringEncodingResultSucceeded)
operator+=('?'); encodingResult = translitResult;
} else {
operator+=(errorChar);
encodingResult = kStringEncodingResultHasErrors;
}
} }
return encodingResult;
} }
// Dispatches encoding of `src` to the encoder matching `page`.
// Each line below shows the pre-commit text followed by the post-commit text.
//
// Post-commit, every case returns the selected encoder's StringEncodingResult
// directly and forwards errorChar, so the old `break` statements disappear.
// The kWindows950 and default (one-byte) cases now pass `true` for the
// transliterate argument explicitly instead of relying on a default argument.
void String::encodeInternal(const U32String &src, CodePage page) { StringEncodingResult String::encodeInternal(const U32String &src, CodePage page, char errorChar) {
switch(page) { switch(page) {
case kUtf8: case kUtf8:
encodeUTF8(src); return encodeUTF8(src, errorChar);
break;
case kWindows932: case kWindows932:
encodeWindows932(src); return encodeWindows932(src, errorChar);
break;
case kWindows949: case kWindows949:
encodeWindows949(src); return encodeWindows949(src, errorChar);
break;
case kWindows950: case kWindows950:
encodeWindows950(src); return encodeWindows950(src, true, errorChar);
break;
case kJohab: case kJohab:
encodeJohab(src); return encodeJohab(src, errorChar);
break;
// Every other code page is handled by the one-byte encoder.
default: default:
encodeOneByte(src, page); return encodeOneByte(src, page, true, errorChar);
break;
} }
} }
@ -1040,14 +1077,18 @@ U32String String::decode(CodePage page) const {
} }
// Each line below shows the pre-commit text followed by the post-commit text.
//
// Post-commit, encode(page) becomes a thin wrapper: it encodes into a local
// String with '?' as the replacement character, deliberately discards the
// StringEncodingResult, and returns the String.
String U32String::encode(CodePage page) const { String U32String::encode(CodePage page) const {
String string;
(void)encode(string, page, '?');
return string;
}
// New overload: checks `page`, then lets outString encode this U32String and
// returns the result reported by String::encodeInternal.
// NOTE(review): what happens to any existing contents of outString depends on
// the encoders' ensureCapacity(..., false) calls -- confirm before reusing a
// non-empty String.
StringEncodingResult U32String::encode(String &outString, CodePage page, char errorChar) const {
if (page == kCodePageInvalid || if (page == kCodePageInvalid ||
page > kLastEncoding) { page > kLastEncoding) {
error("Invalid codepage");
} }
String string; return outString.encodeInternal(*this, page, errorChar);
string.encodeInternal(*this, page);
return string;
} }
} // End of namespace Common } // End of namespace Common

View file

@ -57,6 +57,11 @@ enum CodePage {
kLastEncoding = kASCII kLastEncoding = kASCII
}; };
/**
 * Outcome reported by the String encoding routines.
 */
enum StringEncodingResult {
	/** No replacement character had to be written. */
	kStringEncodingResultSucceeded = 0,
	/** At least one character was replaced with the caller's error character. */
	kStringEncodingResultHasErrors = 1
};
U32String convertUtf8ToUtf32(const String &str); U32String convertUtf8ToUtf32(const String &str);
String convertUtf32ToUtf8(const U32String &str); String convertUtf32ToUtf8(const U32String &str);

View file

@ -246,14 +246,14 @@ public:
U32String decode(CodePage page = kUtf8) const; U32String decode(CodePage page = kUtf8) const;
protected: protected:
// Encoder helpers used by U32String (declared a friend below).
// Each line below shows the pre-commit text followed by the post-commit text.
//
// Post-commit, every helper takes the replacement character `errorChar` and
// returns a StringEncodingResult instead of void. The `translit` parameters of
// encodeWindows950 and encodeOneByte lose their `= true` default, so callers
// must pass the flag explicitly.
void encodeUTF8(const U32String &src); StringEncodingResult encodeUTF8(const U32String &src, char errorChar);
void encodeWindows932(const U32String &src); StringEncodingResult encodeWindows932(const U32String &src, char errorChar);
void encodeWindows949(const U32String &src); StringEncodingResult encodeWindows949(const U32String &src, char errorChar);
void encodeWindows950(const U32String &src, bool translit = true); StringEncodingResult encodeWindows950(const U32String &src, bool translit, char errorChar);
void encodeJohab(const U32String &src); StringEncodingResult encodeJohab(const U32String &src, char errorChar);
void encodeOneByte(const U32String &src, CodePage page, bool translit = true); StringEncodingResult encodeOneByte(const U32String &src, CodePage page, bool translit, char errorChar);
void encodeInternal(const U32String &src, CodePage page); StringEncodingResult encodeInternal(const U32String &src, CodePage page, char errorChar);
void translitChar(U32String::value_type point); StringEncodingResult translitChar(U32String::value_type point, char errorChar);
friend class U32String; friend class U32String;
}; };

View file

@ -127,6 +127,10 @@ public:
/** Convert the string to the given @p page encoding and return the result as a new String. */ /** Convert the string to the given @p page encoding and return the result as a new String. */
String encode(CodePage page = kUtf8) const; String encode(CodePage page = kUtf8) const;
/**
 * Convert the string to the given @p page encoding and output it in string @p outString,
 * writing @p errorChar in place of each character that cannot be encoded.
 *
 * @return kStringEncodingResultSucceeded if the encoder reported no replacements,
 *         kStringEncodingResultHasErrors otherwise.
 *
 * NOTE(review): the UTF-8 encoder appears to return kStringEncodingResultSucceeded
 * unconditionally, so errors are not reported for kUtf8 -- confirm.
 */
StringEncodingResult encode(String &outString, CodePage page, char errorChar) const;
/** /**
* Print formatted data into a U32String object. * Print formatted data into a U32String object.
* *