COMMON: Add string encoding API with more detailed error behavior.
This commit is contained in:
parent
52cee62a64
commit
2cad62a6ec
4 changed files with 104 additions and 54 deletions
|
@ -388,7 +388,9 @@ void U32String::decodeJohab(const char *src, uint32 len) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void String::encodeWindows932(const U32String &src) {
|
StringEncodingResult String::encodeWindows932(const U32String &src, char errorChar) {
|
||||||
|
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
|
||||||
|
|
||||||
ensureCapacity(src.size() * 2, false);
|
ensureCapacity(src.size() * 2, false);
|
||||||
|
|
||||||
if (!cjk_tables_loaded)
|
if (!cjk_tables_loaded)
|
||||||
|
@ -432,12 +434,14 @@ void String::encodeWindows932(const U32String &src) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (point > 0x10000) {
|
if (point > 0x10000) {
|
||||||
operator+=('?');
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!windows932ReverseConversionTable) {
|
if (!windows932ReverseConversionTable) {
|
||||||
operator+=('?');
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -450,12 +454,17 @@ void String::encodeWindows932(const U32String &src) {
|
||||||
|
|
||||||
// This codepage contains cyrillic, so no need to transliterate
|
// This codepage contains cyrillic, so no need to transliterate
|
||||||
|
|
||||||
operator+=('?');
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return encodingResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
void String::encodeWindows949(const U32String &src) {
|
StringEncodingResult String::encodeWindows949(const U32String &src, char errorChar) {
|
||||||
|
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
|
||||||
|
|
||||||
ensureCapacity(src.size() * 2, false);
|
ensureCapacity(src.size() * 2, false);
|
||||||
|
|
||||||
if (!cjk_tables_loaded)
|
if (!cjk_tables_loaded)
|
||||||
|
@ -493,20 +502,24 @@ void String::encodeWindows949(const U32String &src) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (point > 0x10000 || !windows949ReverseConversionTable) {
|
if (point > 0x10000 || !windows949ReverseConversionTable) {
|
||||||
operator+=('?');
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint16 rev = windows949ReverseConversionTable[point];
|
uint16 rev = windows949ReverseConversionTable[point];
|
||||||
if (rev == 0) {
|
if (rev == 0) {
|
||||||
// This codepage contains cyrillic, so no need to transliterate
|
// This codepage contains cyrillic, so no need to transliterate
|
||||||
operator+=('?');
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
operator+=(rev >> 8);
|
operator+=(rev >> 8);
|
||||||
operator+=(rev & 0xff);
|
operator+=(rev & 0xff);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return encodingResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const char g_cyrillicTransliterationTable[] = {
|
static const char g_cyrillicTransliterationTable[] = {
|
||||||
|
@ -518,31 +531,34 @@ static const char g_cyrillicTransliterationTable[] = {
|
||||||
'e', 'e', 'd', 'g', 'e', 'z', 'i', 'i', 'j', 'l', 'n', 'c', 'k', 'i', 'u', 'd',
|
'e', 'e', 'd', 'g', 'e', 'z', 'i', 'i', 'j', 'l', 'n', 'c', 'k', 'i', 'u', 'd',
|
||||||
};
|
};
|
||||||
|
|
||||||
void String::translitChar(U32String::value_type point) {
|
StringEncodingResult String::translitChar(U32String::value_type point, char errorChar) {
|
||||||
if (point == 0xa0) {
|
if (point == 0xa0) {
|
||||||
operator+=(' ');
|
operator+=(' ');
|
||||||
return;
|
return kStringEncodingResultSucceeded;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (point == 0xad) {
|
if (point == 0xad) {
|
||||||
operator+=('-');
|
operator+=('-');
|
||||||
return;
|
return kStringEncodingResultSucceeded;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (point == 0x2116) {
|
if (point == 0x2116) {
|
||||||
operator+=('N');
|
operator+=('N');
|
||||||
return;
|
return kStringEncodingResultSucceeded;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (point >= 0x401 && point <= 0x45f) {
|
if (point >= 0x401 && point <= 0x45f) {
|
||||||
operator+=(g_cyrillicTransliterationTable[point - 0x400]);
|
operator+=(g_cyrillicTransliterationTable[point - 0x400]);
|
||||||
return;
|
return kStringEncodingResultSucceeded;
|
||||||
}
|
}
|
||||||
|
|
||||||
operator+=('?');
|
operator+=(errorChar);
|
||||||
|
return kStringEncodingResultHasErrors;
|
||||||
}
|
}
|
||||||
|
|
||||||
void String::encodeWindows950(const U32String &src, bool transliterate) {
|
StringEncodingResult String::encodeWindows950(const U32String &src, bool transliterate, char errorChar) {
|
||||||
|
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
|
||||||
|
|
||||||
ensureCapacity(src.size() * 2, false);
|
ensureCapacity(src.size() * 2, false);
|
||||||
|
|
||||||
if (!cjk_tables_loaded)
|
if (!cjk_tables_loaded)
|
||||||
|
@ -578,7 +594,8 @@ void String::encodeWindows950(const U32String &src, bool transliterate) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (point > 0x10000) {
|
if (point > 0x10000) {
|
||||||
operator+=('?');
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -589,7 +606,8 @@ void String::encodeWindows950(const U32String &src, bool transliterate) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!windows950ReverseConversionTable) {
|
if (!windows950ReverseConversionTable) {
|
||||||
operator+=('?');
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -629,16 +647,23 @@ void String::encodeWindows950(const U32String &src, bool transliterate) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (transliterate) {
|
if (transliterate) {
|
||||||
translitChar(point);
|
StringEncodingResult translitResult = translitChar(point, errorChar);
|
||||||
|
if (translitResult != kStringEncodingResultSucceeded)
|
||||||
|
encodingResult = translitResult;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
operator+=('?');
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return encodingResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
void String::encodeJohab(const U32String &src) {
|
StringEncodingResult String::encodeJohab(const U32String &src, char errorChar) {
|
||||||
|
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
|
||||||
|
|
||||||
ensureCapacity(src.size() * 2, false);
|
ensureCapacity(src.size() * 2, false);
|
||||||
|
|
||||||
if (!cjk_tables_loaded)
|
if (!cjk_tables_loaded)
|
||||||
|
@ -671,19 +696,23 @@ void String::encodeJohab(const U32String &src) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (point > 0x10000 || !johabReverseConversionTable) {
|
if (point > 0x10000 || !johabReverseConversionTable) {
|
||||||
operator+=('?');
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint16 rev = johabReverseConversionTable[point];
|
uint16 rev = johabReverseConversionTable[point];
|
||||||
if (rev == 0) {
|
if (rev == 0) {
|
||||||
operator+=('?');
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
operator+=(rev >> 8);
|
operator+=(rev >> 8);
|
||||||
operator+=(rev & 0xff);
|
operator+=(rev & 0xff);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return encodingResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
// //TODO: This is a quick and dirty converter. Refactoring needed:
|
// //TODO: This is a quick and dirty converter. Refactoring needed:
|
||||||
|
@ -693,7 +722,7 @@ void String::encodeJohab(const U32String &src) {
|
||||||
// character does not fit in 4 bytes & does not inform caller on any errors
|
// character does not fit in 4 bytes & does not inform caller on any errors
|
||||||
//
|
//
|
||||||
// More comprehensive one lives in wintermute/utils/convert_utf.cpp
|
// More comprehensive one lives in wintermute/utils/convert_utf.cpp
|
||||||
void String::encodeUTF8(const U32String &src) {
|
StringEncodingResult String::encodeUTF8(const U32String &src, char errorChar) {
|
||||||
ensureCapacity(src.size(), false);
|
ensureCapacity(src.size(), false);
|
||||||
static const uint8 firstByteMark[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
|
static const uint8 firstByteMark[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
|
||||||
char writingBytes[5] = {0x00, 0x00, 0x00, 0x00, 0x00};
|
char writingBytes[5] = {0x00, 0x00, 0x00, 0x00, 0x00};
|
||||||
|
@ -742,6 +771,8 @@ void String::encodeUTF8(const U32String &src) {
|
||||||
|
|
||||||
operator+=(pBytes);
|
operator+=(pBytes);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return kStringEncodingResultSucceeded;
|
||||||
}
|
}
|
||||||
|
|
||||||
#define decodeUTF16Template(suffix, read) \
|
#define decodeUTF16Template(suffix, read) \
|
||||||
|
@ -916,7 +947,9 @@ void U32String::decodeOneByte(const char *src, uint32 len, CodePage page) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void String::encodeOneByte(const U32String &src, CodePage page, bool transliterate) {
|
StringEncodingResult String::encodeOneByte(const U32String &src, CodePage page, bool transliterate, char errorChar) {
|
||||||
|
StringEncodingResult encodingResult = kStringEncodingResultSucceeded;
|
||||||
|
|
||||||
const ReverseTablePrefixTreeLevel1 *conversionTable =
|
const ReverseTablePrefixTreeLevel1 *conversionTable =
|
||||||
getReverseConversionTable(page);
|
getReverseConversionTable(page);
|
||||||
|
|
||||||
|
@ -931,11 +964,15 @@ void String::encodeOneByte(const U32String &src, CodePage page, bool translitera
|
||||||
}
|
}
|
||||||
|
|
||||||
if (transliterate) {
|
if (transliterate) {
|
||||||
translitChar(c);
|
StringEncodingResult translitResult = translitChar(c, errorChar);
|
||||||
} else
|
if (translitResult != kStringEncodingResultSucceeded)
|
||||||
operator+=('?');
|
encodingResult = translitResult;
|
||||||
|
} else {
|
||||||
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return;
|
return encodingResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (uint i = 0; i < src.size(); ++i) {
|
for (uint i = 0; i < src.size(); ++i) {
|
||||||
|
@ -955,32 +992,32 @@ void String::encodeOneByte(const U32String &src, CodePage page, bool translitera
|
||||||
}
|
}
|
||||||
|
|
||||||
if (transliterate) {
|
if (transliterate) {
|
||||||
translitChar(c);
|
StringEncodingResult translitResult = translitChar(c, errorChar);
|
||||||
} else
|
if (translitResult != kStringEncodingResultSucceeded)
|
||||||
operator+=('?');
|
encodingResult = translitResult;
|
||||||
|
} else {
|
||||||
|
operator+=(errorChar);
|
||||||
|
encodingResult = kStringEncodingResultHasErrors;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return encodingResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
void String::encodeInternal(const U32String &src, CodePage page) {
|
StringEncodingResult String::encodeInternal(const U32String &src, CodePage page, char errorChar) {
|
||||||
switch(page) {
|
switch(page) {
|
||||||
case kUtf8:
|
case kUtf8:
|
||||||
encodeUTF8(src);
|
return encodeUTF8(src, errorChar);
|
||||||
break;
|
|
||||||
case kWindows932:
|
case kWindows932:
|
||||||
encodeWindows932(src);
|
return encodeWindows932(src, errorChar);
|
||||||
break;
|
|
||||||
case kWindows949:
|
case kWindows949:
|
||||||
encodeWindows949(src);
|
return encodeWindows949(src, errorChar);
|
||||||
break;
|
|
||||||
case kWindows950:
|
case kWindows950:
|
||||||
encodeWindows950(src);
|
return encodeWindows950(src, true, errorChar);
|
||||||
break;
|
|
||||||
case kJohab:
|
case kJohab:
|
||||||
encodeJohab(src);
|
return encodeJohab(src, errorChar);
|
||||||
break;
|
|
||||||
default:
|
default:
|
||||||
encodeOneByte(src, page);
|
return encodeOneByte(src, page, true, errorChar);
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1040,14 +1077,18 @@ U32String String::decode(CodePage page) const {
|
||||||
}
|
}
|
||||||
|
|
||||||
String U32String::encode(CodePage page) const {
|
String U32String::encode(CodePage page) const {
|
||||||
|
String string;
|
||||||
|
(void)encode(string, page, '?');
|
||||||
|
return string;
|
||||||
|
}
|
||||||
|
|
||||||
|
StringEncodingResult U32String::encode(String &outString, CodePage page, char errorChar) const {
|
||||||
if (page == kCodePageInvalid ||
|
if (page == kCodePageInvalid ||
|
||||||
page > kLastEncoding) {
|
page > kLastEncoding) {
|
||||||
error("Invalid codepage");
|
error("Invalid codepage");
|
||||||
}
|
}
|
||||||
|
|
||||||
String string;
|
return outString.encodeInternal(*this, page, errorChar);
|
||||||
string.encodeInternal(*this, page);
|
|
||||||
return string;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
} // End of namespace Common
|
} // End of namespace Common
|
||||||
|
|
|
@ -57,6 +57,11 @@ enum CodePage {
|
||||||
kLastEncoding = kASCII
|
kLastEncoding = kASCII
|
||||||
};
|
};
|
||||||
|
|
||||||
|
enum StringEncodingResult {
|
||||||
|
kStringEncodingResultSucceeded,
|
||||||
|
kStringEncodingResultHasErrors,
|
||||||
|
};
|
||||||
|
|
||||||
U32String convertUtf8ToUtf32(const String &str);
|
U32String convertUtf8ToUtf32(const String &str);
|
||||||
String convertUtf32ToUtf8(const U32String &str);
|
String convertUtf32ToUtf8(const U32String &str);
|
||||||
|
|
||||||
|
|
16
common/str.h
16
common/str.h
|
@ -246,14 +246,14 @@ public:
|
||||||
U32String decode(CodePage page = kUtf8) const;
|
U32String decode(CodePage page = kUtf8) const;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
void encodeUTF8(const U32String &src);
|
StringEncodingResult encodeUTF8(const U32String &src, char errorChar);
|
||||||
void encodeWindows932(const U32String &src);
|
StringEncodingResult encodeWindows932(const U32String &src, char errorChar);
|
||||||
void encodeWindows949(const U32String &src);
|
StringEncodingResult encodeWindows949(const U32String &src, char errorChar);
|
||||||
void encodeWindows950(const U32String &src, bool translit = true);
|
StringEncodingResult encodeWindows950(const U32String &src, bool translit, char errorChar);
|
||||||
void encodeJohab(const U32String &src);
|
StringEncodingResult encodeJohab(const U32String &src, char errorChar);
|
||||||
void encodeOneByte(const U32String &src, CodePage page, bool translit = true);
|
StringEncodingResult encodeOneByte(const U32String &src, CodePage page, bool translit, char errorChar);
|
||||||
void encodeInternal(const U32String &src, CodePage page);
|
StringEncodingResult encodeInternal(const U32String &src, CodePage page, char errorChar);
|
||||||
void translitChar(U32String::value_type point);
|
StringEncodingResult translitChar(U32String::value_type point, char errorChar);
|
||||||
|
|
||||||
friend class U32String;
|
friend class U32String;
|
||||||
};
|
};
|
||||||
|
|
|
@ -127,6 +127,10 @@ public:
|
||||||
/** Convert the string to the given @p page encoding and return the result as a new String. */
|
/** Convert the string to the given @p page encoding and return the result as a new String. */
|
||||||
String encode(CodePage page = kUtf8) const;
|
String encode(CodePage page = kUtf8) const;
|
||||||
|
|
||||||
|
/** Convert the string to the given @p page encoding and output in string @p outString,
|
||||||
|
replacing invalid characters with @p errorChar. */
|
||||||
|
StringEncodingResult encode(String &outString, CodePage page, char errorChar) const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Print formatted data into a U32String object.
|
* Print formatted data into a U32String object.
|
||||||
*
|
*
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue