2013-07-20 21:46:18 -07:00
|
|
|
#pragma once
|
|
|
|
|
2020-09-29 12:44:47 +02:00
|
|
|
#include <cstdint>
|
2020-09-29 12:53:18 +02:00
|
|
|
|
2020-09-30 00:06:51 +02:00
|
|
|
#include "Common/BitSet.h"
|
2013-07-20 21:46:18 -07:00
|
|
|
|
|
|
|
// Host byte-order probe used by UTF16_Swap.
// Stores 0x0100 in a uint16_t and looks at its first byte in memory through an unsigned char
// pointer (an access the aliasing rules allow, with no alignment requirement). The first byte
// is 0x00 when the low byte is stored first, i.e. on a little endian host.
// Should optimize out to a constant.
inline bool UTF16_HostIsLittleEndian() {
	const uint16_t probe = 0x0100;
	return *reinterpret_cast<const unsigned char *>(&probe) == 0;
}

// Evaluates to true on a little endian host, false on a big endian one.
#define UTF16_IS_LITTLE_ENDIAN (UTF16_HostIsLittleEndian())
|
|
|
|
|
|
|
|
template <bool is_little>
|
|
|
|
uint16_t UTF16_Swap(uint16_t u) {
|
|
|
|
if (is_little) {
|
|
|
|
return UTF16_IS_LITTLE_ENDIAN ? u : swap16(u);
|
|
|
|
} else {
|
|
|
|
return UTF16_IS_LITTLE_ENDIAN ? swap16(u) : u;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
template <bool is_little>
|
|
|
|
struct UTF16_Type {
|
|
|
|
public:
|
2020-03-22 19:29:25 +01:00
|
|
|
static const char32_t INVALID = (char32_t)-1;
|
2013-07-20 21:46:18 -07:00
|
|
|
|
2020-03-22 19:29:25 +01:00
|
|
|
UTF16_Type(const char16_t *c) : c_(c), index_(0) {}
|
2013-07-20 21:46:18 -07:00
|
|
|
|
2020-03-22 19:29:25 +01:00
|
|
|
char32_t next() {
|
|
|
|
const char32_t u = UTF16_Swap<is_little>(c_[index_++]);
|
2013-07-20 21:46:18 -07:00
|
|
|
|
|
|
|
// Surrogate pair. UTF-16 is so simple. We assume it's valid.
|
2014-01-21 08:03:57 -08:00
|
|
|
if ((u & 0xF800) == 0xD800) {
|
2013-07-20 21:46:18 -07:00
|
|
|
return 0x10000 + (((u & 0x3FF) << 10) | (UTF16_Swap<is_little>(c_[index_++]) & 0x3FF));
|
|
|
|
}
|
|
|
|
return u;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool end() const {
|
|
|
|
return c_[index_] == 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int length() const {
|
|
|
|
int len = 0;
|
|
|
|
for (UTF16_Type<is_little> dec(c_); !dec.end(); dec.next())
|
|
|
|
++len;
|
|
|
|
return len;
|
|
|
|
}
|
|
|
|
|
2014-05-03 13:13:23 -07:00
|
|
|
int shortIndex() const {
|
2013-07-20 21:46:18 -07:00
|
|
|
return index_;
|
|
|
|
}
|
|
|
|
|
2020-03-22 19:29:25 +01:00
|
|
|
static int encode(char16_t *dest, char32_t u) {
|
2013-07-20 21:46:18 -07:00
|
|
|
if (u >= 0x10000) {
|
|
|
|
u -= 0x10000;
|
2013-07-21 12:55:28 -07:00
|
|
|
*dest++ = UTF16_Swap<is_little>(0xD800 + ((u >> 10) & 0x3FF));
|
2013-07-20 21:46:18 -07:00
|
|
|
*dest = UTF16_Swap<is_little>(0xDC00 + ((u >> 0) & 0x3FF));
|
|
|
|
return 2;
|
|
|
|
} else {
|
2020-03-22 19:29:25 +01:00
|
|
|
*dest = UTF16_Swap<is_little>((char16_t)u);
|
2013-07-20 21:46:18 -07:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-03-22 19:29:25 +01:00
|
|
|
// Rejects non-UCS2 codepoints.
|
|
|
|
static int encodeUCS2(char16_t *dest, char32_t u) {
|
|
|
|
if (u >= 0x10000 || (u >= 0xD800 && u <= 0xDFFF)) {
|
|
|
|
return 0;
|
|
|
|
} else {
|
|
|
|
*dest = UTF16_Swap<is_little>((char16_t)u);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int encodeUnits(char32_t u) {
|
2014-05-03 13:13:23 -07:00
|
|
|
if (u >= 0x10000) {
|
|
|
|
return 2;
|
|
|
|
} else {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-03-22 19:29:25 +01:00
|
|
|
static int encodeUnitsUCS2(char32_t u) {
|
|
|
|
if (u >= 0x10000 || (u >= 0xD800 && u <= 0xDFFF)) {
|
|
|
|
return 0;
|
|
|
|
} else {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
}
|
2013-07-20 21:46:18 -07:00
|
|
|
private:
|
2020-03-22 19:29:25 +01:00
|
|
|
const char16_t *c_;
|
2013-07-20 21:46:18 -07:00
|
|
|
int index_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Convenience names for the two byte orders: little endian and big endian UTF-16.
using UTF16LE = UTF16_Type<true>;
using UTF16BE = UTF16_Type<false>;
|