一、unicode编码基础认知
二、C++如何判断是否是UTF8编码
形式一:详见mozilla xpcom\string\nsReadableUtils.cpp:
只要有一个字符不满足UTF8判断条件,就返回false
bool
IsUTF8(const nsACString& aString, bool aRejectNonChar)
{nsReadingIterator<char> done_reading;aString.EndReading(done_reading);int32_t state = 0;bool overlong = false;bool surrogate = false;bool nonchar = false;uint16_t olupper = 0; // overlong byte upper bound.uint16_t slower = 0; // surrogate byte lower bound.nsReadingIterator<char> iter;aString.BeginReading(iter);const char* ptr = iter.get();const char* end = done_reading.get();while (ptr < end) {uint8_t c;if (0 == state) {c = *ptr++;if (UTF8traits::isASCII(c)) {continue;}if (c <= 0xC1) { // [80-BF] where not expected, [C0-C1] for overlong.return false;} else if (UTF8traits::is2byte(c)) {state = 1;} else if (UTF8traits::is3byte(c)) {state = 2;if (c == 0xE0) { // to exclude E0[80-9F][80-BF]overlong = true;olupper = 0x9F;} else if (c == 0xED) { // ED[A0-BF][80-BF] : surrogate codepointsurrogate = true;slower = 0xA0;} else if (c == 0xEF) { // EF BF [BE-BF] : non-characternonchar = true;}} else if (c <= 0xF4) { // XXX replace /w UTF8traits::is4byte when it's updated to exclude [F5-F7].(bug 199090)state = 3;nonchar = true;if (c == 0xF0) { // to exclude F0[80-8F][80-BF]{2}overlong = true;olupper = 0x8F;} else if (c == 0xF4) { // to exclude F4[90-BF][80-BF]// actually not surrogates but codepoints beyond 0x10FFFFsurrogate = true;slower = 0x90;}} else {return false; // Not UTF-8 string}}if (nonchar && !aRejectNonChar) {nonchar = false;}while (ptr < end && state) {c = *ptr++;--state;// non-character : EF BF [BE-BF] or F[0-7] [89AB]F BF [BE-BF]if (nonchar &&((!state && c < 0xBE) ||(state == 1 && c != 0xBF) ||(state == 2 && 0x0F != (0x0F & c)))) {nonchar = false;}if (!UTF8traits::isInSeq(c) || (overlong && c <= olupper) ||(surrogate && slower <= c) || (nonchar && !state)) {return false; // Not UTF-8 string}overlong = surrogate = false;}}return !state; // state != 0 at the end indicates an invalid UTF-8 seq.
}
形式二:C++ 标准版
/**
 * Checks whether a byte buffer is well-formed UTF-8.
 *
 * Accepted: ASCII bytes and complete 2-, 3- and 4-byte sequences whose
 * continuation bytes are in range. Rejected: stray continuation bytes,
 * overlong encodings (C0/C1 leads, E0 80-9F, F0 80-8F), UTF-16 surrogates
 * (ED A0-BF), values above U+10FFFF (F4 90+ and leads F5-FF), and a
 * multi-byte sequence cut off by the end of the buffer.
 *
 * This is a structural check only: bytes written in another encoding pass
 * whenever they happen to form well-formed sequences (for example the four
 * bytes CD A8 D6 AA).
 *
 * @param pBuffer  Start of the buffer; may be null only when size is 0.
 * @param size     Number of bytes to examine.
 * @return true if the whole buffer is well-formed UTF-8 (an empty or
 *         pure-ASCII buffer counts as well-formed); false otherwise, and
 *         for a negative size or a null buffer with a non-zero size.
 */
bool IsUTF8(const void *pBuffer, int size)
{
    if (size < 0 || (pBuffer == nullptr && size != 0)) {
        return false;
    }

    const unsigned char *cur = static_cast<const unsigned char *>(pBuffer);
    const unsigned char *const end = cur + size;

    while (cur < end) {
        const unsigned char lead = *cur;

        if (lead < 0x80) {  // 0xxxxxxx: ASCII
            ++cur;
            continue;
        }

        int trail = 0;            // continuation bytes that must follow
        unsigned char lo = 0x80;  // allowed range of the FIRST continuation
        unsigned char hi = 0xBF;  // byte; narrowed for special lead bytes

        if (lead < 0xC2) {
            // 80-BF: continuation byte without a lead; C0-C1: overlong.
            return false;
        } else if (lead < 0xE0) {  // 110xxxxx: 2-byte sequence
            trail = 1;
        } else if (lead < 0xF0) {  // 1110xxxx: 3-byte sequence
            trail = 2;
            if (lead == 0xE0) {
                lo = 0xA0;  // E0 80-9F would be overlong
            } else if (lead == 0xED) {
                hi = 0x9F;  // ED A0-BF would be a surrogate
            }
        } else if (lead < 0xF5) {  // 11110xxx: 4-byte sequence
            trail = 3;
            if (lead == 0xF0) {
                lo = 0x90;  // F0 80-8F would be overlong
            } else if (lead == 0xF4) {
                hi = 0x8F;  // F4 90-BF would exceed U+10FFFF
            }
        } else {
            return false;  // F5-FF never appear in UTF-8
        }

        // The sequence needs trail + 1 bytes; fewer means it is truncated.
        if (end - cur <= trail) {
            return false;
        }
        if (cur[1] < lo || cur[1] > hi) {
            return false;
        }
        for (int i = 2; i <= trail; ++i) {
            if ((cur[i] & 0xC0) != 0x80) {  // must be 10xxxxxx
                return false;
            }
        }
        cur += trail + 1;
    }
    return true;
}
形式一、形式二都存在一定的缺陷:
例如 "通知"两个字的GB2312编码为:0xCD 0xA8(通)、0xD6 0xAA(知)。这两组字节恰好也符合UTF8两字节序列的格式(首字节在0xC2~0xDF之间,次字节在0x80~0xBF之间),因此使用这两种形式的代码都会把它们误判为UTF8编码。具体哪些字符会被误判,可查看GB2312编码表《CP936.TXT》
三、GBK编码基础认知
判断是否是GBK编码:
形式一、C++通用版本
/**
 * Checks whether a byte string is structurally valid GBK.
 *
 * GBK encodes a character in one or two bytes:
 *   - single byte : 0x00-0x7F (ASCII)
 *   - double byte : lead 0x81-0xFE, trail 0x40-0xFE except 0x7F
 *
 * The whole string is examined, using its length rather than stopping at
 * the first NUL byte.
 *
 * @param strIn  Bytes to examine.
 * @return true if every byte belongs to a complete single- or double-byte
 *         character. Empty and pure-ASCII strings return true, because
 *         ASCII is a subset of GBK. Returns false on an invalid lead or
 *         trail byte, or when the string ends after a lead byte.
 */
bool isGBKCode(const std::string& strIn)
{
    bool expectTrail = false;  // true right after a lead byte

    for (const char ch : strIn) {
        const unsigned char byte = static_cast<unsigned char>(ch);

        if (expectTrail) {
            if (byte < 0x40 || byte > 0xFE || byte == 0x7F) {
                return false;  // not a legal trail byte
            }
            expectTrail = false;
        } else if (byte >= 0x80) {
            if (byte < 0x81 || byte > 0xFE) {
                return false;  // 0x80 and 0xFF are not legal lead bytes
            }
            expectTrail = true;
        }
        // else: plain ASCII, nothing to check
    }

    // A dangling lead byte means the last character is incomplete.
    return !expectTrail;
}
形式二、适合mozilla的代码如下:
bool
IsGBK(const nsACString& aString)
{nsReadingIterator<char> done_reading;aString.EndReading(done_reading);//GBK:英文字母和数字占用一个字节,特殊字符(如部分标点符号、//非常见字母等)仍占用两个字节,汉字编码占用两个字节。unsigned int nBytes = 0;bool bAllAscii = true; //如果为true,全部都是ASCIInsReadingIterator<char> iter;aString.BeginReading(iter);const char* ptr = iter.get();const char* end = done_reading.get();while (ptr < end) {uint8_t c;c = *ptr++;if ((c & 0x80) != 0 && nBytes == 0) {bAllAscii = false;}if (nBytes == 0) {if (c >= 0x80) {if (c >= 0x81 && c <= 0xFE) {nBytes = +2;} else {return false;}nBytes--;}} else {if (c < 0x40 || c > 0xFE) {return false;}nBytes--;}}if (nBytes != 0) {return false;}//如果全是ASCII码,就认为不是GBK,还是以UTF8编码为准,不改变原逻辑if (bAllAscii) {return false;}return true;
}
四、GB2312和UTF8互转
在该网站可看unicode编码库:
Index of /Public/MAPPINGS/VENDORS/MICSFT/WINDOWS:https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/
其中,CP936.TXT是微软代码页936(即GBK,GB2312的超集)到Unicode的映射表,可当作GB2312编码库使用; 该文档部分截图如下:
以"知"字为例:GB2312编码为:0xD6AA,对应的Unicode码点为 U+77E5(即CP936.TXT中的 0x77E5),其UTF8编码为 0xE7 0x9F 0xA5;
在在线转换网站"iP138在线工具:在线UTF-8编码汉字互转工具"(https://tool.ip138.com/utf8/ ,该工具可以把中文转换成UTF-8编码,同时也支持把UTF-8编码过的内容还原成中文)上可以互相转换: