左边第一个字节中前导 1 的个数表示这个字符编码所占的字节数,例如两字节字符的编码样式为:110xxxxx 10xxxxxx;三字节字符的编码样式为:1110xxxx 10xxxxxx 10xxxxxx;以此类推,六字节字符的编码样式为:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx。xxx 的值由字符编码的二进制表示的位填入。
1字节:0xxxxxxx
2字节:110xxxxx 10xxxxxx
3字节:1110xxxx 10xxxxxx 10xxxxxx
4字节:11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
5字节:111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
6字节:1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
// Counts how many of the next nWanted bytes starting at p exist before the
// terminating NUL, so callers never step past the end of the string.
static std::string::size_type fnCountBytesUTF8( const unsigned char *p, std::string::size_type nWanted )
{
    std::string::size_type nFound = 0;
    while ( nFound < nWanted && p[ nFound ] != '\0' )
    {
        nFound++;
    }
    return nFound;
}

// Splits a NUL-terminated UTF-8 byte string into tokens and stores them in vec.
//
// vec is cleared first, then filled in input order with:
//   - one token per maximal run of consecutive ASCII bytes (< 0x80);
//   - one token per 2-byte sequence (lead byte 0xC0..0xDF);
//   - one token per 3-byte sequence (lead byte 0xE0..0xEF).
// Sequences announced as 4, 5 or 6 bytes long (lead byte 0xF0..0xFD) are
// skipped without producing a token. Stray continuation bytes (0x80..0xBF)
// and the bytes 0xFE/0xFF are skipped one byte at a time.
//
// Continuation bytes are not validated: the bytes following a lead byte are
// taken as-is. A sequence cut short by the end of the string is limited to
// the bytes that are actually present; the scan never reads or advances past
// the terminating NUL.
//
// A null pszSentence leaves vec empty.
//
// NOTE(review): the parameter list is written as (char *, vector of string by
// reference), which is what the body's vec.clear( ) / vec.push_back( string )
// calls require - confirm against the declaration used by callers.
void fnReadCharactersUTF8( char *pszSentence, std::vector<std::string> &vec )
{
    vec.clear( );
    if ( !pszSentence )
    {
        return;
    }

    // Work on unsigned bytes so that comparisons against 0x80 and above
    // behave the same whether plain char is signed or unsigned.
    const unsigned char *p = reinterpret_cast<const unsigned char *>( pszSentence );

    while ( *p != '\0' )
    {
        const unsigned char chLead = *p;

        if ( chLead < 0x80 )
        {
            // Collect the whole ASCII run. NUL is also < 0x80, so it has to
            // be tested explicitly or the scan would run off the string.
            const unsigned char *pEnd = p + 1;
            while ( *pEnd != '\0' && *pEnd < 0x80 )
            {
                pEnd++;
            }
            vec.push_back( std::string( reinterpret_cast<const char *>( p ),
                                        static_cast<std::string::size_type>( pEnd - p ) ) );
            p = pEnd;
        }
        else if ( chLead < 0xC0 || chLead >= 0xFE )
        {
            // Stray continuation byte, or 0xFE/0xFF which never start a
            // sequence: drop this single byte.
            p++;
        }
        else
        {
            // Sequence length announced by the lead byte.
            std::string::size_type nLen;
            if ( chLead < 0xE0 )
            {
                nLen = 2;
            }
            else if ( chLead < 0xF0 )
            {
                nLen = 3;
            }
            else if ( chLead < 0xF8 )
            {
                nLen = 4;
            }
            else if ( chLead < 0xFC )
            {
                nLen = 5;
            }
            else
            {
                nLen = 6;
            }

            // Never consume more bytes than exist before the terminator.
            const std::string::size_type nAvail = fnCountBytesUTF8( p, nLen );

            // Only 2- and 3-byte sequences become tokens; longer ones are
            // skipped.
            if ( nLen <= 3 )
            {
                vec.push_back( std::string( reinterpret_cast<const char *>( p ), nAvail ) );
            }
            p += nAvail;
        }
    }
}
没有评论:
发表评论