类型:转载 责任编辑:asp.net 日期:2007/05/23
热门软件下载:
在MSDN中, 我查到:
Byte-order mark Description
EF BB BF UTF-8
FE FF UTF-16/UCS-2, little endian
FF FE UTF-16/UCS-2, big endian
FF FE 00 00 UTF-32/UCS-4, little endian.
00 00 FE FF UTF-32/UCS-4, big-endian.
但是在Notepad中,当我用 Unicode big endian 保存后,
却发现它的文件标识是:FE FF。
这显然和MSDN说的不一样,到底哪个对?
竟然会有这种事?奇怪!
谢谢了。
网友回答:
http://www.vccode.com/file_show.php?id=2668
MFC下Unicode到UTF-8格式的转换
http://www.utf8.com/
以下是一段UCS-4和UTF-8互相转换的源代码,可以把其中的UCS-4格式换成你需要的格式
//UCS4 <--> UTF8
//=====================
//Written by Chio Chan Fong (http://chiosoft.51.net, chiosoft@163.net)
//21 March 2003
#include<stdio.h>
#include<stdlib.h>
// UCS-4 code point type. long is at least 32 bits wide, which is enough
// to hold every code point in 0..0x10FFFF.
typedef long wchar_t_ccf;

/*
 * Encode one UCS-4 code point as a NUL-terminated UTF-8 byte sequence.
 *
 * x: the code point to encode; accepted range is 0..0x10FFFF.
 *
 * Returns a buffer obtained from malloc() holding the 1-4 encoded bytes
 * followed by a terminating 0 byte. The caller owns the buffer and must
 * free() it. Returns NULL when x is outside the accepted range or when
 * the allocation fails.
 *
 * Notes:
 *  - x == 0 produces a single 0 byte, i.e. an empty C string.
 *  - Surrogate values (0xD800..0xDFFF) are encoded like any other
 *    three-byte code point; they are not rejected here.
 */
unsigned char* ucs_to_utf8( wchar_t_ccf x )
{
    unsigned char *pt;
    size_t len;

    if( x < 0 || x > 0x10ffffL )
        return NULL;

    /* Number of UTF-8 bytes needed for this code point. */
    if( x <= 0x7f )
        len = 1;
    else if( x <= 0x7ff )
        len = 2;
    else if( x <= 0xffff )
        len = 3;
    else
        len = 4;

    pt = malloc( len + 1 );
    if( pt == NULL )
        return NULL;

    /* x is known to be non-negative here, so shifts and masks are safe. */
    switch( len ) {
    case 1:
        pt[0] = (unsigned char)x;
        break;
    case 2:
        pt[0] = (unsigned char)(0xc0 | (x >> 6));
        pt[1] = (unsigned char)(0x80 | (x & 0x3f));
        break;
    case 3:
        pt[0] = (unsigned char)(0xe0 | (x >> 12));
        pt[1] = (unsigned char)(0x80 | ((x >> 6) & 0x3f));
        pt[2] = (unsigned char)(0x80 | (x & 0x3f));
        break;
    default: /* 4 bytes */
        pt[0] = (unsigned char)(0xf0 | (x >> 18));
        pt[1] = (unsigned char)(0x80 | ((x >> 12) & 0x3f));
        pt[2] = (unsigned char)(0x80 | ((x >> 6) & 0x3f));
        pt[3] = (unsigned char)(0x80 | (x & 0x3f));
        break;
    }
    pt[len] = 0;
    return pt;
}
/*
 * Decode the first UTF-8 encoded character found at z.
 *
 * z: pointer to a NUL-terminated UTF-8 byte string; only the first
 *    encoded character (1-4 bytes) is examined.
 *
 * Returns the decoded code point (0..0x10FFFF), or -1 when z is NULL,
 * the lead byte is not a valid UTF-8 lead byte, a continuation byte is
 * missing or malformed, or the decoded value exceeds 0x10FFFF.
 *
 * Every continuation byte is checked for the 10xxxxxx pattern before the
 * next one is read, so a truncated sequence stops at the terminating NUL
 * instead of reading past the end of the string.
 *
 * Overlong encodings and encoded surrogates are not rejected.
 */
wchar_t_ccf utf8_to_ucs( unsigned char *z )
{
    const unsigned char *s = z;
    wchar_t_ccf w;
    int extra;  /* continuation bytes still expected after the lead byte */

    if( s == NULL )
        return -1;

    if( *s <= 0x7f )                    /* one byte   (0xxx xxxx) */
        return *s;

    if( (*s & 0xe0) == 0xc0 ) {         /* two bytes  (110x xxxx) */
        w = *s & 0x1f;
        extra = 1;
    }
    else if( (*s & 0xf0) == 0xe0 ) {    /* three bytes (1110 xxxx) */
        w = *s & 0x0f;
        extra = 2;
    }
    else if( (*s & 0xf8) == 0xf0 ) {    /* four bytes (1111 0xxx) */
        w = *s & 0x07;
        extra = 3;
    }
    else {
        return -1;                      /* stray continuation or invalid lead */
    }

    while( extra-- > 0 ) {
        ++s;
        if( (*s & 0xc0) != 0x80 )       /* also catches the NUL terminator */
            return -1;
        w = (w << 6) | (*s & 0x3f);
    }

    if( w > 0x10ffffL )
        return -1;
    return w;
}
/*
 * Demo driver: passes the code point 0x8d99 to ucs_to_utf8(), prints the
 * code point in hex, then each byte of the returned buffer in hex, then
 * the value utf8_to_ucs() returns for that buffer. Prints nothing when
 * ucs_to_utf8() returns NULL. Always exits with status 0.
 */
int main(void)
{
    wchar_t_ccf uni = 0x8d99;
    unsigned char *encoded = ucs_to_utf8( uni );
    unsigned char *cursor;

    if( encoded == NULL )
        return 0;

    printf( "%lx -> ", uni );
    /* Walk a separate cursor so the buffer start stays available for free(). */
    for( cursor = encoded; *cursor != 0; cursor++ )
        printf( "%x ", *cursor );
    printf( "-> %lx\n", utf8_to_ucs( encoded ) );

    free( encoded );
    return 0;
}