ISO10646/UCS与UTF-8的对应关系:
U-00000000 - U-0000007F: 0xxxxxxx
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
| UCS-2 bit pattern | UCS-2 code range | UTF-8 1st byte | UTF-8 2nd byte | UTF-8 3rd byte |
|-------------------|------------------|----------------|----------------|----------------|
| 000000000aaaaaaa  | 0000 - 007F      | 0aaaaaaa       |                |                |
| 00000bbbbbaaaaaa  | 0080 - 07FF      | 110bbbbb       | 10aaaaaa       |                |
| ccccbbbbbbaaaaaa  | 0800 - FFFF      | 1110cccc       | 10bbbbbb       | 10aaaaaa       |
/**
 * Converts UCS-2 code units into a Java {@link String} by re-encoding each 16-bit code
 * unit as UTF-8 and decoding the resulting bytes.
 *
 * <p>Mapping used for a 16-bit code unit:
 * <pre>
 *   0x0000 - 0x007F : 0aaaaaaa
 *   0x0080 - 0x07FF : 110bbbbb 10aaaaaa
 *   0x0800 - 0xFFFF : 1110cccc 10bbbbbb 10aaaaaa
 * </pre>
 *
 * <p>Surrogate code units (0xD800 - 0xDFFF) get no special treatment: each one is encoded
 * on its own as a three-byte sequence, so surrogate pairs are not combined into
 * supplementary characters.
 */
public class ISO10646Decoder {

    /** Returned by {@link #decode(byte[])} when the input cannot be a sequence of 16-bit units. */
    private static final String INVALID_INPUT_MESSAGE = "Not UCS-2 Code";

    /** Charset constant; using it instead of the name "UTF-8" avoids a checked exception. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Decodes UCS-2 code units that are already held as {@code char} values.
     *
     * @param rawData the UCS-2 code units, one per element; must not be {@code null}
     * @return the decoded text; an empty string for an empty array
     */
    public String decode(char[] rawData) {
        StringBuilder sb = new StringBuilder(rawData.length);
        for (char codeUnit : rawData) {
            sb.append(new String(toUtf8Bytes(codeUnit), UTF_8));
        }
        return sb.toString();
    }

    /**
     * Decodes big-endian UCS-2 data: every two bytes form one code unit, high byte first.
     *
     * @param rawData the raw bytes; must not be {@code null}
     * @return the decoded text, or the literal {@code "Not UCS-2 Code"} when the array is
     *         empty or has an odd length
     */
    public String decode(byte[] rawData) {
        if (rawData.length == 0 || rawData.length % 2 != 0) {
            return INVALID_INPUT_MESSAGE;
        }
        StringBuilder sb = new StringBuilder(rawData.length / 2);
        for (int i = 0; i < rawData.length; i += 2) {
            // Mask with 0xFF so that bytes >= 0x80 are not sign-extended.
            int codeUnit = ((rawData[i] & 0xFF) << 8) | (rawData[i + 1] & 0xFF);
            sb.append(new String(toUtf8Bytes(codeUnit), UTF_8));
        }
        return sb.toString();
    }

    /**
     * Encodes a single 16-bit code unit as UTF-8.
     *
     * @param codeUnit the code unit; only the low 16 bits are used
     * @return one, two or three bytes, depending on the magnitude of the code unit
     */
    private static byte[] toUtf8Bytes(int codeUnit) {
        int value = codeUnit & 0xFFFF;
        if (value < 0x0080) {
            return new byte[] {(byte) value};
        }
        if (value < 0x0800) {
            return new byte[] {
                (byte) (0xC0 | (value >>> 6)),
                (byte) (0x80 | (value & 0x3F))
            };
        }
        // 0x0800 itself needs three bytes; a two-byte form of it would be invalid UTF-8.
        return new byte[] {
            (byte) (0xE0 | (value >>> 12)),
            (byte) (0x80 | ((value >>> 6) & 0x3F)),
            (byte) (0x80 | (value & 0x3F))
        };
    }

    /**
     * Demo entry point: decodes a fixed big-endian UCS-2 byte sequence and prints the result.
     *
     * @param args unused
     * @throws UnsupportedEncodingException kept for signature compatibility; not thrown here
     */
    public static void main(String[] args) throws UnsupportedEncodingException {
        byte[] tmp = {0x4f, 0x60, 0x59, 0x7d, (byte) 0xff, 0x0c, 0x53, 0x17, 0x4e,
                (byte) 0xac, 0x00, 0x32, 0x00, 0x30, 0x00, 0x30, 0x00, 0x38};
        String utfStr = new ISO10646Decoder().decode(tmp);
        System.out.println(utfStr);
    }
}
没有评论:
发表评论