2007年10月28日星期日

Java实现UCS-2/ISO10646转UTF-8

ISO10646/UCS与UTF-8的对应关系:

U-00000000 - U-0000007F: 0xxxxxxx
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

-----------------------------------------------------------------------------------------------------------
| UCS-2                                         | UTF-8                                        |
|----------------------------------------------------------------------------------------------------------
| | code                                        | 1st Byte   | 2nd Byte   | 3rd Byte   |
|--------------------------------------------------------------------------------------------------------
| 000000000aaaaaaa | 0000 - 007F | 0aaaaaaa | | |
|--------------------------------------------------------------------------------------------------------
| 00000bbbbbaaaaaa | 0080 - 07FF | 110bbbbb | 10aaaaaa | |
|--------------------------------------------------------------------------------------------------------
| ccccbbbbbbaaaaaa | 0800 - FFFF | 1110cccc | 10bbbbbb | 10aaaaaa |
|--------------------------------------------------------------------------------------------------------

/**
 * Converts UCS-2 code units (16-bit values in the Basic Multilingual Plane) into a
 * Java {@code String} by first transcoding every code unit to its UTF-8 byte
 * sequence and then decoding those bytes.
 *
 * <p>Mapping used for the transcoding step:
 * <pre>
 *   0000 - 007F : 0aaaaaaa
 *   0080 - 07FF : 110bbbbb 10aaaaaa
 *   0800 - FFFF : 1110cccc 10bbbbbb 10aaaaaa
 * </pre>
 *
 * <p>Each code unit is converted on its own; surrogate pairs are not combined.
 * NOTE(review): a surrogate code unit (D800 - DFFF) is therefore emitted as a
 * three-byte sequence that the JDK's UTF-8 decoder is expected to treat as
 * malformed and substitute with replacement characters - confirm if such input
 * has to be supported.
 */
public class ISO10646Decoder {

    /** Charset used to turn the generated UTF-8 bytes back into a {@code String}. */
    private static final java.nio.charset.Charset UTF_8 = java.nio.charset.StandardCharsets.UTF_8;

    /** Sentinel returned by {@link #decode(byte[])} for input that is empty or of odd length. */
    private static final String NOT_UCS2 = "Not UCS-2 Code";

    /**
     * Decodes an array of UCS-2 code units.
     *
     * @param rawData the code units, one per {@code char}; must not be {@code null}
     * @return the decoded text; an empty string for an empty array
     */
    public String decode(char[] rawData) {
        StringBuilder sb = new StringBuilder(rawData.length);
        for (char c : rawData) {
            sb.append(new String(UCS2toUTF8Code((short) c), UTF_8));
        }
        return sb.toString();
    }

    /**
     * Decodes a byte array holding UCS-2 code units, two bytes per unit with the
     * high-order byte first.
     *
     * @param rawData the raw bytes; must not be {@code null}
     * @return the decoded text, or the literal string {@code "Not UCS-2 Code"} when
     *         {@code rawData} is empty or its length is not a multiple of two
     */
    public String decode(byte[] rawData) {
        if (rawData.length == 0 || rawData.length % 2 != 0) {
            return NOT_UCS2;
        }
        StringBuilder sb = new StringBuilder(rawData.length / 2);
        for (int i = 0; i < rawData.length; i += 2) {
            short codeUnit = encodeUCS2FromByte(rawData[i], rawData[i + 1]);
            sb.append(new String(UCS2toUTF8Code(codeUnit), UTF_8));
        }
        return sb.toString();
    }

    /**
     * Produces the UTF-8 byte sequence (one to three bytes) for a single UCS-2 code unit.
     *
     * @param ucs2Code the code unit; its 16 bits are interpreted as unsigned
     * @return a newly allocated array containing the UTF-8 bytes
     */
    private byte[] UCS2toUTF8Code(short ucs2Code) {
        // Work on the unsigned value so that range checks are not distorted by the
        // sign bit of short (values 8000 - FFFF are negative as a short).
        int code = ucs2Code & 0xFFFF;

        if (code < 0x0080) {
            return new byte[] {(byte) code};
        }
        if (code < 0x0800) {
            return new byte[] {
                (byte) (0xC0 | (code >>> 6)),
                (byte) (0x80 | (code & 0x3F))
            };
        }
        // 0x0800 itself belongs here: it needs 12 payload bits, one more than the
        // two-byte form can carry.
        return new byte[] {
            (byte) (0xE0 | (code >>> 12)),
            (byte) (0x80 | ((code >>> 6) & 0x3F)),
            (byte) (0x80 | (code & 0x3F))
        };
    }

    /**
     * Combines two bytes into one 16-bit code unit.
     *
     * @param highByte the most significant eight bits
     * @param lowByte  the least significant eight bits
     * @return the combined code unit
     */
    private short encodeUCS2FromByte(byte highByte, byte lowByte) {
        // Mask with 0xFF to discard the sign extension applied when a byte is widened to int.
        return (short) (((highByte & 0xFF) << 8) | (lowByte & 0xFF));
    }

    /**
     * Demo entry point: decodes a fixed big-endian UCS-2 byte sequence and prints the result.
     *
     * @param args ignored
     * @throws java.io.UnsupportedEncodingException declared for signature compatibility;
     *         not thrown by the current implementation
     */
    public static void main(String[] args) throws java.io.UnsupportedEncodingException {
        byte[] tmp = {0x4f, 0x60, 0x59, 0x7d, (byte) 0xff, 0x0c, 0x53, 0x17, 0x4e,
                      (byte) 0xac, 0x00, 0x32, 0x00, 0x30, 0x00, 0x30, 0x00, 0x38};
        String utfStr = new ISO10646Decoder().decode(tmp);
        System.out.println(utfStr);
    }

}

没有评论: