일반 유니코드 관련 유틸 클래스

황제낙엽 2005.07.16 08:10 조회 수 : 789 추천:219

sitelink1  
sitelink2  
sitelink3  
sitelink4  
sitelink5  
sitelink6  
package util;

/**
 * Unicode-related helper routines: UTF-8 to HTML numeric character reference
 * conversion, a bit-dump debugging aid, and a converter for strings whose
 * Hangul is stored at legacy Unicode 1.x code points.
 *
 * @author dialogboxes <chul80@kebi.com>
 */
public class UnicodeUtil {

  /**
   * Prints the eight bits of a byte, most significant bit first, as a run of
   * '0' and '1' characters on {@code System.out} (no trailing newline).
   *
   * <p>Despite its name this method prints binary, not hexadecimal. It is
   * intended for debugging only.
   *
   * @param b the byte to print
   */
  public static void printHex(byte b) {
    for (int bit = 7; bit >= 0; bit--) {
      if (((b >> bit) & 0x01) != 0) {
        System.out.print("1");
      } else {
        System.out.print("0");
      }
    }
  }

  /**
   * Copies the UTF-8 sequence that starts at {@code a[cur]} into {@code b}.
   *
   * <p>The sequence length is taken from the lead byte only; continuation
   * bytes are copied without being validated. When the lead byte is not a
   * valid UTF-8 lead byte (a stray continuation byte, or 0xF8..0xFF), or the
   * sequence would run past the end of {@code a}, {@code b[0]} is set to '?'
   * and a length of 1 is reported so the caller skips exactly one byte.
   *
   * @param a   source bytes
   * @param b   destination buffer, at least 4 bytes long
   * @param cur index in {@code a} of the first byte of the sequence
   * @return number of bytes consumed (1 to 4), or 0 when {@code cur} is at or
   *         past the end of {@code a}
   */
  private static int getOne(byte[] a, byte[] b, int cur) {
    if (cur >= a.length) {
      return 0;
    }

    byte lead = a[cur];
    b[0] = lead;

    // 0xxxxxxx: plain ASCII, a sequence of one byte.
    if ((lead & 0x80) == 0) {
      return 1;
    }

    int len;
    if ((lead & 0xE0) == 0xC0) {
      len = 2; // 110xxxxx
    } else if ((lead & 0xF0) == 0xE0) {
      len = 3; // 1110xxxx
    } else if ((lead & 0xF8) == 0xF0) {
      len = 4; // 11110xxx
    } else {
      len = 0; // 10xxxxxx (stray continuation) or 11111xxx (never valid)
    }

    // Invalid lead byte, or the input ends in the middle of the sequence.
    if (len == 0 || cur + len > a.length) {
      b[0] = '?';
      return 1;
    }

    for (int k = 1; k < len; k++) {
      b[k] = a[cur + k];
    }
    return len;
  }

  /**
   * Assembles the code point carried by a UTF-8 sequence of the given length.
   *
   * @param a   buffer holding the sequence, starting at index 0
   * @param len sequence length in bytes (1 to 4)
   * @return the decoded code point, or -1 when {@code len} is not 1..4
   */
  private static int decodeUTF8(byte[] a, int len) {
    switch (len) {
      case 1 :
        return a[0];
      case 2 :
        // A 2-byte lead byte (110xxxxx) carries five payload bits: mask 0x1F.
        return ((a[0] & 0x1F) << 6)
            | (a[1] & 0x3F);
      case 3 :
        return ((a[0] & 0x0F) << 12)
            | ((a[1] & 0x3F) << 6)
            | (a[2] & 0x3F);
      case 4 :
        return ((a[0] & 0x07) << 18)
            | ((a[1] & 0x3F) << 12)
            | ((a[2] & 0x3F) << 6)
            | (a[3] & 0x3F);
      default :
        return -1;
    }
  }

  /**
   * Converts a UTF-8 encoded byte array into text in which every non-ASCII
   * character is written as a decimal HTML numeric character reference of the
   * form {@code &#NNNN;}.
   *
   * <ul>
   * <li>The input must be UTF-8. Other encodings do not raise an error but
   *     produce a meaningless result.
   * <li>An invalid lead byte, or a sequence cut off by the end of the input,
   *     is replaced by '?'. Continuation bytes are not validated, so not
   *     every malformed input is detected.
   * <li>ASCII bytes are copied through unchanged (they are not HTML-escaped);
   *     every other character is expanded, so the result can be considerably
   *     longer than the input.
   * </ul>
   *
   * @param a UTF-8 encoded bytes; must not be null
   * @return the converted text, or an empty string for an empty array
   * @throws NullPointerException if {@code a} is null
   */
  public static String utf8ToHtml(byte[] a) {
    StringBuilder buf = new StringBuilder();
    byte[] tmp = new byte[4];
    int cur = 0;

    while (cur < a.length) {
      int len = getOne(a, tmp, cur);
      cur += len;

      if (len == 1) {
        // Either an ASCII byte or the '?' substituted for a bad sequence.
        buf.append((char) tmp[0]);
      } else {
        buf.append("&#").append(decodeUTF8(tmp, len)).append(";");
      }
    }

    return buf.toString();
  }

  /**
   * Rewrites a string whose Hangul characters sit at their Unicode 1.x
   * positions (U+3400..U+4DFF) so that they use the code points the Java
   * runtime expects. The original author notes this is for data read from
   * Oracle.
   *
   * <ul>
   * <li>Characters outside U+3400..U+4DFF are copied unchanged.
   * <li>U+3400..U+3D2D (the range the original comments describe as matching
   *     KS C 5601 Hangul) is mapped to the corresponding KS C 5601 two-byte
   *     code, which is then decoded with the "KSC5601" charset.
   * <li>U+3D2E..U+4DFF (described as Hangul supplementary areas A and B) has
   *     no mapping here and becomes U+FFFD.
   * </ul>
   *
   * @param uni12 the text to convert; may be null
   * @return the converted text, or null when {@code uni12} is null
   * @throws java.sql.SQLException declared for callers; not thrown by this
   *         implementation
   */
  public String fromDB(String uni12) throws java.sql.SQLException {
    if (uni12 == null) {
      return null;
    }

    int len = uni12.length();
    char[] out = new char[len];
    byte[] ksc = new byte[2];

    for (int i = 0; i < len; i++) {
      char c = uni12.charAt(i);

      if (c < 0x3400 || c > 0x4dff) {
        out[i] = c;
      } else if (c >= 0x3d2e) {
        // Unicode 1.2 Hangul supplementary areas A and B: no mapping.
        out[i] = '\uFFFD';
      } else {
        // Unicode 1.2 Hangul area corresponding to KS C 5601: 94 cells per
        // row, rows starting at lead byte 0xB0, cells at trail byte 0xA1.
        try {
          ksc[0] = (byte) ((c - 0x3400) / 94 + 0xb0);
          ksc[1] = (byte) ((c - 0x3400) % 94 + 0xa1);
          out[i] = new String(ksc, "KSC5601").charAt(0);
        } catch (java.io.UnsupportedEncodingException ex) {
          System.out.println(ex.toString());
          // Keep the output well-formed instead of leaving a NUL character.
          out[i] = '\uFFFD';
        }
      }
    }

    return new String(out);
  }
}