posts - 495,comments - 227,trackbacks - 0
          http://blog.163.com/wf_shunqiziran/blog/static/176307209201258102217810/

          private String getFilecharset(File sourceFile) {
                  
          byte[] first3Bytes = new byte[3];
                  
          try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile))) {
                      bis.mark(
          0);
                      
          int read = bis.read(first3Bytes, 03);
                      
          if (read == -1) {
                          
          return "GBK"// 文件編碼為 ANSI
                      }
                      
                      
          if (first3Bytes[0== (byte0xFF && first3Bytes[1== (byte0xFE) {
                          
          return "UTF-16LE"// 文件編碼為 Unicode
                      }
                      
                      
          if (first3Bytes[0== (byte0xFE && first3Bytes[1== (byte0xFF) {
                          
          return "UTF-16BE"// 文件編碼為 Unicode big endian
                      }
                      
                      
          if (first3Bytes[0== (byte0xEF && first3Bytes[1== (byte0xBB && first3Bytes[2== (byte0xBF) {
                          
          return "UTF-8"// 文件編碼為 UTF-8
                      }
                      
                      bis.reset();
                      
                      
          while ((read = bis.read()) != -1) {
                          
          if (read >= 0xF0) {
                              
          break;
                          }
                          
          if (0x80 <= read && read <= 0xBF) {
                              
          break;
                          }
                          
          if (0xC0 <= read && read <= 0xDF) {
                              read 
          = bis.read();
                              
          if (0x80 <= read && read <= 0xBF) {
                                  
          // (0x80 - 0xBF),也可能在GB編碼內
                                  continue;
                              }
                              
                              
          break;
                          } 
          else if (0xE0 <= read && read <= 0xEF) {// 也有可能出錯,但是幾率較小
                              read = bis.read();
                              
          if (0x80 <= read && read <= 0xBF) {
                                  read 
          = bis.read();
                                  
          if (0x80 <= read && read <= 0xBF) {
                                      
          return "UTF-8";
                                  }
                                  
          break;
                              }
                              
          break;
                          }
                      }
                  } 
          catch (Exception e) {
                      e.printStackTrace();
                  }
                  
          return "GBK";
              }





          最近用 Java 讀取文件的時候,經常碰到中文亂碼,特地研究了一下 Java 的編碼格式。
          Java 編碼與 txt 文件編碼的對應關係如下:
          java txt
          unicode unicode big endian
          utf-8 utf-8
          utf-16 unicode
          gb2312 ANSI
          Java 讀取 txt 文件時,如果指定的編碼格式不對就會出現亂碼。通過下邊的方法獲取文本文件的編碼格式,然後以該編碼讀取文件,就不會出現亂碼(只簡單測試了一下,不保證 100% 準確)。
          /**
           * Guesses the character encoding of a text file.
           *
           * <p>A byte-order mark in the first three bytes selects UTF-16LE, UTF-16BE
           * or UTF-8 directly. Without a BOM the file is scanned from the start and
           * the first non-ASCII byte sequence decides: a well-formed three-byte UTF-8
           * sequence yields {@code "UTF-8"}; anything else leaves the default
           * {@code "GBK"}. The result is a heuristic, not a guarantee.
           *
           * @param sourceFile the file to inspect
           * @return one of {@code "UTF-16LE"}, {@code "UTF-16BE"}, {@code "UTF-8"} or
           *         {@code "GBK"}; {@code "GBK"} is also returned when the file is
           *         empty or cannot be read
           */
          private static String getFilecharset(File sourceFile) {
              byte[] first3Bytes = new byte[3];
              // try-with-resources closes the stream on every path, including the
              // early returns and I/O failures.
              try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile))) {
                  // The read limit must cover the BOM probe; mark(0) gives no
                  // guarantee that reset() still works after reading these bytes.
                  bis.mark(first3Bytes.length);
                  int read = bis.read(first3Bytes, 0, first3Bytes.length);
                  if (read == -1) {
                      return "GBK"; // empty file: treated as ANSI
                  }
                  // Bytes that were not read stay 0, so a short file cannot match a BOM by accident.
                  if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
                      return "UTF-16LE"; // "Unicode" (little endian)
                  }
                  if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
                      return "UTF-16BE"; // "Unicode big endian"
                  }
                  if (first3Bytes[0] == (byte) 0xEF
                          && first3Bytes[1] == (byte) 0xBB
                          && first3Bytes[2] == (byte) 0xBF) {
                      return "UTF-8"; // UTF-8 with BOM
                  }

                  // No BOM: rewind and look at the first non-ASCII byte sequence.
                  bis.reset();
                  while ((read = bis.read()) != -1) {
                      if (read >= 0xF0) {
                          break;
                      }
                      if (0x80 <= read && read <= 0xBF) {
                          // A lone byte at or below 0xBF is also counted as GBK.
                          break;
                      }
                      if (0xC0 <= read && read <= 0xDF) {
                          read = bis.read();
                          if (0x80 <= read && read <= 0xBF) {
                              // Two-byte pattern (0xC0-0xDF)(0x80-0xBF) may also occur
                              // in GB encodings, so it is inconclusive: keep scanning.
                              continue;
                          }
                          break;
                      } else if (0xE0 <= read && read <= 0xEF) {
                          // Three-byte pattern: can still be wrong, but the chance is small.
                          read = bis.read();
                          if (0x80 <= read && read <= 0xBF) {
                              read = bis.read();
                              if (0x80 <= read && read <= 0xBF) {
                                  return "UTF-8";
                              }
                          }
                          break;
                      }
                  }
              } catch (Exception e) {
                  // Detection is best-effort: report the problem and fall back to GBK.
                  e.printStackTrace();
              }
              return "GBK";
          }
          posted on 2015-05-07 15:48 SIMONE 閱讀(1514) 評論(0)  編輯  收藏

          只有注冊用戶登錄后才能發表評論。


          網站導航:
           
          主站蜘蛛池模板: 西昌市| 永靖县| 龙泉市| 股票| 来安县| 宁蒗| 沙坪坝区| 临漳县| 新昌县| 孟津县| 桐梓县| 黄大仙区| 突泉县| 竹溪县| 荥阳市| 松溪县| 保亭| 昌江| 陵川县| 房产| 丁青县| 沅江市| 手游| 桓台县| 山东省| 延边| 深水埗区| 大新县| 若羌县| 安龙县| 定远县| 越西县| 丰镇市| 九龙县| 长治市| 绥芬河市| 莱州市| 灌阳县| 东兴市| 哈巴河县| 调兵山市|