posts - 495,comments - 227,trackbacks - 0
          http://blog.163.com/wf_shunqiziran/blog/static/176307209201258102217810/

          private String getFilecharset(File sourceFile) {
                  
          byte[] first3Bytes = new byte[3];
                  
          try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile))) {
                      bis.mark(
          0);
                      
          int read = bis.read(first3Bytes, 03);
                      
          if (read == -1) {
                          
          return "GBK"// 文件編碼為 ANSI
                      }
                      
                      
          if (first3Bytes[0== (byte0xFF && first3Bytes[1== (byte0xFE) {
                          
          return "UTF-16LE"// 文件編碼為 Unicode
                      }
                      
                      
          if (first3Bytes[0== (byte0xFE && first3Bytes[1== (byte0xFF) {
                          
          return "UTF-16BE"// 文件編碼為 Unicode big endian
                      }
                      
                      
          if (first3Bytes[0== (byte0xEF && first3Bytes[1== (byte0xBB && first3Bytes[2== (byte0xBF) {
                          
          return "UTF-8"// 文件編碼為 UTF-8
                      }
                      
                      bis.reset();
                      
                      
          while ((read = bis.read()) != -1) {
                          
          if (read >= 0xF0) {
                              
          break;
                          }
                          
          if (0x80 <= read && read <= 0xBF) {
                              
          break;
                          }
                          
          if (0xC0 <= read && read <= 0xDF) {
                              read 
          = bis.read();
                              
          if (0x80 <= read && read <= 0xBF) {
                                  
          // (0x80 - 0xBF),也可能在GB編碼內(nèi)
                                  continue;
                              }
                              
                              
          break;
                          } 
          else if (0xE0 <= read && read <= 0xEF) {// 也有可能出錯,但是幾率較小
                              read = bis.read();
                              
          if (0x80 <= read && read <= 0xBF) {
                                  read 
          = bis.read();
                                  
          if (0x80 <= read && read <= 0xBF) {
                                      
          return "UTF-8";
                                  }
                                  
          break;
                              }
                              
          break;
                          }
                      }
                  } 
          catch (Exception e) {
                      e.printStackTrace();
                  }
                  
          return "GBK";
              }





          最近java讀取文件的時候,經常碰到中文亂碼,特研究了一下java 的編碼格式,在java 中
          java編碼與txt編碼對應
          java txt
          unicode unicode big endian
          utf-8 utf-8
          utf-16 unicode
          gb2312 ANSI
          java 讀取txt如果編碼格式不對就會出現亂碼格式,通過下邊方法獲取文本文件編碼格式,然后以指定的編碼讀取文件,就不會出現亂碼(簡單測試了一下,但是也不保證100%)
          /**
           * Guesses the character encoding of a text file from its leading bytes.
           *
           * <p>A byte-order mark decides the answer when present: {@code FF FE} gives
           * "UTF-16LE", {@code FE FF} gives "UTF-16BE" and {@code EF BB BF} gives "UTF-8".
           * Otherwise the file is scanned from the start, and the first complete three-byte
           * sequence (lead byte 0xE0-0xEF followed by two bytes in 0x80-0xBF) yields "UTF-8".
           * Everything else, including an empty file or any exception while reading, falls
           * back to "GBK". This is a heuristic, not a guarantee.
           *
           * @param sourceFile the file to inspect
           * @return one of "UTF-16LE", "UTF-16BE", "UTF-8" or "GBK"
           */
          private static String getFilecharset(File sourceFile) {
              final String fallback = "GBK";
              byte[] first3Bytes = new byte[3];
              // try-with-resources closes the stream on every exit path, including the early
              // returns and any exception thrown while reading.
              try (BufferedInputStream bis = new BufferedInputStream(new FileInputStream(sourceFile))) {
                  // The read limit must cover the bytes consumed before reset(); a limit of 0
                  // does not guarantee that the mark is still valid after the BOM probe.
                  bis.mark(first3Bytes.length);
                  int read = bis.read(first3Bytes, 0, 3);
                  if (read == -1) {
                      return fallback; // empty file: treated as ANSI
                  }
                  if (first3Bytes[0] == (byte) 0xFF && first3Bytes[1] == (byte) 0xFE) {
                      return "UTF-16LE"; // "Unicode" (little endian)
                  }
                  if (first3Bytes[0] == (byte) 0xFE && first3Bytes[1] == (byte) 0xFF) {
                      return "UTF-16BE"; // "Unicode big endian"
                  }
                  if (first3Bytes[0] == (byte) 0xEF
                          && first3Bytes[1] == (byte) 0xBB
                          && first3Bytes[2] == (byte) 0xBF) {
                      return "UTF-8"; // UTF-8 with BOM
                  }

                  // No BOM: rewind and scan the content from the first byte.
                  bis.reset();
                  while ((read = bis.read()) != -1) {
                      if (read >= 0xF0) {
                          break;
                      }
                      if (0x80 <= read && read <= 0xBF) {
                          // A lone byte in 0x80-0xBF is also counted as GBK.
                          break;
                      }
                      if (0xC0 <= read && read <= 0xDF) {
                          read = bis.read();
                          if (0x80 <= read && read <= 0xBF) {
                              // Two-byte form (0xC0-0xDF)(0x80-0xBF) may also occur in GB
                              // encodings, so it proves nothing; keep scanning.
                              continue;
                          }
                          break;
                      } else if (0xE0 <= read && read <= 0xEF) {
                          // Can still be a false positive, but the probability is low.
                          read = bis.read();
                          if (0x80 <= read && read <= 0xBF) {
                              read = bis.read();
                              if (0x80 <= read && read <= 0xBF) {
                                  return "UTF-8";
                              }
                          }
                          break;
                      }
                  }
              } catch (Exception e) {
                  // Best effort: report the problem and fall back to the default below.
                  e.printStackTrace();
              }
              return fallback;
          }
          posted on 2015-05-07 15:48 SIMONE 閱讀(1514) 評論(0)  編輯  收藏

          只有注冊用戶登錄后才能發表評論。


          網站導航:
           
          主站蜘蛛池模板: 贺兰县| 青海省| 邓州市| 黄冈市| 谷城县| 庄河市| 林西县| 新疆| 金华市| 洪洞县| 榆树市| 安远县| 北票市| 盐池县| 若羌县| 南城县| 尚志市| 普兰县| 山丹县| 邢台市| 丁青县| 祁东县| 双流县| 绍兴市| 井陉县| 巨野县| 龙里县| 安宁市| 兴宁市| 锦州市| 鄂伦春自治旗| 奉新县| 兴化市| 松潘县| 双流县| 西乌珠穆沁旗| 稷山县| 邢台市| 古丈县| 三河市| 金川县|