Terry.Li-彬

          虛其心,可解天下之問;專其心,可治天下之學;靜其心,可悟天下之理;恒其心,可成天下之業。

            BlogJava :: 首頁 :: 新隨筆 :: 聯系 :: 聚合  :: 管理 ::
            143 隨筆 :: 344 文章 :: 130 評論 :: 0 Trackbacks
          基於詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤。

          第一次寫中文分詞程序,歡迎拍磚。

          public?class?MM2?
          {
          ????
          private?static?final?Log?log?=?LogFactory.getLog(MM2.class);
          ????
          ????
          private?static?HashMap<String,?Integer>?dictionary?=?null;?
          ????
          private?static?final?int?WORD_MAX_LENGTH?=?9;
          ????
          private?Reader?reader;
          ????
          ????
          static
          ????
          {
          ????????loadDictionary();
          ????}

          ????
          ????
          public?MM2(Reader?reader)?
          ????
          {?
          ????????
          this.reader?=?reader;?
          ????}
          ?
          ????
          ????
          //切分出由中文、字母、數字組成的句子
          ????public?ArrayList<Sentence>?getSentence()?throws?IOException
          ????
          {???
          ????????ArrayList
          <Sentence>?list=new?ArrayList<Sentence>();
          ????????StringBuffer?cb
          =new?StringBuffer();
          ????????
          int?d=reader.read();
          ????????
          int?offset=0;
          ????????
          boolean?b=false;
          ????????
          while(d>-1)
          ????????
          {
          ????????????
          int?type=Character.getType(d);
          ????????????
          if(type==2?||?type==9?||?type==5)
          ????????????
          {
          ????????????????d
          =toAscii(d);
          ????????????????cb.append((
          char)d);
          ????????????}

          ????????????
          else
          ????????????
          {
          ????????????????b
          =true;
          ????????????}

          ????????????d
          =reader.read();
          ????????????
          if(d==-1?||?b)
          ????????????
          {
          ????????????????
          if(d==-1)?offset++;
          ????????????????b
          =false;
          ????????????????
          char[]?ioBuffer?=?new?char[cb.length()];
          ????????????????cb.getChars(
          0,?cb.length(),?ioBuffer,?0);
          ????????????????Sentence?sen
          =new?Sentence(ioBuffer,offset-cb.length());
          ????????????????list.add(sen);
          ????????????????cb.setLength(
          0);
          ????????????}

          ????????????offset
          ++;
          ????????}

          ????????
          return?list;
          ????}

          ????
          ????
          //將句子切分出詞
          ????public?ArrayList<Token>?getToken(ArrayList<Sentence>?list)?throws?IOException
          ????
          {
          ????????ArrayList
          <Token>?tokenlist=new?ArrayList<Token>();
          ????????
          for(Sentence?sen:list)
          ????????
          {
          ????????????StringBuffer?word?
          =?new?StringBuffer();
          ????????????
          int?offset=sen.getStartOffset();
          ????????????
          int?bufferIndex?=?0;
          ????????????
          char?c;
          ????????????
          boolean?b=false;
          ????????????
          while(bufferIndex<sen.getText().length)
          ????????????
          {
          ????????????????offset
          ++;
          ????????????????c
          =sen.getText()[bufferIndex++];
          ????????????????
          if(word.length()==0)
          ????????????????????word.append(c);
          ????????????????
          else
          ????????????????
          {
          ????????????????????String?temp?
          =?(word.toString()?+?c).intern();
          ????????????????????
          if(dictionary.containsKey(temp)?&&?dictionary.get(temp)==1)
          ????????????????????????word.append(c);
          ????????????????????
          else?if(dictionary.containsKey(temp)?&&?bufferIndex<sen.getText().length)
          ????????????????????????word.append(c);
          ????????????????????
          else
          ????????????????????
          {
          ????????????????????????bufferIndex
          --;
          ????????????????????????offset
          --;
          ????????????????????????
          while(word.length()>1?&&?dictionary.get(word.toString())!=null?&&?dictionary.get(word.toString())==2)
          ????????????????????????
          {
          ????????????????????????????word.deleteCharAt(word.length()
          -1);
          ????????????????????????????bufferIndex
          --;
          ????????????????????????????offset
          --;
          ????????????????????????}

          ????????????????????????b
          =true;
          ????????????????????}

          ????????????????}

          ????????????????
          if(b?||?bufferIndex==sen.getText().length)
          ????????????????
          {
          ????????????????????Token?token?
          =?new?Token(word.toString(),offset-word.length(),offset,"word");
          ????????????????????word.setLength(
          0);
          ????????????????????tokenlist.add(token);
          ????????????????????b
          =false;
          ????????????????}

          ????????????}

          ????????}

          ????????
          return?tokenlist;
          ????}

          ????
          ????
          //將相連的單個英文或數字組合成詞
          ????public?ArrayList<Token>?getNewToken(ArrayList<Token>?list)?throws?IOException
          ????
          {
          ????????ArrayList
          <Token>?tokenlist=new?ArrayList<Token>();
          ????????Token?word
          =null;
          ????????
          for(int?i=0;i<list.size();i++)
          ????????
          {
          ????????????Token?t
          =list.get(i);
          ????????????
          if(t.getWord().length()==1?&&?Character.getType((int)t.getWord().charAt(0))!=5)
          ????????????
          {
          ????????????????
          if(word==null)
          ????????????????????word
          =t;
          ????????????????
          else?if(word.getEnd()==t.getStart())
          ????????????????
          {
          ????????????????????word.setEnd(t.getEnd());
          ????????????????????word.setWord(word.getWord()
          +t.getWord());
          ????????????????}

          ????????????????
          else
          ????????????????
          {
          ????????????????????tokenlist.add(word);
          ????????????????????word
          =t;
          ????????????????}

          ????????????}

          ????????????
          else?if(word!=null)
          ????????????
          {
          ????????????????tokenlist.add(word);
          ????????????????word
          =null;
          ????????????????tokenlist.add(t);
          ????????????}

          ????????????
          else
          ????????????????tokenlist.add(t);
          ????????}

          ????????
          if(word!=null)
          ????????????tokenlist.add(word);
          ????????
          return?tokenlist;
          ????}

          ????
          ????
          //雙角轉單角
          ????public?static?int?toAscii(int?codePoint)?
          ????
          {
          ????????
          if((codePoint>=65296?&&?codePoint<=65305)????//0-9
          ????????????????||?(codePoint>=65313?&&?codePoint<=65338)????//A-Z
          ????????????????||?(codePoint>=65345?&&?codePoint<=65370)????//a-z
          ????????????????)
          ????????
          {????
          ????????????codePoint?
          -=?65248;
          ????????}

          ????????
          return?codePoint;
          ????}

          ????
          ????
          //加載詞典
          ????public?static?void?loadDictionary()?
          ????
          {??
          ????????
          if?(dictionary?==?null)?
          ????????
          {????
          ????????????dictionary?
          =?new?HashMap<String,?Integer>();????
          ????????????InputStream?is?
          =?null;????
          ????????????BufferedReader?br?
          =?null;????????????
          ????????????
          try
          ????????????
          {
          ????????????????is?
          =?new?FileInputStream(new?File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
          ????????????????br?
          =?new?BufferedReader(new?InputStreamReader(is,?"UTF-8"));
          ????????????????String?word?
          =?null;
          ????????????????
          while?((word?=?br.readLine())?!=?null)?
          ????????????????
          {
          ????????????????????word
          =word.toLowerCase();
          ????????????????????
          if?((word.indexOf("#")?==?-1)?&&?(word.length()?<=?WORD_MAX_LENGTH))
          ????????????????????
          {
          ????????????????????????dictionary.put(word.intern(),?
          1);????
          ????????????????????????
          int?i?=?word.length()-1;?
          ????????????????????????
          while(i?>=?2)
          ????????????????????????
          {
          ????????????????????????????String?temp?
          =?word.substring(0,?i).intern();?
          ????????????????????????????
          if?(!dictionary.containsKey(temp))
          ????????????????????????????????dictionary.put(temp,
          2);?
          ????????????????????????????i
          --;
          ????????????????????????}

          ????????????????????}

          ????????????????}

          ????????????}

          ????????????
          catch?(Exception?e)?
          ????????????
          {??????
          ????????????????log.info(e);
          ????????????}

          ????????????
          finally
          ????????????
          {
          ????????????????
          try?
          ????????????????
          {??????
          ????????????????????
          if(br!=null)
          ????????????????????????br.close();???
          ????????????????????
          if(is!=null)
          ????????????????????????is.close();??
          ????????????????}

          ????????????????
          catch?(IOException?e)
          ????????????????
          {?????
          ????????????????????log.info(e);
          ????????????????}
          ????????????
          ????????????}
          ?
          ????????}
          ?
          ????}

          ????
          ????
          public?static?String[]?segWords(Reader?input)
          ????
          {
          ????????ArrayList
          <String>?list=new?ArrayList<String>();
          ????????
          try
          ????????
          {
          ????????????MM2?f
          =new?MM2(input);
          ????????????ArrayList
          <Token>?tlist=?f.getNewToken(f.getToken(f.getSentence()));
          ????????????
          for(Token?t:tlist)
          ????????????
          {
          ????????????????list.add(t.getWord());
          ????????????}

          ????????}

          ????????
          catch(IOException?e)
          ????????
          {
          ????????????log.info(e);
          ????????}

          ????????
          return?(String[])list.toArray(new?String[0]);
          ????}

          ????
          ????
          public?static?void?main(String[]?args)?
          ????
          {
          ????????String[]?cc
          =MM2.segWords(new?StringReader("ibm商務機t60p".toLowerCase()));
          ????????
          for(String?c:cc)
          ????????
          {
          ????????????System.out.println(c);
          ????????}

          ????}

          }
          posted on 2011-08-05 08:34 禮物 閱讀(2122) 評論(2)  編輯  收藏

          評論

          # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2013-07-25 22:09 yi
          這是全的么,樓主?我導入到MyEclipse里好多錯誤呀,除了import包之外還有好多錯,看不懂。。。  回復  更多評論
            

          # re: 基于詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2013-08-22 20:01 love code
          麻煩 博主把dictionary.txt發給我吧,讓我學習學習
          1182787467@qq.com
          謝謝  回復  更多評論
            


          只有註冊用戶登錄後才能發表評論。

          網站導航:
           
          主站蜘蛛池模板: 琼结县| 大悟县| 黄石市| 措勤县| 江油市| 浦城县| 斗六市| 平湖市| 托克逊县| 隆德县| 嫩江县| 栖霞市| 宜州市| 鹤庆县| 鹤山市| 阳东县| 龙岩市| 明溪县| 辽宁省| 高尔夫| 嘉黎县| 项城市| 陆川县| 通山县| 淮北市| 泾阳县| 濮阳县| 玉树县| 宁城县| 盐池县| 龙泉市| 泾阳县| 红桥区| 西平县| 湾仔区| 丹东市| 汤阴县| 乐业县| 礼泉县| 巴南区| 长春市|