隨筆-23  評論-58  文章-0  trackbacks-0
          基於詞典的逆向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤。實際分詞效果比正向最大匹配分詞效果好。

          查看第2版:逆向最大匹配分詞程序,能實現中英文數字混合分詞 (第二版)

          public class RMM
          {
              
          private static final Log log = LogFactory.getLog(RMM.class);
              
              
          private static HashMap<String, Integer> dictionary = null
              
          private static final int WORD_MAX_LENGTH = 9;
              
              
          static
              
          {
                  loadDictionary();
              }

              
              
          //將句子切分出詞,逆向最大匹配
              public static ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
              
          {
                  Collections.reverse(list);
                  ArrayList
          <Token> tokenlist=new ArrayList<Token>();
                  
          for(Sentence sen:list)
                  
          {
                      StringBuffer word 
          = new StringBuffer();
                      
          int offset=sen.getStartOffset()+sen.getText().length;
                      
          int bufferIndex = sen.getText().length-1;
                      
          char c;
                      
          boolean b=false;
                      
          while(bufferIndex>-1)
                      
          {
                          offset
          --;
                          c
          =sen.getText()[bufferIndex--];
                          
          if(word.length()==0)
                              word.append(c);
                          
          else
                          
          {
                              String temp 
          = (c+word.toString()).intern();
                              
          if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
                                  word.insert(
          0, c);
                              
          else if(dictionary.containsKey(temp) && bufferIndex>-1)
                                  word.insert(
          0, c);
                              
          else
                              
          {
                                  bufferIndex
          ++;
                                  offset
          ++;
                                  
          while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2)
                                  
          {
                                      word.deleteCharAt(
          0);
                                      bufferIndex
          ++;
                                      offset
          ++;
                                  }

                                  b
          =true;
                              }

                          }

                          
          if(b || bufferIndex==-1)
                          
          {
                              Token token 
          = new Token(word.toString(),offset,offset+word.length(),"word");
                              word.setLength(
          0);
                              tokenlist.add(token);
                              b
          =false;
                          }

                      }

                  }

                  Collections.reverse(tokenlist);
                  
          return tokenlist;
              }

              
              
          //加載詞典
              public static void loadDictionary() 
              
          {  
                  
          if (dictionary == null
                  
          {    
                      dictionary 
          = new HashMap<String, Integer>();    
                      InputStream is 
          = null;    
                      BufferedReader br 
          = null;            
                      
          try
                      
          {
                          is 
          = new FileInputStream(new File(RMM.class.getClassLoader().getResource("dictionary.txt").toURI()));
                          br 
          = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                          String word 
          = null;
                          
          while ((word = br.readLine()) != null
                          
          {
                              word
          =word.toLowerCase();
                              
          if ((word.indexOf("#"== -1&& (word.length() <= WORD_MAX_LENGTH))
                              
          {
                                  dictionary.put(word.intern(), 
          1);    
                                  
          int i = 1
                                  
          while(i < word.length()-1)
                                  
          {
                                      String temp 
          = word.substring(i,word.length()).intern(); 
                                      
          if (!dictionary.containsKey(temp))
                                          dictionary.put(temp,
          2); 
                                      i
          ++;
                                  }

                              }

                          }

                      }

                      
          catch (Exception e) 
                      
          {      
                          log.info(e);
                      }

                      
          finally
                      
          {
                          
          try 
                          
          {      
                              
          if(br!=null)
                                  br.close();   
                              
          if(is!=null)
                                  is.close();  
                          }

                          
          catch (IOException e)
                          
          {     
                              log.info(e);
                          }
                      
                      }
           
                  }
           
              }

              
              
          public static String[] segWords(Reader reader)
              
          {
                  ArrayList
          <String> list=new ArrayList<String>();
                  
          try
                  
          {
                      ArrayList
          <Token> tlist= Util.getNewToken(getToken(Util.getSentence(reader)));
                      
          for(Token t:tlist)
                      
          {
                          list.add(t.getWord());
                      }

                  }

                  
          catch(IOException e)
                  
          {
                      log.info(e);
                  }

                  
          return (String[])list.toArray(new String[0]);
              }

              
              
          public static void main(String[] args) 
               
          {
                  String[] cc
          =RMM.segWords(new StringReader("急、急、急、花里林居,二房二廳,業主誠心,出租".toLowerCase()));
                  
          for(String c:cc)
                  
          {
                      System.out.println(c);
                  }

              }

          }


          public class Util
          {
           //切分出由中文、字母、數字組成的句子
           public static ArrayList<Sentence> getSentence(Reader reader) throws IOException
           {  
            ArrayList<Sentence> list=new ArrayList<Sentence>();
            StringBuffer cb=new StringBuffer();
            int d=reader.read();
            int offset=0;
            boolean b=false;
            while(d>-1)
            {
             int type=Character.getType(d);
             if(type==2 || type==9 || type==5)
             {
              d=toAscii(d);
              cb.append((char)d);
             }
             else
             {
              b=true;
             }
             d=reader.read();
             if(d==-1 || b)
             {
              if(d==-1) offset++;
              b=false;
              char[] ioBuffer = new char[cb.length()];
              cb.getChars(0, cb.length(), ioBuffer, 0);
              Sentence sen=new Sentence(ioBuffer,offset-cb.length());
              list.add(sen);
              cb.setLength(0);
             }
             offset++;
            }
            return list;
           }
           
           //將相連的單個英文或數字組合成詞
           public static ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
           {
            ArrayList<Token> tokenlist=new ArrayList<Token>();
            Token word=null;
            for(int i=0;i<list.size();i++)
            {
             Token t=list.get(i);
             if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5)
             {
              if(word==null)
               word=t;
              else if(word.getEnd()==t.getStart())
              {
               word.setEnd(t.getEnd());
               word.setWord(word.getWord()+t.getWord());
              }
              else
              {
               tokenlist.add(word);
               word=t;
              }
             }
             else if(word!=null)
             {
              tokenlist.add(word);
              word=null;
              tokenlist.add(t);
             }
             else
              tokenlist.add(t);
            }
            if(word!=null)
             tokenlist.add(word);
            return tokenlist;
           }
           
           //雙角轉單角
           public static int toAscii(int codePoint)
           {
            if((codePoint>=65296 && codePoint<=65305) //0-9
              || (codePoint>=65313 && codePoint<=65338) //A-Z
              || (codePoint>=65345 && codePoint<=65370) //a-z
              )
            { 
             codePoint -= 65248;
            }
            return codePoint;
           }
          }








          posted on 2011-08-19 13:22 nianzai 閱讀(4489) 評論(2)  編輯  收藏 所屬分類: 中文分詞

          評論:
          # re: 基于詞典的逆向最大匹配中文分詞算法,逆向分詞比正向分詞效果好 [未登錄] 2011-10-21 16:38 | zxj
          樓主,代碼中的Sentence 類呢?  回復  更多評論
            
          # re: 基于詞典的逆向最大匹配中文分詞算法,逆向分詞比正向分詞效果好 2011-11-08 15:55 | nianzai
          參考正向最大匹配中文分詞算法  回復  更多評論
            
          主站蜘蛛池模板: 蒙山县| 西和县| 娄底市| 满洲里市| 永修县| 滕州市| 定边县| 凉城县| 唐河县| 蓬安县| 甘孜| 汾西县| 壶关县| 霸州市| 怀柔区| 满洲里市| 全州县| 商河县| 黄骅市| 营口市| 社会| 新龙县| 达拉特旗| 丰原市| 临武县| 五常市| 肇庆市| 九台市| 襄樊市| 莎车县| 韶山市| 勃利县| 金溪县| 宣汉县| 区。| 公主岭市| 玉林市| 临武县| 祁东县| 广丰县| 仙桃市|