隨筆-23  評論-58  文章-0  trackbacks-0
          基於詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞。比如能分出這樣的詞:bb霜、3室、樂phone、touch4、mp3、T恤

          第一次寫中文分詞程序,歡迎拍磚。

          查看第2版:正向最大匹配分詞程序,能實現中英文數字混合分詞 (第二版)

          public class MM2 
          {
              
          private static final Log log = LogFactory.getLog(MM2.class);
              
              
          private static HashMap<String, Integer> dictionary = null
              
          private static final int WORD_MAX_LENGTH = 9;
              
          private Reader reader;
              
              
          static
              
          {
                  loadDictionary();
              }

              
              
          /**
           * Creates a segmenter that takes its input text from the given reader.
           *
           * @param reader source of the characters to segment; it is only stored here and
           *               is first read when {@link #getSentence()} is called
           */
          public MM2(Reader reader)
          {
              this.reader = reader;
          }
           
              
              
          //切分出由中文、字母、數字組成的句子
              public ArrayList<Sentence> getSentence() throws IOException
              
          {   
                  ArrayList
          <Sentence> list=new ArrayList<Sentence>();
                  StringBuffer cb
          =new StringBuffer();
                  
          int d=reader.read();
                  
          int offset=0;
                  
          boolean b=false;
                  
          while(d>-1)
                  
          {
                      
          int type=Character.getType(d);
                      
          if(type==2 || type==9 || type==5)
                      
          {
                          d
          =toAscii(d);
                          cb.append((
          char)d);
                      }

                      
          else
                      
          {
                          b
          =true;
                      }

                      d
          =reader.read();
                      
          if(d==-1 || b)
                      
          {
                          
          if(d==-1) offset++;
                          b
          =false;
                          
          char[] ioBuffer = new char[cb.length()];
                          cb.getChars(
          0, cb.length(), ioBuffer, 0);
                          Sentence sen
          =new Sentence(ioBuffer,offset-cb.length());
                          list.add(sen);
                          cb.setLength(
          0);
                      }

                      offset
          ++;
                  }

                  
          return list;
              }

              
              
          //將句子切分出詞
              public ArrayList<Token> getToken(ArrayList<Sentence> list) throws IOException
              
          {
                  ArrayList
          <Token> tokenlist=new ArrayList<Token>();
                  
          for(Sentence sen:list)
                  
          {
                      StringBuffer word 
          = new StringBuffer();
                      
          int offset=sen.getStartOffset();
                      
          int bufferIndex = 0;
                      
          char c;
                      
          boolean b=false;
                      
          while(bufferIndex<sen.getText().length)
                      
          {
                          offset
          ++;
                          c
          =sen.getText()[bufferIndex++];
                          
          if(word.length()==0)
                              word.append(c);
                          
          else
                          
          {
                              String temp 
          = (word.toString() + c).intern();
                              
          if(dictionary.containsKey(temp) && dictionary.get(temp)==1)
                                  word.append(c);
                              
          else if(dictionary.containsKey(temp) && bufferIndex<sen.getText().length)
                                  word.append(c);
                              
          else
                              
          {
                                  bufferIndex
          --;
                                  offset
          --;
                                  
          while(word.length()>1 && dictionary.get(word.toString())!=null && dictionary.get(word.toString())==2)
                                  
          {
                                      word.deleteCharAt(word.length()
          -1);
                                      bufferIndex
          --;
                                      offset
          --;
                                  }

                                  b
          =true;
                              }

                          }

                          
          if(b || bufferIndex==sen.getText().length)
                          
          {
                              Token token 
          = new Token(word.toString(),offset-word.length(),offset,"word");
                              word.setLength(
          0);
                              tokenlist.add(token);
                              b
          =false;
                          }

                      }

                  }

                  
          return tokenlist;
              }

              
              
          //將相連的單個英文或數字組合成詞
              public ArrayList<Token> getNewToken(ArrayList<Token> list) throws IOException
              
          {
                  ArrayList
          <Token> tokenlist=new ArrayList<Token>();
                  Token word
          =null;
                  
          for(int i=0;i<list.size();i++)
                  
          {
                      Token t
          =list.get(i);
                      
          if(t.getWord().length()==1 && Character.getType((int)t.getWord().charAt(0))!=5)
                      
          {
                          
          if(word==null)
                              word
          =t;
                          
          else if(word.getEnd()==t.getStart())
                          
          {
                              word.setEnd(t.getEnd());
                              word.setWord(word.getWord()
          +t.getWord());
                          }

                          
          else
                          
          {
                              tokenlist.add(word);
                              word
          =t;
                          }

                      }

                      
          else if(word!=null)
                      
          {
                          tokenlist.add(word);
                          word
          =null;
                          tokenlist.add(t);
                      }

                      
          else
                          tokenlist.add(t);
                  }

                  
          if(word!=null)
                      tokenlist.add(word);
                  
          return tokenlist;
              }

              
              
          //雙角轉單角
              public static int toAscii(int codePoint) 
              
          {
                  
          if((codePoint>=65296 && codePoint<=65305)    //0-9
                          || (codePoint>=65313 && codePoint<=65338)    //A-Z
                          || (codePoint>=65345 && codePoint<=65370)    //a-z
                          )
                  
          {    
                      codePoint 
          -= 65248;
                  }

                  
          return codePoint;
              }

              
              
          //加載詞典
              public static void loadDictionary() 
              
          {  
                  
          if (dictionary == null
                  
          {    
                      dictionary 
          = new HashMap<String, Integer>();    
                      InputStream is 
          = null;    
                      BufferedReader br 
          = null;            
                      
          try
                      
          {
                          is 
          = new FileInputStream(new File(MM2.class.getClassLoader().getResource("dictionary.txt").toURI()));
                          br 
          = new BufferedReader(new InputStreamReader(is, "UTF-8"));
                          String word 
          = null;
                          
          while ((word = br.readLine()) != null
                          
          {
                              word
          =word.toLowerCase();
                              
          if ((word.indexOf("#"== -1&& (word.length() <= WORD_MAX_LENGTH))
                              
          {
                                  dictionary.put(word.intern(), 
          1);    
                                  
          int i = word.length()-1
                                  
          while(i >= 2)
                                  
          {
                                      String temp 
          = word.substring(0, i).intern(); 
                                      
          if (!dictionary.containsKey(temp))
                                          dictionary.put(temp,
          2); 
                                      i
          --;
                                  }

                              }

                          }

                      }

                      
          catch (Exception e) 
                      
          {      
                          log.info(e);
                      }

                      
          finally
                      
          {
                          
          try 
                          
          {      
                              
          if(br!=null)
                                  br.close();   
                              
          if(is!=null)
                                  is.close();  
                          }

                          
          catch (IOException e)
                          
          {     
                              log.info(e);
                          }
                      
                      }
           
                  }
           
              }

              
              
          public static String[] segWords(Reader input)
              
          {
                  ArrayList
          <String> list=new ArrayList<String>();
                  
          try
                  
          {
                      MM2 f
          =new MM2(input);
                      ArrayList
          <Token> tlist= f.getNewToken(f.getToken(f.getSentence()));
                      
          for(Token t:tlist)
                      
          {
                          list.add(t.getWord());
                      }

                  }

                  
          catch(IOException e)
                  
          {
                      log.info(e);
                  }

                  
          return (String[])list.toArray(new String[0]);
              }

              
              
          public static void main(String[] args) 
              
          {
                  String[] cc
          =MM2.segWords(new StringReader("ibm商務機t60p".toLowerCase()));
                  
          for(String c:cc)
                  
          {
                      System.out.println(c);
                  }

              }

          }
          posted on 2011-08-04 15:31 nianzai 閱讀(3465) 評論(1)  編輯  收藏 所屬分類: 中文分詞

          評論:
          # re: 基於詞典的正向最大匹配中文分詞算法,能實現中英文數字混合分詞 2014-09-13 18:30 | 余道
          您好,您沒有給出Sentence和Token的定義,我猜不出啊

          hdwgz@qq.com  回復  更多評論
            
          主站蜘蛛池模板: 曲松县| 赤水市| 于都县| 汉阴县| 文登市| 泰顺县| 大同县| 阜康市| 固阳县| 浑源县| 柘城县| 武平县| 全南县| 商丘市| 靖安县| 水富县| 长宁县| 五指山市| 凤凰县| 邓州市| 泰来县| 崇信县| 红河县| 尤溪县| 昌平区| 长岛县| 新源县| 洪泽县| 金阳县| 句容市| 府谷县| 太和县| 洪江市| 天峻县| 天气| 伊春市| 喀喇沁旗| 万盛区| 永胜县| 永昌县| 马龙县|