Rising Sun

            BlogJava :: 首頁 :: 新隨筆 :: 聯系 :: 聚合  :: 管理 ::
            148 隨筆 :: 0 文章 :: 22 評論 :: 0 Trackbacks

          看了網上許多關於 Lucene 分詞解析的文章,仍然一知半解,而且代碼比較老舊。為了透徹、系統、全面、深刻地了解分詞究竟是怎樣的一個過程,本文通過自定義一個分詞器來分析理解。其中分詞部分利用 ICTCLAS4j 接口實現,結構如下所示:


           

                   要實現自定義的 ICTCLAS4jAnalyzer,必須繼承 Analyzer 類,並重寫 createComponents 方法。直接上代碼——可以看到,它是從 StandardAnalyzer 類中直接複製過來的,只要把其中的 Tokenizer 實現替換為 ICTCLAS4jTokenizer 就搞定了。

           @Override

              protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {

                final ICTCLAS4jTokenizer src = new ICTCLAS4jTokenizer(reader);

                //src.setMaxTokenLength(maxTokenLength);

                TokenStream tok = new ICTCLAS4jFilter(matchVersion, src);

                tok = new LowerCaseFilter(matchVersion, tok);

                tok = new StopFilter(matchVersion, tok, STOP_WORDS_SET);

                return new TokenStreamComponents(src, tok) {

                  @Override

                  protected void setReader(final Reader reader) throws IOException {

                    //src.setMaxTokenLength(ICTCLAS4jAnalyzer.this.maxTokenLength);

                    super.setReader(reader);

                  }

                };

          }

           

                   ICTCLAS4jTokenizer 需重寫 incrementToken 方法,並設定 CharTermAttribute(存放詞條)和 OffsetAttribute(存放詞條的偏移地址)。構造函數中傳入需分詞的字符串,通過 ICTCLAS4j 返回分詞列表,再通過 incrementToken 逐個輸出分詞。代碼如下:

           

          package com.zhy.analysis.ictclas4j;

           

          import java.io.FileInputStream;

          import java.io.IOException;

          import java.io.InputStream;

          import java.io.Reader;

          import java.util.ArrayList;

           

          import org.apache.lucene.analysis.Tokenizer;

          import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

          import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

          import org.ictclas4j.bean.SegResult;

          import org.ictclas4j.segment.SegTag;

           

          /**

           * @author brockhong

           *

           */

           

          public class ICTCLAS4jTokenizer extends Tokenizer {

           

               private static SegTag segment;

               private StringBuilder sb = new StringBuilder();

               private ArrayList<String> words = new ArrayList<String>();

               private int startOffest = 0;

               private int length = 0;

               private int wordIdx = 0;

           

               public ICTCLAS4jTokenizer(Reader input) {

                   super(input);

                   char[] buf = new char[8192];

                   int d = -1;

                   try {

                        while ((d = input.read(buf)) != -1) {

                             sb.append(buf, 0, d);

                        }

                   } catch (IOException e) {

                        e.printStackTrace();

                   }

                   SegResult sr = seg().split(sb.toString());

                   words = sr.getWords();

               }

                   private static SegTag seg() {

                   try {

                        if (segment == null) {

                             final InputStream coreDictIn = new FileInputStream(

                                      "data/coreDict.dct");

                             final InputStream bigramDictIn = new FileInputStream(

                                      "data/BigramDict.dct");

                             final InputStream personTaggerDctIn = new FileInputStream(

                                      "data/nr.dct");

                             final InputStream personTaggerCtxIn = new FileInputStream(

                                      "data/nr.ctx");

                             final InputStream transPersonTaggerDctIn = new FileInputStream(

                                      "data/tr.dct");

                             final InputStream transPersonTaggerCtxIn = new FileInputStream(

                                      "data/tr.ctx");

                             final InputStream placeTaggerDctIn = new FileInputStream(

                                      "data/ns.dct");

                             final InputStream placeTaggerCtxIn = new FileInputStream(

                                      "data/ns.ctx");

                            final InputStream lexTaggerCtxIn = new FileInputStream(

                                      "data/lexical.ctx");

                             segment = new SegTag(1, coreDictIn, bigramDictIn,

                                      personTaggerDctIn, personTaggerCtxIn,

                                      transPersonTaggerDctIn, transPersonTaggerCtxIn,

                                      placeTaggerDctIn, placeTaggerCtxIn, lexTaggerCtxIn);

                        }

                   } catch (Exception e) {

                        e.printStackTrace();

                   }

                   return segment;

               }

               private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

               private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

           

               @Override

               public boolean incrementToken() throws IOException {

                   while (true) {

                        length = 0;

                        if (wordIdx < words.size()) {

                             String word = words.get(wordIdx);

           

                             termAtt.copyBuffer(word.toCharArray(), 0, word.length());

                             offsetAtt.setOffset(correctOffset(startOffest),

                                      correctOffset(startOffest + length));

                             wordIdx++;

                             startOffest += length;

                             return true;

                        } else {

                             return false;

                        }

           

                   }

               }

          }   

           

                   ICTCLAS4jFilter 分詞過濾器直接使用StandardAnalyzer的過濾器,作為自定義過濾器。

           

          ICTCLAS4j 的改造過程來自網上:修改 SegTag 的 outputResult,讓其輸出的分詞寫入到列表中,並修復了 ICTCLAS4j 在分詞中沒有內容時報錯的代碼。

          附上analyzer 測試類如下:

           

          import java.io.Reader;

          import org.apache.lucene.analysis.Analyzer;

          import org.apache.lucene.analysis.TokenStream;

          import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

          import org.apache.lucene.util.Version;

          import java.io.StringReader;

          import com.zhy.analysis.ictclas4j.ICTCLAS4jAnalyzer;

          /**

           *  @author brockhong

           */

          public class Ictclas4janalyzer {

                public static void main(String[] args) throws Exception {

                     Analyzer analyzer = new ICTCLAS4jAnalyzer(Version.LUCENE_45);

                    

                     Reader r = new StringReader("張萌萌是勤奮地漂亮的姑娘,/用一塊錢打造經濟的航空領域中的航空母艦地點在深圳。ABCD.#$% Hello World!\n又一段文本123 3.0");     

                     TokenStream ts=analyzer.tokenStream("fff", r);     

                      CharTermAttribute term=ts.addAttribute(CharTermAttribute.class); 

                  ts.reset(); 

                  while(ts.incrementToken()){ 

                      System.out.println(term.toString()); 

                  } 

                  ts.end(); 

                  ts.close(); 

                }

          }

          Lucene寫入測試類:

          import java.io.File;

          import java.io.IOException;

          import org.apache.lucene.analysis.Analyzer;

          import org.apache.lucene.analysis.standard.StandardAnalyzer;

          import org.apache.lucene.document.Document;

          import org.apache.lucene.document.StringField;

          import org.apache.lucene.document.TextField;

          import org.apache.lucene.document.Field.Store;

          import org.apache.lucene.index.IndexWriter;

          import org.apache.lucene.index.IndexWriterConfig;

          import org.apache.lucene.store.Directory;

          import org.apache.lucene.store.FSDirectory;

          import org.apache.lucene.util.Version;

          import com.zhy.analysis.ictclas4j.ICTCLAS4jAnalyzer;

          /** @author brockhong */

          public class Testictclas4j {

                        public static void main(String[] args) throws Exception {

                                      // 設置寫入目錄(好幾種呵呵)

                                      Directory d = FSDirectory.open(new File("D:/luceneTest2"));

                                      // 設置分詞 StandardAnalyzer(會把句子中的字單個分詞)

                                      Analyzer analyzer = new ICTCLAS4jAnalyzer(Version.LUCENE_45);

                                      // 設置索引寫入配置

                                      IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45,          analyzer);

                                      IndexWriter indexwriter = new IndexWriter(d, config);

                                      Document doc = new Document();

                                      doc.add(new StringField("id", "1", Store.YES));

                                      doc.add(new StringField("name", "brockhong", Store.YES));

                                      doc.add(new TextField("content",

                                                                  "張萌萌是勤奮地漂亮的姑娘,/用一塊錢打造經濟的航空領域中的航空母艦地點在深圳。ABCD.#$% Hello World!\n又一段文本123 3.0",Store.YES));

                                      // 寫入數據

                                      indexwriter.addDocument(doc);

                                      // 提交

                                      indexwriter.commit();             }}



          下載 jar:/Files/brock/ictclas4j.7z

          posted on 2015-01-07 10:11 brock 閱讀(1103) 評論(0)  編輯  收藏 所屬分類: Lucene

          只有注冊用戶登錄后才能發表評論。


          網站導航:
           
          主站蜘蛛池模板: 泰来县| 陆良县| 海兴县| 无极县| 晋中市| 奉节县| 乌鲁木齐市| 东港市| 汾阳市| 哈尔滨市| 南昌市| 吉隆县| 图片| 巴里| 新竹市| 永新县| 什邡市| 台东市| 兴隆县| 华容县| 韶山市| 石狮市| 理塘县| 天祝| 阳东县| 波密县| 九龙城区| 皋兰县| 兴海县| 老河口市| 吉水县| 嵊泗县| 石棉县| 台中市| 云林县| 交口县| 湖口县| 东乡| 大新县| 阿巴嘎旗| 特克斯县|