隨筆-295  評論-26  文章-1  trackbacks-0
          字符串切分
          ?package demo.analysis;
          ?
          ?import java.io.IOException;
          ?
          ?import jeasy.analysis.MMAnalyzer;
          ?
          ?public class Segment?
          ?{
          ????????
          ???? public static void main(String[] args)?
          ???? {
          ???????? String text = "據路透社報道,印度尼西亞社會事務部一官員星期二(29日)表示,"
          ??????????????? + "日惹市附近當地時間27日晨5時53分發生的里氏6.2級地震已經造成至少5427人死亡,"
          ??????????????? + "20000余人受傷,近20萬人無家可歸。";?
          ?????????
          ???????? MMAnalyzer analyzer = new MMAnalyzer();
          ???????? try?
          ???????? {
          ???????????????? System.out.println(analyzer.segment(text, " | "));
          ???????? }?
          ???????? catch (IOException e)?
          ???????? {
          ???????????????? e.printStackTrace();
          ???????? }
          ???? }
          ?}
          ?
          生成效果:
          據 | 路透社 | 報道 | 印度尼西亞 | 社會 | 事務 | 部 | 官員 | 星期二 | 29日 | 表示 | 日惹 | 市 |
          附近 | 當地時間 | 27日 | 晨 | 5時 | 53分 | 發生 | 里氏 | 6.2級 | 地震 | 已經 | 造成 | 至少 |
          5427人 | 死亡 | 20000 | 余人 | 受傷 | 近 | 20萬人 | 無家可歸 |
          ?
          Lucene搜索
          ?package demo.analysis;
          ?
          ?import jeasy.analysis.MMAnalyzer;
          ?
          ?import org.apache.lucene.analysis.Analyzer;
          ?import org.apache.lucene.document.Document;
          ?import org.apache.lucene.document.Field;
          ?import org.apache.lucene.index.IndexWriter;
          ?import org.apache.lucene.queryParser.QueryParser;
          ?import org.apache.lucene.search.Hits;
          ?import org.apache.lucene.search.IndexSearcher;
          ?import org.apache.lucene.search.Query;
          ?import org.apache.lucene.store.Directory;
          ?import org.apache.lucene.store.RAMDirectory;
          ?
          ?public class Segment?
          ?{
          ?????
          ???? public static void main(String[] args)?
          ???? {
          ???????? String fieldName = "text";
          ???????? String text = "據路透社報道,印度尼西亞社會事務部一官員星期二(29日)表示,"
          ???????????? + "日惹市附近當地時間27日晨5時53分發生的里氏6.2級地震已經造成至少5427人死亡,"
          ???????????? + "20000余人受傷,近20萬人無家可歸。"; //檢索內容
          ?
          ???????? //采用正向最大匹配的中文分詞算法
          ???????? Analyzer analyzer = new MMAnalyzer();
          ?
          ???????? Directory directory = new RAMDirectory();
          ???????? //Directory directory = FSDirectory.getDirectory("/tmp/testindex", true);
          ?
          ???????? try
          ????????? {
          ???????????? IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
          ???????????? iwriter.setMaxFieldLength(25000);
          ???????????? Document doc = new Document();
          ???????????? doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.TOKENIZED));
          ???????????? iwriter.addDocument(doc);
          ???????????? iwriter.close();
          ?????????????
          ???????????? IndexSearcher isearcher = new IndexSearcher(directory);
          ???????????? QueryParser parser = new QueryParser(fieldName, analyzer);
          ???????????? Query query = parser.parse("印度尼西亞 6.2級地震");//檢索詞
          ???????????? Hits hits = isearcher.search(query);
          ???????????? System.out.println("命中:" + hits.length());
          ?
          ???????????? for (int i = 0; i < hits.length(); i++)?
          ???????????? {
          ???????????????? Document hitDoc = hits.doc(i);
          ???????????????? System.out.println("內容:" + hitDoc.get(fieldName));
          ???????????? }
          ?
          ???????????? isearcher.close();
          ???????????? directory.close();
          ???????? }?
          ???????? catch (Exception e)?
          ???????? {
          ???????????? e.printStackTrace();
          ???????? }???
          ???? }
          ?????
          ?}
          ?
          生成效果:
          命中:1
          內容:據路透社報道,印度尼西亞社會事務部一官員星期二(29日)表示,日惹市附近當地時間27日晨5時53分發生的
          里氏6.2級地震已經造成至少5427人死亡,20000余人受傷,近20萬人無家可歸。
          搜索詞加亮
          ?package demo.analysis;
          ?
          ?import jeasy.analysis.MMAnalyzer;
          ?
          ?import org.apache.lucene.analysis.Analyzer;
          ?import org.apache.lucene.analysis.TokenStream;
          ?import org.apache.lucene.document.Document;
          ?import org.apache.lucene.document.Field;
          ?import org.apache.lucene.index.IndexReader;
          ?import org.apache.lucene.index.IndexWriter;
          ?import org.apache.lucene.index.TermPositionVector;
          ?import org.apache.lucene.queryParser.QueryParser;
          ?import org.apache.lucene.search.Hits;
          ?import org.apache.lucene.search.IndexSearcher;
          ?import org.apache.lucene.search.Query;
          ?import org.apache.lucene.search.highlight.Highlighter;
          ?import org.apache.lucene.search.highlight.QueryScorer;
          ?import org.apache.lucene.search.highlight.TokenSources;
          ?import org.apache.lucene.store.Directory;
          ?import org.apache.lucene.store.RAMDirectory;
          ?
          ?public class Segment
          ?{
          ?
          ???? public static void main(String[] args)
          ???? {
          ???????? String fieldName = "text";
          ???????? String text = "據路透社報道,印度尼西亞社會事務部一官員星期二(29日)表示,"
          ???????????? + "日惹市附近當地時間27日晨5時53分發生的里氏6.2級地震已經造成至少5427人死亡,"
          ???????????? + "20000余人受傷,近20萬人無家可歸。"; //檢索內容
          ?
          ???????? //采用正向最大匹配的中文分詞算法
          ???????? Analyzer analyzer = new MMAnalyzer();
          ?
          ???????? Directory directory = new RAMDirectory();
          ???????? //Directory directory = FSDirectory.getDirectory("/tmp/testindex", true);
          ?
          ???????? try
          ???????? {
          ???????????? IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
          ???????????? iwriter.setMaxFieldLength(25000);
          ???????????? Document doc = new Document();
          ???????????? doc.add(new Field(fieldName, text, Field.Store.YES,
          ???????????????????? Field.Index.TOKENIZED,
          ???????????????????? Field.TermVector.WITH_POSITIONS_OFFSETS));
          ???????????? iwriter.addDocument(doc);
          ???????????? iwriter.close();
          ?
          ???????????? IndexSearcher isearcher = new IndexSearcher(directory);
          ???????????? QueryParser parser = new QueryParser(fieldName, analyzer);
          ???????????? Query query = parser.parse("印度尼西亞 6.2級地震");//檢索詞
          ???????????? Hits hits = isearcher.search(query);
          ???????????? System.out.println("命中:" + hits.length());
          ?
          ???????????? Highlighter highlighter = new Highlighter(new QueryScorer(query));
          ???????????? for (int i = 0; i < hits.length(); i++)
          ???????????? {
          ???????????????? text = hits.doc(i).get(fieldName);
          ???????????????? TermPositionVector tpv = (TermPositionVector) IndexReader.open(
          ???????????????????? directory).getTermFreqVector(hits.id(i), fieldName);
          ???????????????? TokenStream tokenStream = TokenSources.getTokenStream(tpv);
          ???????????????? String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
          ???????????????? System.out.println("內容:" + result);
          ???????????? }
          ?
          ???????????? isearcher.close();
          ???????????? directory.close();
          ???????? }
          ???????? catch (Exception e)
          ???????? {
          ???????????? e.printStackTrace();
          ???????? }
          ???? }
          ?
          ?}
          ?
          生成效果:
          命中:1
          內容:據路透社報道,<B>印度尼西亞</B>社會事務部一官員星期二(29日)表示,日惹市附近當地時間27日晨
          5時53分發生的里氏<B>6.2級</B><B>地震</B>已經造成至少5427人死亡,20000余人受傷,近20萬人無家可歸


          大盤預測 國富論
          posted on 2008-10-29 10:21 華夢行 閱讀(903) 評論(0)  編輯  收藏

          只有注冊用戶登錄后才能發表評論。


          網站導航:
          博客園   IT新聞   Chat2DB   C++博客   博問  
           
          主站蜘蛛池模板: 翁源县| 永德县| 邢台县| 武安市| 泰安市| 仁寿县| 浙江省| 保靖县| 屏东市| 沭阳县| 福鼎市| 新建县| 莱阳市| 青浦区| 苏尼特右旗| 巩留县| 无极县| 武乡县| 炎陵县| 濮阳市| 大连市| 大安市| 察雅县| 汨罗市| 吉安县| 阿克苏市| 镇坪县| 华容县| 朝阳市| 轮台县| 新密市| 庆阳市| 卢湾区| 延安市| 阿拉尔市| 通州市| 昌邑市| 新田县| 海口市| 措勤县| 琼结县|