posts - 40, comments - 7, trackbacks - 0

Lucene In Action ch 5 筆記 --高級搜索技術

----- 2006-2-15

該章介紹了Lucene的一些高級技術,如結果排序,搜索多個Index,過濾技術....下面就看看這些高級技巧吧.

I.Sorting search results

在Lucene中有兩種特別的類型是用來排序的:Score和Index order

要排序結果可以使用IndexSearcher的重載search函數,提供一個Sort參數.看個例子. SortingExample.java

01 ?package?lia.advsearching;
02 ?
03 ?import?org.apache.commons.lang.StringUtils;
04 ?import?org.apache.lucene.document.Document;
05 ?import?org.apache.lucene.index.Term;
06 ?import?org.apache.lucene.search.Hits;
07 ?import?org.apache.lucene.search.IndexSearcher;
08 ?import?org.apache.lucene.search.Query;
09 ?import?org.apache.lucene.search.RangeQuery;
10 ?import?org.apache.lucene.search.Sort;
11 ?import?org.apache.lucene.search.SortField;
12 ?import?org.apache.lucene.store.Directory;
13 ?import?org.apache.lucene.store.FSDirectory;
14 ?
15 ?import?java.io.IOException;
16 ?import?java.text.DecimalFormat;
17 ?
18 ?public?class?SortingExample?{
19 ???private?Directory?directory;
20 ?
21 ???public?SortingExample(Directory?directory)?{
22 ?????this.directory?=?directory;
23 ???}
24 ?? // 顯示搜索結果
25 ???public?void?displayHits(Query?query,?Sort?sort)
26 ???????throws?IOException?{
27 ?????IndexSearcher?searcher?=?new?IndexSearcher(directory);
28 ?
29 ?????Hits?hits?=?searcher.search(query,?sort);? // 安 sort 來排序搜索結果
30 ?
31 ?????System.out.println("\nResults?for:?"?+
32 ?????????query.toString()?+?"?sorted?by?"?+?sort);? // 打印 query 和 sort
33 ?
34 ?????System.out.println(StringUtils.rightPad("Title",?30)?+?? // 使用 StringUtils( 來自 Apache commons) 打印結果
35 ?????????StringUtils.rightPad("pubmonth",?10)?+
36 ?????????StringUtils.center("id",?4)?+
37 ?????????StringUtils.center("score",?15));
38 ?
39 ?????DecimalFormat?scoreFormatter?=?new?DecimalFormat("0.######");
40 ?????for?(int?i?=?0;?i?<?hits.length();?i++)?{???????????????? // 打印結果
41 ???????Document?doc?=?hits.doc(i);
42 ???????System.out.println(
43 ???????????StringUtils.rightPad(
44 ???????????????StringUtils.abbreviate(doc.get("title"),?29),?30)?+
45 ???????????StringUtils.rightPad(doc.get("pubmonth"),?10)?+
46 ???????????StringUtils.center(""?+?hits.id(i),?4)?+
47 ???????????StringUtils.leftPad(
48 ???????????????scoreFormatter.format(hits.score(i)),?12));
49 ???????System.out.println("???"?+?doc.get("category"));
50 ?//??????System.out.println(searcher.explain(query,?hits.id(i)));
51 ?????}
52 ?
53 ?????searcher.close();
54 ???}
55 ?
56 ???public?static?void?main(String[]?args)?throws?Exception?{
57 ?????Term?earliest?=?new?Term("pubmonth",?"190001");
58 ?????Term?latest?=?new?Term("pubmonth",?"201012");
59 ?????RangeQuery?allBooks?=?new?RangeQuery(earliest,?latest,?true);???? //? query
60 ?
61 ?????String?indexDir?=?System.getProperty("index.dir");? // index 的目錄
62 ?
63 ?????FSDirectory?directory?=
64 ?????????FSDirectory.getDirectory(indexDir,?false);
65 ?????SortingExample?example?=?new?SortingExample(directory);
66 ?
67 ?????example.displayHits(allBooks,?Sort.RELEVANCE);? // 使用 Lucene 默認的排序
68 ?
69 ?????example.displayHits(allBooks,?Sort.INDEXORDER);? // 根據 IndexOrder 排序
70 ?
71 ?????example.displayHits(allBooks,?new?Sort("category"));? // 根據 category 排序
72 ?
73 ?????example.displayHits(allBooks,?new?Sort("pubmonth",?true));? // 根據 pubmonth 排序
74 ?
75 ?????example.displayHits(allBooks,
76 ?????????new?Sort(new?SortField[]{
77 ???????????new?SortField("category"),
78 ???????????SortField.FIELD_SCORE,
79 ???????????new?SortField("pubmonth",?SortField.INT,?true)
80 ?????????}));? ///
81 ?
82 ?
83 ?????example.displayHits(allBooks,?new?Sort(new?SortField[]?{SortField.FIELD_SCORE,?new?SortField("category")}));
84 ???}
85 ?}

當sort 參數是null ,new Sort(),和Sort.RELEVANCE 時,使用的是Lucene的默認排序(按照Relevance的遞減排序), 默認搜索的結果如下:

先按照Score遞減排序如果Score相同則按照Docnum 遞增排序.

If the order documents were indexed is relevant, you can use Sort.INDEXORDER .

下面是其輸出結果:(按照ID來排序)

要利用Field排序,該field要滿足第二章排序(參考我的Blog上的內容)的要求. 下面是使用category field的輸出.

默認的field排序是按照自然排序,利用Sort的重載函數,提供一個reverse參數可以改變順序.結果如下:

example.displayHits(allBooks, new Sort("pubmonth", true)); 提供了 true 參數 .

還可以根據多個 Field 排序 . 用法如下 :

example.displayHits(allBooks,

new Sort(new SortField[]{

new SortField("category"),

SortField.FIELD_SCORE,

new SortField("pubmonth", SortField.INT, true)

}));

結果如下 :

當使用 SortField.STRING 類型來排序時 , 結果可能會跟 Locale 有關 , 可以使用如下方法設置

public SortField (String field, Locale locale)

public SortField (String field, Locale locale, boolean reverse)

在排序時候 , 要占用更多的資源 . 這一點值得注意 .

II. 使用 PhrasePrefixQuery

PhrasePrefixQuery 可以說是 PhraseQuery 的一個增強版 , 可以在同一個位置放置多個 term,slop 設置和 PhraseQuery 的一樣 . 看個例子

01 ?package?lia.advsearching;
02 ?
03 ?import?junit.framework.TestCase;
04 ?import?org.apache.lucene.analysis.WhitespaceAnalyzer;
05 ?import?org.apache.lucene.document.Document;
06 ?import?org.apache.lucene.document.Field;
07 ?import?org.apache.lucene.index.IndexWriter;
08 ?import?org.apache.lucene.index.Term;
09 ?import?org.apache.lucene.search.BooleanQuery;
10 ?import?org.apache.lucene.search.Hits;
11 ?import?org.apache.lucene.search.IndexSearcher;
12 ?import?org.apache.lucene.search.PhrasePrefixQuery;
13 ?import?org.apache.lucene.search.PhraseQuery;
14 ?import?org.apache.lucene.store.RAMDirectory;
15 ?
16 ?import?java.io.IOException;
17 ?
18 ?public?class?PhrasePrefixQueryTest?extends?TestCase?{
19 ???private?IndexSearcher?searcher;
20 ?
21 ???protected?void?setUp()?throws?Exception?{
22 ?????RAMDirectory?directory?=?new?RAMDirectory();
23 ?????IndexWriter?writer?=?new?IndexWriter(directory,
24 ?????????new?WhitespaceAnalyzer(),?true);
25 ?????Document?doc1?=?new?Document();
26 ?????doc1.add(Field.Text("field",
27 ???????????????"the?quick?brown?fox?jumped?over?the?lazy?dog"));?? /// 一個文檔含有 quick fox
28 ?????writer.addDocument(doc1);
29 ?????Document?doc2?=?new?Document();
30 ?????doc2.add(Field.Text("field",
31 ???????????????"the?fast?fox?hopped?over?the?hound"));???????????? /// 另一個文檔含有 fast fox
32 ?????writer.addDocument(doc2);
33 ?????writer.close();
34 ?
35 ?????searcher?=?new?IndexSearcher(directory);
36 ???}
37 ?
38 ???public?void?testBasic()?throws?Exception?{
39 ?????PhrasePrefixQuery?query?=?new?PhrasePrefixQuery();?? // 構造一個 PhrasePrefixQuery
40 ?????query.add(new?Term[]?{???????????????????????????????? // 搜索一個 含有 quick fox 或者 fast fox 的文檔
41 ???????new?Term("field",?"quick"),
42 ???????new?Term("field",?"fast")
43 ?????});?????????????????????????????
44 ?????query.add(new?Term("field",?"fox"));?????????????????????? // 默認的 slop
45 ?????System.out.println(query);
46 ?
47 ?????Hits?hits?=?searcher.search(query);
48 ?????assertEquals("fast?fox?match",?1,?hits.length());
49 ?
50 ?????query.setSlop(1);????????????????????????????????????????? // 設置 slop
51 ?????hits?=?searcher.search(query);
52 ?????assertEquals("both?match",?2,?hits.length());
53 ???}
54 ?
55 ???public?void?testAgainstOR()?throws?Exception?{
56 ?????PhraseQuery?quickFox?=?new?PhraseQuery();
57 ?????quickFox.setSlop(1);
58 ?????quickFox.add(new?Term("field",?"quick"));
59 ?????quickFox.add(new?Term("field",?"fox"));
60 ?
61 ?????PhraseQuery?fastFox?=?new?PhraseQuery();
62 ?????fastFox.add(new?Term("field",?"fast"));
63 ?????fastFox.add(new?Term("field",?"fox"));
64 ?
65 ?????BooleanQuery?query?=?new?BooleanQuery();???????????????? // 使用 BooleanQuery 和 PhraseQuery? 構造和上面等級的搜索條件
66 ?????query.add(quickFox,?false,?false);
67 ?????query.add(fastFox,?false,?false);
68 ?????Hits?hits?=?searcher.search(query);
69 ?????assertEquals(2,?hits.length());
70 ???}
71 ?
72 ?
73 ???private?void?debug(Hits?hits)?throws?IOException?{
74 ?????for?(int?i=0;?i?<?hits.length();?i++)?{
75 ???????Document?doc?=?hits.doc(i);
76 ???????System.out.println(hits.score(i)?+?":?"?+?doc.get("field"));
77 ?????}
78 ?
79 ???}
80 ?}

注意:

One difference between PhrasePrefixQuery and the BooleanQuery of PhraseQuery’s approach is that the slop factor is applied globally with PhrasePrefixQuery — it’s applied on a per-phrase basis with PhraseQuery.

- Lucene’s QueryParser doesn’t currently support PhrasePrefixQuery.

III. Querying on multiple fields at once

MultiFieldQueryParser支持對于多個字段進行同一個關鍵字的搜索. 該類使用比較簡單看看例子:

01 ?package?lia.advsearching;
02 ?
03 ?import?lia.common.LiaTestCase;
04 ?import?org.apache.lucene.analysis.SimpleAnalyzer;
05 ?import?org.apache.lucene.queryParser.MultiFieldQueryParser;
06 ?import?org.apache.lucene.search.Hits;
07 ?import?org.apache.lucene.search.IndexSearcher;
08 ?import?org.apache.lucene.search.Query;
09 ?
10 ?public?class?MultiFieldQueryParserTest?extends?LiaTestCase?{
11 ???public?void?testDefaultOperator()?throws?Exception?{
12 ?????Query?query?=?MultiFieldQueryParser.parse("development",
13 ?????????new?String[]{"title",?"subject"},
14 ?????????new?SimpleAnalyzer());
15 ?
16 ?????IndexSearcher?searcher?=?new?IndexSearcher(directory);
17 ?????Hits?hits?=?searcher.search(query);
18 ?
19 ?????assertHitsIncludeTitle(hits,?"Java?Development?with?Ant");
20 ?
21 ?????//?has?"development"?in?the?subject?field
22 ?????assertHitsIncludeTitle(hits,?"Extreme?Programming?Explained");
23 ???}
24 ?
25 ???public?void?testSpecifiedOperator()?throws?Exception?{
26 ?????Query?query?=?MultiFieldQueryParser.parse("development",
27 ?????????new?String[]{"title",?"subject"},??????????????????????????? /// 在兩個 Field 中搜索
28 ?????????new?int[]{MultiFieldQueryParser.REQUIRED_FIELD,
29 ???????????????????MultiFieldQueryParser.REQUIRED_FIELD},
30 ?????????new?SimpleAnalyzer());
31 ?
32 ?????IndexSearcher?searcher?=?new?IndexSearcher(directory);
33 ?????Hits?hits?=?searcher.search(query);
34 ?
35 ?????assertHitsIncludeTitle(hits,?"Java?Development?with?Ant");
36 ?????assertEquals("one?and?only?one",?1,?hits.length());
37 ???}
38 ?
39 ?}

注意:

Generally speaking, querying on multiple fields isn’t the best practice for user-entered queries. More commonly, all words you want searched are indexed into a contents or keywords field by combining various fields. A synthetic contents field in our test environment uses this scheme to put author and subjects together:

doc.add(Field.UnStored("contents", author + " " + subjects));

We used a space ( " " ) between author and subjects to separate words for the analyzer. Allowing users to enter text in the simplest manner possible without the need to qualify field names generally makes for a less confusing user experience.

If you choose to use MultiFieldQueryParser, be sure your queries are fabricated appropriately using the QueryParser and Analyzer diagnostic techniques shown in chapters 3 and 4. Plenty of odd interactions with analysis occur using QueryParser, and these are compounded using MultiFieldQueryParser.