Lucene In Action ch 5 筆記 -- 高級搜索技術
----- 2006-2-15
該章介紹了Lucene的一些高級技術,如結果排序、搜索多個Index、過濾技術等。下面就看看這些高級技巧吧。
I.Sorting search results
在Lucene中有兩種特別的類型是用來排序的:Score和Index order。
要排序結果,可以使用IndexSearcher的重載search函數,提供一個Sort參數。看個例子。
SortingExample.java
01
?package?lia.advsearching;
02
?
03
?import?org.apache.commons.lang.StringUtils;
04
?import?org.apache.lucene.document.Document;
05
?import?org.apache.lucene.index.Term;
06
?import?org.apache.lucene.search.Hits;
07
?import?org.apache.lucene.search.IndexSearcher;
08
?import?org.apache.lucene.search.Query;
09
?import?org.apache.lucene.search.RangeQuery;
10
?import?org.apache.lucene.search.Sort;
11
?import?org.apache.lucene.search.SortField;
12
?import?org.apache.lucene.store.Directory;
13
?import?org.apache.lucene.store.FSDirectory;
14
?
15
?import?java.io.IOException;
16
?import?java.text.DecimalFormat;
17
?
18
?public?class?SortingExample?{
19
???private?Directory?directory;
20
?
21
???public?SortingExample(Directory?directory)?{
22
?????this.directory?=?directory;
23
???}
24
?? //
顯示搜索結(jié)果
25
???public?void?displayHits(Query?query,?Sort?sort)
26
???????throws?IOException?{
27
?????IndexSearcher?searcher?=?new?IndexSearcher(directory);
28
?
29
?????Hits?hits?=?searcher.search(query,?sort);? //
安
sort
來排序搜索結(jié)果
30
?
31
?????System.out.println("\nResults?for:?"?+
32
?????????query.toString()?+?"?sorted?by?"?+?sort);? //
打印
query
和
sort
33
?
34
?????System.out.println(StringUtils.rightPad("Title",?30)?+?? //
使用
StringUtils(
來自
Apache commons)
打印結(jié)果
35
?????????StringUtils.rightPad("pubmonth",?10)?+
36
?????????StringUtils.center("id",?4)?+
37
?????????StringUtils.center("score",?15));
38
?
39
?????DecimalFormat?scoreFormatter?=?new?DecimalFormat("0.######");
40
?????for?(int?i?=?0;?i?<?hits.length();?i++)?{???????????????? //
打印結(jié)果
41
???????Document?doc?=?hits.doc(i);
42
???????System.out.println(
43
???????????StringUtils.rightPad(
44
???????????????StringUtils.abbreviate(doc.get("title"),?29),?30)?+
45
???????????StringUtils.rightPad(doc.get("pubmonth"),?10)?+
46
???????????StringUtils.center(""?+?hits.id(i),?4)?+
47
???????????StringUtils.leftPad(
48
???????????????scoreFormatter.format(hits.score(i)),?12));
49
???????System.out.println("???"?+?doc.get("category"));
50
?//??????System.out.println(searcher.explain(query,?hits.id(i)));
51
?????}
52
?
53
?????searcher.close();
54
???}
55
?
56
???public?static?void?main(String[]?args)?throws?Exception?{
57
?????Term?earliest?=?new?Term("pubmonth",?"190001");
58
?????Term?latest?=?new?Term("pubmonth",?"201012");
59
?????RangeQuery?allBooks?=?new?RangeQuery(earliest,?latest,?true);???? //? query
60
?
61
?????String?indexDir?=?System.getProperty("index.dir");? // index
的目錄
62
?
63
?????FSDirectory?directory?=
64
?????????FSDirectory.getDirectory(indexDir,?false);
65
?????SortingExample?example?=?new?SortingExample(directory);
66
?
67
?????example.displayHits(allBooks,?Sort.RELEVANCE);? //
使用
Lucene
默認(rèn)的排序
68
?
69
?????example.displayHits(allBooks,?Sort.INDEXORDER);? //
根據(jù)
IndexOrder
排序
70
?
71
?????example.displayHits(allBooks,?new?Sort("category"));? //
根據(jù)
category
排序
72
?
73
?????example.displayHits(allBooks,?new?Sort("pubmonth",?true));? //
根據(jù)
pubmonth
排序
74
?
75
?????example.displayHits(allBooks,
76
?????????new?Sort(new?SortField[]{
77
???????????new?SortField("category"),
78
???????????SortField.FIELD_SCORE,
79
???????????new?SortField("pubmonth",?SortField.INT,?true)
80
?????????}));? ///
81
?
82
?
83
?????example.displayHits(allBooks,?new?Sort(new?SortField[]?{SortField.FIELD_SCORE,?new?SortField("category")}));
84
???}
85
?}
當sort參數是null、new Sort()或Sort.RELEVANCE時,使用的是Lucene的默認排序(按照Relevance的遞減排序)。默認搜索的結果如下:
先按照Score遞減排序,如果Score相同則按照Docnum遞增排序。
If the order in which documents were indexed is relevant, you can use Sort.INDEXORDER.
下面是其輸出結果(按照ID來排序):
要利用Field排序,該field要滿足第二章中關於排序的要求(參考我的Blog上的內容)。下面是使用category field的輸出。
默認的field排序是按照自然順序,利用Sort的重載函數,提供一個reverse參數可以改變順序。結果如下:
example.displayHits(allBooks, new Sort("pubmonth", true)); 提供了true參數。
還可以根據多個Field排序。用法如下:
example.displayHits(allBooks,
new Sort(new SortField[]{
new SortField("category"),
SortField.FIELD_SCORE,
new SortField("pubmonth", SortField.INT, true)
}));
結果如下:
當使用SortField.STRING類型來排序時,結果可能會跟Locale有關,可以使用如下方法設置:
public SortField (String field, Locale locale)
public SortField (String field, Locale locale, boolean reverse)
排序時要占用更多的資源,這一點值得注意。
II. 使用PhrasePrefixQuery
PhrasePrefixQuery可以說是PhraseQuery的一個增強版,可以在同一個位置放置多個term,slop的設置和PhraseQuery的一樣。看個例子:
01
?package?lia.advsearching;
02
?
03
?import?junit.framework.TestCase;
04
?import?org.apache.lucene.analysis.WhitespaceAnalyzer;
05
?import?org.apache.lucene.document.Document;
06
?import?org.apache.lucene.document.Field;
07
?import?org.apache.lucene.index.IndexWriter;
08
?import?org.apache.lucene.index.Term;
09
?import?org.apache.lucene.search.BooleanQuery;
10
?import?org.apache.lucene.search.Hits;
11
?import?org.apache.lucene.search.IndexSearcher;
12
?import?org.apache.lucene.search.PhrasePrefixQuery;
13
?import?org.apache.lucene.search.PhraseQuery;
14
?import?org.apache.lucene.store.RAMDirectory;
15
?
16
?import?java.io.IOException;
17
?
18
?public?class?PhrasePrefixQueryTest?extends?TestCase?{
19
???private?IndexSearcher?searcher;
20
?
21
???protected?void?setUp()?throws?Exception?{
22
?????RAMDirectory?directory?=?new?RAMDirectory();
23
?????IndexWriter?writer?=?new?IndexWriter(directory,
24
?????????new?WhitespaceAnalyzer(),?true);
25
?????Document?doc1?=?new?Document();
26
?????doc1.add(Field.Text("field",
27
???????????????"the?quick?brown?fox?jumped?over?the?lazy?dog"));?? ///
一個(gè)文檔含有
quick fox
28
?????writer.addDocument(doc1);
29
?????Document?doc2?=?new?Document();
30
?????doc2.add(Field.Text("field",
31
???????????????"the?fast?fox?hopped?over?the?hound"));???????????? ///
另一個(gè)文檔含有
fast fox
32
?????writer.addDocument(doc2);
33
?????writer.close();
34
?
35
?????searcher?=?new?IndexSearcher(directory);
36
???}
37
?
38
???public?void?testBasic()?throws?Exception?{
39
?????PhrasePrefixQuery?query?=?new?PhrasePrefixQuery();?? //
構(gòu)造一個(gè)
PhrasePrefixQuery
40
?????query.add(new?Term[]?{???????????????????????????????? //
搜索一個(gè)
含有
quick fox
或者
fast fox
的文檔
41
???????new?Term("field",?"quick"),
42
???????new?Term("field",?"fast")
43
?????});?????????????????????????????
44
?????query.add(new?Term("field",?"fox"));?????????????????????? //
默認(rèn)的
slop
45
?????System.out.println(query);
46
?
47
?????Hits?hits?=?searcher.search(query);
48
?????assertEquals("fast?fox?match",?1,?hits.length());
49
?
50
?????query.setSlop(1);????????????????????????????????????????? //
設(shè)置
slop
51
?????hits?=?searcher.search(query);
52
?????assertEquals("both?match",?2,?hits.length());
53
???}
54
?
55
???public?void?testAgainstOR()?throws?Exception?{
56
?????PhraseQuery?quickFox?=?new?PhraseQuery();
57
?????quickFox.setSlop(1);
58
?????quickFox.add(new?Term("field",?"quick"));
59
?????quickFox.add(new?Term("field",?"fox"));
60
?
61
?????PhraseQuery?fastFox?=?new?PhraseQuery();
62
?????fastFox.add(new?Term("field",?"fast"));
63
?????fastFox.add(new?Term("field",?"fox"));
64
?
65
?????BooleanQuery?query?=?new?BooleanQuery();???????????????? //
使用
BooleanQuery
和
PhraseQuery?
構(gòu)造和上面等級(jí)的搜索條件
66
?????query.add(quickFox,?false,?false);
67
?????query.add(fastFox,?false,?false);
68
?????Hits?hits?=?searcher.search(query);
69
?????assertEquals(2,?hits.length());
70
???}
71
?
72
?
73
???private?void?debug(Hits?hits)?throws?IOException?{
74
?????for?(int?i=0;?i?<?hits.length();?i++)?{
75
???????Document?doc?=?hits.doc(i);
76
???????System.out.println(hits.score(i)?+?":?"?+?doc.get("field"));
77
?????}
78
?
79
???}
80
?}
注意:
One difference between PhrasePrefixQuery and the BooleanQuery of PhraseQuery’s approach is that the slop factor is applied globally with PhrasePrefixQuery—it’s applied on a per-phrase basis with PhraseQuery.
Lucene’s QueryParser doesn’t currently support PhrasePrefixQuery.
III. Querying on multiple fields at once
MultiFieldQueryParser支持對多個字段進行同一個關鍵字的搜索。該類使用比較簡單,看看例子:
01
?package?lia.advsearching;
02
?
03
?import?lia.common.LiaTestCase;
04
?import?org.apache.lucene.analysis.SimpleAnalyzer;
05
?import?org.apache.lucene.queryParser.MultiFieldQueryParser;
06
?import?org.apache.lucene.search.Hits;
07
?import?org.apache.lucene.search.IndexSearcher;
08
?import?org.apache.lucene.search.Query;
09
?
10
?public?class?MultiFieldQueryParserTest?extends?LiaTestCase?{
11
???public?void?testDefaultOperator()?throws?Exception?{
12
?????Query?query?=?MultiFieldQueryParser.parse("development",
13
?????????new?String[]{"title",?"subject"},
14
?????????new?SimpleAnalyzer());
15
?
16
?????IndexSearcher?searcher?=?new?IndexSearcher(directory);
17
?????Hits?hits?=?searcher.search(query);
18
?
19
?????assertHitsIncludeTitle(hits,?"Java?Development?with?Ant");
20
?
21
?????//?has?"development"?in?the?subject?field
22
?????assertHitsIncludeTitle(hits,?"Extreme?Programming?Explained");
23
???}
24
?
25
???public?void?testSpecifiedOperator()?throws?Exception?{
26
?????Query?query?=?MultiFieldQueryParser.parse("development",
27
?????????new?String[]{"title",?"subject"},??????????????????????????? ///
在兩個(gè)
Field
中搜索
28
?????????new?int[]{MultiFieldQueryParser.REQUIRED_FIELD,
29
???????????????????MultiFieldQueryParser.REQUIRED_FIELD},
30
?????????new?SimpleAnalyzer());
31
?
32
?????IndexSearcher?searcher?=?new?IndexSearcher(directory);
33
?????Hits?hits?=?searcher.search(query);
34
?
35
?????assertHitsIncludeTitle(hits,?"Java?Development?with?Ant");
36
?????assertEquals("one?and?only?one",?1,?hits.length());
37
???}
38
?
39
?}
注意:
Generally speaking, querying on multiple fields isn’t the best practice for user-entered queries. More commonly, all words you want searched are indexed into a contents or keywords field by combining various fields. A synthetic contents field in our test environment uses this scheme to put author and subjects together:
doc.add(Field.UnStored("contents", author + " " + subjects));
We used a space (" ") between author and subjects to separate words for the analyzer. Allowing users to enter text in the simplest manner possible without the need to qualify field names generally makes for a less confusing user experience.
If you choose to use MultiFieldQueryParser, be sure your queries are fabricated appropriately using the QueryParser and Analyzer diagnostic techniques shown in chapters 3 and 4. Plenty of odd interactions with analysis occur using QueryParser, and these are compounded using MultiFieldQueryParser.