??xml version="1.0" encoding="utf-8" standalone="yes"?>
/**
 * Reads an Excel (.xls) or plain-text file line by line.
 */
import java.io.*;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import java.util.Date;
import org.apache.poi.hssf.usermodel.HSSFRow;
public class ExcelReader {
    // Reader used when the input is a plain-text (.txt) file.
    private BufferedReader reader = null;
    // File-name extension of the input ("txt" or "xls").
    private String filetype;
    // Raw binary stream of the input file.
    private InputStream is = null;
    // Index of the sheet currently being read.
    private int currSheet;
    // Index of the next row to read within the current sheet.
    private int currPosition;
    // Number of sheets in the workbook.
    private int numOfSheets;
    // POI workbook handle; only set for .xls input.
    HSSFWorkbook workbook = null;
    // Separator placed between cell values in a returned line.
    private static String EXCEL_LINE_DELIMITER = " ";
    // Last row number of the sheet most recently touched by readLine().
    public int rows = 0;

    public int getRows() {
        return rows;
    }

    /**
     * Opens the given file for reading; .txt and .xls are supported.
     *
     * @param inputfile path of the file to read
     * @throws IOException if no input file is specified
     * @throws Exception   if the extension is neither txt nor xls
     */
    public ExcelReader(String inputfile) throws IOException, Exception {
        if (inputfile == null || inputfile.trim().equals("")) {
            throw new IOException("no input file specified");
        }
        // The extension after the last dot decides how the file is parsed.
        this.filetype = inputfile.substring(inputfile.lastIndexOf(".") + 1);
        currPosition = 0; // start at the first row
        currSheet = 0;    // start at the first sheet
        is = new FileInputStream(inputfile);
        if (filetype.equalsIgnoreCase("txt")) {
            // Plain text: read through a BufferedReader.
            reader = new BufferedReader(new InputStreamReader(is));
        }
        else if (filetype.equalsIgnoreCase("xls")) {
            // Excel: parse the whole workbook with POI.
            workbook = new HSSFWorkbook(is);
            numOfSheets = workbook.getNumberOfSheets();
        }
        else {
            throw new Exception("File Type Not Supported");
        }
    }

    /**
     * Returns the next non-blank line of the file, or null once exhausted.
     * For .xls input the sheets are traversed in order.
     */
    public String readLine() throws IOException {
        if (filetype.equalsIgnoreCase("txt")) {
            String str = reader.readLine();
            // Skip blank lines; the null check fixes an NPE the old code
            // threw once end of file was reached.
            while (str != null && str.trim().equals("")) {
                str = reader.readLine();
            }
            return str;
        }
        else if (filetype.equalsIgnoreCase("xls")) {
            HSSFSheet sheet = workbook.getSheetAt(currSheet);
            rows = sheet.getLastRowNum();
            if (currPosition > sheet.getLastRowNum()) {
                // Current sheet exhausted: move on to the next non-empty sheet.
                currPosition = 0;
                while (currSheet < numOfSheets - 1) {
                    // Bug fix: the old code peeked at getSheetAt(currSheet + 1)
                    // without ever advancing currSheet, so after finishing the
                    // first sheet it kept re-reading row 0 of the next sheet
                    // forever.
                    currSheet++;
                    sheet = workbook.getSheetAt(currSheet);
                    rows = sheet.getLastRowNum();
                    if (currPosition > sheet.getLastRowNum()) {
                        continue; // empty sheet, keep looking
                    }
                    int row = currPosition;
                    currPosition++;
                    return getLine(sheet, row);
                }
                return null; // no sheets left
            }
            int row = currPosition;
            currPosition++;
            return getLine(sheet, row);
        }
        return null;
    }

    /**
     * Renders one sheet row as a single string, cells separated by
     * EXCEL_LINE_DELIMITER.
     */
    private String getLine(HSSFSheet sheet, int row) {
        HSSFRow rowline = sheet.getRow(row);
        // Bug fix: POI returns null for rows that were never written to.
        if (rowline == null) {
            return "";
        }
        StringBuffer buffer = new StringBuffer();
        int filledColumns = rowline.getLastCellNum();
        HSSFCell cell = null;
        for (int i = 0; i < filledColumns; i++) {
            cell = rowline.getCell((short) i);
            String cellvalue = null;
            if (cell != null) {
                switch (cell.getCellType()) {
                    case HSSFCell.CELL_TYPE_NUMERIC: {
                        if (HSSFDateUtil.isCellDateFormatted(cell)) {
                            // Date cell: format with the platform locale.
                            cellvalue = cell.getDateCellValue().toLocaleString();
                        }
                        else {
                            // Plain number: truncated to an integer, as before.
                            cellvalue = String.valueOf((int) cell.getNumericCellValue());
                        }
                        break;
                    }
                    case HSSFCell.CELL_TYPE_STRING:
                        // Escape single quotes for later SQL-style use.
                        cellvalue = cell.getStringCellValue().replaceAll("'", "''");
                        break;
                    default:
                        cellvalue = " ";
                }
            }
            else {
                cellvalue = "";
            }
            // Insert the delimiter after every cell value.
            buffer.append(cellvalue).append(EXCEL_LINE_DELIMITER);
        }
        return buffer.toString();
    }

    /** Closes the underlying streams; safe to call more than once. */
    public void close() {
        if (is != null) {
            try {
                is.close();
            }
            catch (IOException e) {
                // nothing more we can do
            }
            finally {
                is = null;
            }
        }
        if (reader != null) {
            try {
                reader.close();
            }
            catch (IOException e) {
                // nothing more we can do
            }
            finally {
                reader = null;
            }
        }
    }

    /** Small manual test: dump an Excel file to stdout. */
    public static void main(String[] args) {
        try {
            ExcelReader er = new ExcelReader("d:\\xp.xls");
            String line = er.readLine();
            while (line != null) {
                System.out.println(line);
                line = er.readLine();
            }
            er.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}
package searchfileexample;
import javax.servlet.*;
import javax.servlet.http.*;
import java.io.*;
import java.util.*;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.demo.FileDocument;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.FileReader;
import org.apache.lucene.index.*;
import java.text.DateFormat;
import org.apache.poi.hdf.extractor.WordDocument;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.PrintWriter;
import java.io.FileInputStream;
import java.io.*;
import org.textmining.text.extraction.WordExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
/**
 * Builds a Lucene index for every file under a given directory.
 * <p>Title: </p>
 * <p>Description: </p>
 * <p>Copyright: Copyright (c) 2007</p>
 * <p>Company: </p>
 * @author not attributable
 * @version 1.0
 * Depending on the file type, the index files can be created in different
 * folders, so that index data is stored by category.
 */
public class IndexFilesServlet
    extends HttpServlet {

    /** Directory the Lucene index is written to. */
    static final File INDEX_DIR = new File("index");

    // Initialize global variables
    public void init() throws ServletException {
    }

    /**
     * Handles every request by (re)building the index for all files under
     * the hard-coded "a" directory.
     */
    public void service(HttpServletRequest request, HttpServletResponse response) throws
        ServletException, IOException {
        final File docDir = new File("a"); // directory containing the files to index
        if (!docDir.exists() || !docDir.canRead()) {
            System.out.println("Document directory '" + docDir.getAbsolutePath() +
                               "' does not exist or is not readable, please check the path");
            // Bug fix: a servlet must never call System.exit(); abort the request instead.
            return;
        }
        Date start = new Date();
        try {
            // true  - overwrite any existing index
            // false - keep the existing index
            IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer(), true);
            System.out.println("Indexing to directory '" + INDEX_DIR + "'...");
            indexDocs(writer, docDir);
            System.out.println("Optimizing...");
            writer.optimize();
            writer.close();
            Date end = new Date();
            System.out.println(end.getTime() - start.getTime() +
                               " total milliseconds");
        }
        catch (IOException e) {
            System.out.println(" caught a " + e.getClass() +
                               "\n with message: " + e.getMessage());
        }
    }

    // Clean up resources
    public void destroy() {
    }

    /**
     * Recursively indexes {@code file}: directories are walked, regular
     * files are dispatched to a type-specific extractor by extension.
     */
    public void indexDocs(IndexWriter writer, File file) throws IOException {
        // Do not try to index files that cannot be read.
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            String[] files = file.list(); // may be null if an I/O error occurs
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
            return;
        }
        // Bug fix: use lastIndexOf so "report.v2.doc" yields "doc", and guard
        // against names with no dot at all (indexOf could return -1).
        String name = file.getName();
        int dot = name.lastIndexOf('.');
        String filehouzui = (dot >= 0) ? name.substring(dot + 1) : "";
        System.out.println("adding " + file);
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(file);
            Document doc = null;
            if (filehouzui.equals("doc")) {
                doc = getWordDocument(file, fis);
            }
            else if (filehouzui.equals("txt")) {
                doc = getTxtDocument(file, fis);
            }
            else if (filehouzui.equals("xls")) {
                doc = getExcelDocument(file, fis);
            }
            // Extraction can fail and return null; addDocument(null) would NPE.
            if (doc != null) {
                writer.addDocument(doc);
            }
        }
        // At least on Windows, some temporary files raise an "access denied"
        // exception even though canRead() returned true.
        catch (Exception e) {
            // Bug fix: the exception used to be swallowed silently.
            System.out.println("skipping " + file + ": " + e);
        }
        finally {
            if (fis != null) {
                try {
                    fis.close(); // bug fix: the stream used to be leaked
                }
                catch (IOException ignored) {
                    // best effort only
                }
            }
        }
    }

    /**
     * Turns a plain file into a Lucene Document with path / contents /
     * title / modified fields.
     *
     * @param file the file to convert
     */
    public Document parseFile(File file) throws Exception {
        Document doc = new Document();
        // Store the absolute path so search results can locate the file.
        doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                          Field.Index.UN_TOKENIZED));
        try {
            doc.add(new Field("contents", new FileReader(file))); // index the file body
            doc.add(new Field("title", file.getName(), Field.Store.YES,
                              Field.Index.UN_TOKENIZED));
            // Index the last-modified timestamp.
            doc.add(new Field("modified",
                              String.valueOf(DateFormat.getDateTimeInstance().format(
                                  new Date(file.lastModified()))),
                              Field.Store.YES, Field.Index.UN_TOKENIZED));
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        return doc;
    }

    /**
     * Extracts Word text with POI's WordDocument.
     * NOTE(review): known to truncate some documents; getWordDocument()
     * (tm-extractors) is the preferred path.
     */
    public Document getDocument(File file, FileInputStream is) throws Exception {
        String bodyText = null;
        try {
            WordDocument wd = new WordDocument(is);
            StringWriter docTextWriter = new StringWriter();
            wd.writeAllText(new PrintWriter(docTextWriter));
            bodyText = docTextWriter.toString();
            docTextWriter.close();
            System.out.println("word content====" + bodyText);
        }
        catch (Exception e) {
            // Bug fix: the failure used to be swallowed without a trace.
            System.out.println("word extraction failed: " + e);
        }
        if (bodyText != null) {
            Document doc = new Document();
            doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                              Field.Index.UN_TOKENIZED));
            doc.add(new Field("contents", bodyText, Field.Store.YES,
                              Field.Index.TOKENIZED));
            return doc;
        }
        return null;
    }

    /**
     * Extracts Word text with tm-extractors-0.4.jar (works well).
     */
    public Document getWordDocument(File file, FileInputStream is) throws
        Exception {
        String bodyText = null;
        try {
            WordExtractor extractor = new WordExtractor();
            System.out.println("word文档");
            bodyText = extractor.extractText(is);
            if (bodyText != null) {
                Document doc = new Document();
                doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                                  Field.Index.UN_TOKENIZED));
                doc.add(new Field("contents", bodyText, Field.Store.YES,
                                  Field.Index.TOKENIZED));
                System.out.println("word content====" + bodyText);
                return doc;
            }
        }
        catch (Exception e) {
            // Bug fix: the failure used to be swallowed without a trace.
            System.out.println("word extraction failed: " + e);
        }
        return null;
    }

    /**
     * Builds a Document for a plain-text file. The FileReader is handed to
     * Lucene, which consumes and closes it during addDocument().
     */
    public Document getTxtDocument(File file, FileInputStream is) throws
        Exception {
        try {
            Reader textReader = new FileReader(file);
            Document doc = new Document();
            doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                              Field.Index.UN_TOKENIZED));
            doc.add(new Field("contents", textReader));
            return doc;
        }
        catch (Exception e) {
            // Bug fix: the failure used to be swallowed without a trace.
            System.out.println("txt extraction failed: " + e);
        }
        return null;
    }

    /**
     * Reads an .xls file via ExcelReader and indexes the concatenated rows.
     *
     * @param file File
     * @param is FileInputStream
     * @throws Exception
     * @return Document, or null if extraction failed
     */
    public Document getExcelDocument(File file, FileInputStream is) throws
        Exception {
        ExcelReader er = null;
        try {
            System.out.println("reading excel file");
            er = new ExcelReader(file.getAbsolutePath());
            // Bug fix: the old loop concatenated a possibly-null first line and
            // appended the literal string "null" once the sheet ran out.
            StringBuffer body = new StringBuffer();
            String line = er.readLine();
            if (line != null) {
                body.append(line);
            }
            int rows = er.getRows(); // last row number of the first sheet
            for (int i = 0; i < rows; i++) {
                line = er.readLine();
                if (line == null) {
                    break;
                }
                body.append(line);
            }
            String bodyText = body.toString();
            Document doc = new Document();
            doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                              Field.Index.UN_TOKENIZED));
            doc.add(new Field("contents", bodyText, Field.Store.YES,
                              Field.Index.TOKENIZED));
            System.out.println("excel content====" + bodyText);
            return doc;
        }
        catch (Exception e) {
            System.out.println(e);
        }
        finally {
            if (er != null) {
                er.close(); // bug fix: the reader used to be leaked
            }
        }
        return null;
    }
}
package searchfileexample;
import javax.servlet.*;
import javax.servlet.http.*;
import java.io.*;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import org.apache.lucene.queryParser.*;
public class SearchFileServlet
extends HttpServlet {
private static final String CONTENT_TYPE = "text/html; charset=GBK";
//Initialize global variables
public void init() throws ServletException {
}
/** Use the norms from one field for all fields. Norms are read into memory,
* using a byte of memory per document per searched field. This can cause
* search of large collections with a large number of fields to run out of
* memory. If all of the fields contain only a single token, then the norms
* are all identical, then single norm vector may be shared. */
private static class OneNormsReader
extends FilterIndexReader {
private String field;
public OneNormsReader(IndexReader in, String field) {
super(in);
this.field = field;
}
public byte[] norms(String field) throws IOException {
return in.norms(this.field);
}
}
//Process the HTTP Get request
public void service(HttpServletRequest request, HttpServletResponse response) throws
ServletException, IOException {
response.setContentType(CONTENT_TYPE);
PrintWriter out = response.getWriter();
String[] args = {
"a", "b"};
String usage =
"Usage: java org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field]";
if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
System.out.println(usage);
System.exit(0);
}
String index = "index"; //该值是用来存放生成的烦引文件的文g夹的名称Q不能改?br />
String field = "contents"; //不能修改 field 的?br />
String queries = null; //是用来存N要检索的关键字的一个文件?br />
queries = "D:/lfy_programe/全文?SearchFileExample/aa.txt";
System.out.println("-----------------------" + request.getContextPath());
int repeat = 1;
boolean raw = false;
String normsField = null;
for (int i = 0; i < args.length; i++) {
if ("-index".equals(args[i])) {
index = args[i + 1];
i++;
}
else if ("-field".equals(args[i])) {
field = args[i + 1];
i++;
}
else if ("-queries".equals(args[i])) {
queries = args[i + 1];
i++;
}
else if ("-repeat".equals(args[i])) {
repeat = Integer.parseInt(args[i + 1]);
i++;
}
else if ("-raw".equals(args[i])) {
raw = true;
}
else if ("-norms".equals(args[i])) {
normsField = args[i + 1];
i++;
}
}
IndexReader reader = IndexReader.open(index);
if (normsField != null) {
reader = new OneNormsReader(reader, normsField);
}
Searcher searcher = new IndexSearcher(reader); //用来打开索引文g
Analyzer analyzer = new StandardAnalyzer(); //分析?br />
//Analyzer analyzer = new StandardAnalyzer();
BufferedReader in = null;
if (queries != null) {
in = new BufferedReader(new FileReader(queries));
}
else {
in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
}
QueryParser parser = new QueryParser(field, analyzer);
out.println("<html>");
out.println("<head><title>SearchFileServlet</title></head>");
out.println("<body bgcolor=\"#ffffff\">");
while (true) {
if (queries == null) { // prompt the user
System.out.println("Enter query: ");
}
String line = in.readLine(); //l成查询关键字字W串
System.out.println("查询字符?==" + line);
if (line == null || line.length() == -1) {
break;
}
line = line.trim();
if (line.length() == 0) {
break;
}
Query query = null;
try {
query = parser.parse(line);
}
catch (ParseException ex) {
}
System.out.println("Searching for: " + query.toString(field)); //每个关键?/p>
Hits hits = searcher.search(query);
if (repeat > 0) { // repeat & time as benchmark
Date start = new Date();
for (int i = 0; i < repeat; i++) {
hits = searcher.search(query);
}
Date end = new Date();
System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
}
out.println("<p>查询刎ͼ" + hits.length() + "个含有[" +
query.toString(field) + "]的文?lt;/p>");
System.out.println("查询刎ͼ" + hits.length() + " 个含?[" +
query.toString(field) + "]的文?);
final int HITS_PER_PAGE = 10; //查询q回的最大记录数
int currentNum = 5; //当前记录?/p>
for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
//start = start + currentNum;
int end = Math.min(hits.length(), start + HITS_PER_PAGE);
for (int i = start; i < end; i++) {
//if (raw) { // output raw format
System.out.println("doc=" + hits.id(i) + " score=" + hits.score(i)); //score是接q度的意?br />
//continue;
//}
Document doc = hits.doc(i);
String path = doc.get("path");
if (path != null) {
System.out.println( (i + 1) + ". " + path);
out.println("<p>" + (i + 1) + ". " + path + "</p>");
String title = doc.get("title");
System.out.println(" modified: " + doc.get("modified"));
if (title != null) {
System.out.println(" Title: " + doc.get("title"));
}
}
else {
System.out.println( (i + 1) + ". " + "No path for this document");
}
}
if (queries != null) { // non-interactive
break;
}
if (hits.length() > end) {
System.out.println("more (y/n) ? ");
line = in.readLine();
if (line.length() == 0 || line.charAt(0) == 'n') {
break;
}
}
}
}
reader.close();
out.println("</body></html>");
}
//Clean up resources
public void destroy() {
}
}
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Date;
import org.apache.lucene.demo.FileDocument;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import java.io.FileReader;
import org.apache.lucene.index.*;
import java.text.DateFormat;
import org.apache.poi.hdf.extractor.WordDocument;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.PrintWriter;
import java.io.FileInputStream;
import java.io.*;
import org.textmining.text.extraction.WordExtractor;
/**
 * Builds a Lucene index for every file under a given directory.
 * <p>Title: </p>
 * <p>Description: </p>
 * <p>Copyright: Copyright (c) 2007</p>
 * <p>Company: </p>
 * @author not attributable
 * @version 1.0
 * Depending on the file type, the index files can be created in different
 * folders, so that index data is stored by category.
 */
/** Index all text files under a directory. */
public class IndexFiles {

    private IndexFiles() {}

    /** Directory the Lucene index is written to. */
    static final File INDEX_DIR = new File("index");

    /** Index all text files under a directory. */
    public static void main(String[] args) {
        final File docDir = new File("a"); // directory containing the files to index
        if (!docDir.exists() || !docDir.canRead()) {
            System.out.println("Document directory '" + docDir.getAbsolutePath() +
                               "' does not exist or is not readable, please check the path");
            System.exit(1);
        }
        Date start = new Date();
        try {
            // true  - overwrite any existing index
            // false - keep the existing index
            IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer(), true);
            System.out.println("Indexing to directory '" + INDEX_DIR + "'...");
            indexDocs(writer, docDir);
            System.out.println("Optimizing...");
            writer.optimize();
            writer.close();
            Date end = new Date();
            System.out.println(end.getTime() - start.getTime() +
                               " total milliseconds");
        }
        catch (IOException e) {
            System.out.println(" caught a " + e.getClass() +
                               "\n with message: " + e.getMessage());
        }
    }

    /**
     * Recursively indexes {@code file}: directories are walked, regular
     * files are extracted with getDocument2() and added to the index.
     */
    static void indexDocs(IndexWriter writer, File file) throws IOException {
        // Do not try to index files that cannot be read.
        if (!file.canRead()) {
            return;
        }
        if (file.isDirectory()) {
            String[] files = file.list(); // may be null if an I/O error occurs
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
            return;
        }
        System.out.println("adding " + file);
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(file);
            Document doc = getDocument2(file, fis);
            // Bug fix: extraction can fail and return null; addDocument(null)
            // would throw a NullPointerException.
            if (doc != null) {
                writer.addDocument(doc);
            }
        }
        // At least on Windows, some temporary files raise an "access denied"
        // exception even though canRead() returned true.
        catch (Exception e) {
            // Bug fix: the exception used to be swallowed silently.
            System.out.println("skipping " + file + ": " + e);
        }
        finally {
            if (fis != null) {
                try {
                    fis.close(); // bug fix: the stream used to be leaked
                }
                catch (IOException ignored) {
                    // best effort only
                }
            }
        }
    }

    /**
     * Turns a plain file into a Lucene Document with path / contents /
     * title / modified fields.
     *
     * @param file the file to convert
     */
    static Document parseFile(File file) throws Exception {
        Document doc = new Document();
        // Store the absolute path so search results can locate the file.
        doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                          Field.Index.UN_TOKENIZED));
        try {
            doc.add(new Field("contents", new FileReader(file))); // index the file body
            doc.add(new Field("title", file.getName(), Field.Store.YES,
                              Field.Index.UN_TOKENIZED));
            // Index the last-modified timestamp.
            doc.add(new Field("modified",
                              String.valueOf(DateFormat.getDateTimeInstance().format(
                                  new Date(file.lastModified()))),
                              Field.Store.YES, Field.Index.UN_TOKENIZED));
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        return doc;
    }

    /**
     * Extracts Word text with POI's WordDocument.
     * NOTE(review): known to truncate some documents; getDocument2()
     * (tm-extractors) is the preferred path.
     */
    static Document getDocument(File file, FileInputStream is) throws Exception {
        String bodyText = null;
        try {
            WordDocument wd = new WordDocument(is);
            StringWriter docTextWriter = new StringWriter();
            wd.writeAllText(new PrintWriter(docTextWriter));
            bodyText = docTextWriter.toString();
            docTextWriter.close();
            System.out.println("word content====" + bodyText);
        }
        catch (Exception e) {
            // Bug fix: the failure used to be swallowed without a trace.
            System.out.println("word extraction failed: " + e);
        }
        if (bodyText != null) {
            Document doc = new Document();
            doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                              Field.Index.UN_TOKENIZED));
            doc.add(new Field("contents", bodyText, Field.Store.YES,
                              Field.Index.TOKENIZED));
            return doc;
        }
        return null;
    }

    /**
     * Extracts Word text with tm-extractors-0.4.jar (works well).
     */
    static Document getDocument2(File file, FileInputStream is) throws Exception {
        String bodyText = null;
        try {
            WordExtractor extractor = new WordExtractor();
            System.out.println(is.available());
            bodyText = extractor.extractText(is);
            System.out.println("word content====" + bodyText);
        }
        catch (Exception e) {
            // Bug fix: the failure used to be swallowed without a trace.
            System.out.println("word extraction failed: " + e);
        }
        if (bodyText != null) {
            Document doc = new Document();
            doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                              Field.Index.UN_TOKENIZED));
            doc.add(new Field("contents", bodyText, Field.Store.YES,
                              Field.Index.TOKENIZED));
            return doc;
        }
        return null;
    }
}
package searchfileexample;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.FilterIndexReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Date;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Fieldable;
/** Simple command-line based search demo. */
public class SearchFiles {
/** Use the norms from one field for all fields. Norms are read into memory,
* using a byte of memory per document per searched field. This can cause
* search of large collections with a large number of fields to run out of
* memory. If all of the fields contain only a single token, then the norms
* are all identical, then single norm vector may be shared. */
private static class OneNormsReader extends FilterIndexReader {
private String field;
public OneNormsReader(IndexReader in, String field) {
super(in);
this.field = field;
}
public byte[] norms(String field) throws IOException {
return in.norms(this.field);
}
}
private SearchFiles() {}
/** Simple command-line based search demo. */
public static void main(String[] arg) throws Exception {
String[] args = {"a","b"};
String usage =
"Usage: java org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field]";
if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
System.out.println(usage);
System.exit(0);
}
String index = "index";//该值是用来存放生成的烦引文件的文g夹的名称Q不能改?br /> String field = "contents";//不能修改 field 的?br /> String queries = null;//是用来存N要检索的关键字的一个文件?br /> queries = "D:/lfy_programe/全文?SearchFileExample/aa.txt";
int repeat = 1;
boolean raw = false;
String normsField = null;
for (int i = 0; i < args.length; i++) {
if ("-index".equals(args[i])) {
index = args[i+1];
i++;
} else if ("-field".equals(args[i])) {
field = args[i+1];
i++;
} else if ("-queries".equals(args[i])) {
queries = args[i+1];
i++;
} else if ("-repeat".equals(args[i])) {
repeat = Integer.parseInt(args[i+1]);
i++;
} else if ("-raw".equals(args[i])) {
raw = true;
} else if ("-norms".equals(args[i])) {
normsField = args[i+1];
i++;
}
}
IndexReader reader = IndexReader.open(index);
if (normsField != null)
reader = new OneNormsReader(reader, normsField);
Searcher searcher = new IndexSearcher(reader);//用来打开索引文g
Analyzer analyzer = new StandardAnalyzer();//分析?br />
//Analyzer analyzer = new StandardAnalyzer();
BufferedReader in = null;
if (queries != null) {
in = new BufferedReader(new FileReader(queries));
} else {
in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
}
QueryParser parser = new QueryParser(field, analyzer);
while (true) {
if (queries == null) // prompt the user
System.out.println("Enter query: ");
String line = in.readLine();//l成查询关键字字W串
System.out.println("查询字符?=="+line);
if (line == null || line.length() == -1)
break;
line = line.trim();
if (line.length() == 0)
break;
Query query = parser.parse(line);
System.out.println("Searching for: " + query.toString(field));//每个关键?/p>
Hits hits = searcher.search(query);
if (repeat > 0) { // repeat & time as benchmark
Date start = new Date();
for (int i = 0; i < repeat; i++) {
hits = searcher.search(query);
}
Date end = new Date();
System.out.println("Time: "+(end.getTime()-start.getTime())+"ms");
}
System.out.println("查询刎ͼ" + hits.length() + " 个含?["+query.toString(field)+"]的文?);
final int HITS_PER_PAGE = 10;//查询q回的最大记录数
int currentNum = 2;//当前记录?br />
for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
//start = start + currentNum;
int end = Math.min(hits.length(), start + HITS_PER_PAGE);
for (int i = start; i < end; i++) {
//if (raw) { // output raw format
System.out.println("doc="+hits.id(i)+" score="+hits.score(i));//score是接q度的意?br />
//continue;
//}
Document doc = hits.doc(i);
String path = doc.get("path");
if (path != null) {
System.out.println((i+1) + ". " + path);
String title = doc.get("title");
System.out.println(" modified: " + doc.get("modified"));
if (title != null) {
System.out.println(" Title: " + doc.get("title"));
}
} else {
System.out.println((i+1) + ". " + "No path for this document");
}
}
if (queries != null) // non-interactive
break;
if (hits.length() > end) {
System.out.println("more (y/n) ? ");
line = in.readLine();
if (line.length() == 0 || line.charAt(0) == 'n')
break;
}
}
}
reader.close();
}
}
package searchfileexample;
import javax.servlet.*;
import javax.servlet.http.*;
import java.io.*;
import java.util.*;
import org.textmining.text.extraction.WordExtractor;
public class ReadWord extends HttpServlet {

    private static final String CONTENT_TYPE = "text/html; charset=GBK";

    // Initialize global variables
    public void init() throws ServletException {
    }

    /**
     * Reads a hard-coded .doc file with tm-extractors' WordExtractor and
     * prints the extracted text to stdout.
     */
    public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        response.setContentType(CONTENT_TYPE);
        FileInputStream in = new FileInputStream("D:/lfy_programe/全文?SearchFileExample/a/aa.doc");
        try {
            WordExtractor extractor = new WordExtractor();
            System.out.println(in.available());
            String str = null;
            try {
                str = extractor.extractText(in);
            }
            catch (Exception ex) {
                // Bug fix: the failure used to be swallowed without a trace.
                System.out.println("word extraction failed: " + ex);
            }
            System.out.println(str);
        }
        finally {
            in.close(); // bug fix: the stream used to be leaked
        }
    }

    // Clean up resources
    public void destroy() {
    }
}
1. Fuzzy queries in English: append the wildcard " * " to the keyword.

2. IndexFiles.java
   The Java class used to index the files.

3. SearchFiles.java
   The Java class used to run searches.

4. ReadWord.java
   Uses tm-extractors-0.4.jar to read Word files.
import javax.servlet.*;
import javax.servlet.http.*;
import java.io.*;
import java.util.*;
import org.textmining.text.extraction.WordExtractor;
public class ReadWord extends HttpServlet {

    private static final String CONTENT_TYPE = "text/html; charset=GBK";

    // Initialize global variables
    public void init() throws ServletException {
    }

    /**
     * Reads a hard-coded .doc file with tm-extractors' WordExtractor and
     * prints the extracted text to stdout.
     */
    public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        response.setContentType(CONTENT_TYPE);
        FileInputStream in = new FileInputStream("D:/lfy_programe/全文?SearchFileExample/a/aa.doc");
        try {
            WordExtractor extractor = new WordExtractor();
            System.out.println(in.available());
            String str = null;
            try {
                str = extractor.extractText(in);
            }
            catch (Exception ex) {
                // Bug fix: the failure used to be swallowed without a trace.
                System.out.println("word extraction failed: " + ex);
            }
            System.out.println(str);
        }
        finally {
            in.close(); // bug fix: the stream used to be leaked
        }
    }

    // Clean up resources
    public void destroy() {
    }
}