夢幻e家人

          java咖啡
          隨筆 - 15, 文章 - 0, 評論 - 11, 引用 - 0

          導航

          <2008年3月>
          24 25 26 27 28 29 1
          2 3 4 5 6 7 8
          9 10 11 12 13 14 15
          16 17 18 19 20 21 22
          23 24 25 26 27 28 29
          30 31 1 2 3 4 5

          常用鏈接

          留言簿(2)

          隨筆分類

          隨筆檔案

          搜索

          •  

          最新評論

          閱讀排行榜

          評論排行榜

          全文檢索

          package searchfileexample;

          import org.apache.lucene.analysis.standard.StandardAnalyzer;
          import org.apache.lucene.index.IndexWriter;

          import java.io.File;
          import java.io.FileNotFoundException;
          import java.io.IOException;
          import java.util.Date;
          import org.apache.lucene.demo.FileDocument;
          import org.apache.lucene.document.Document;
          import org.apache.lucene.document.Field;
          import java.io.FileReader;
          import org.apache.lucene.index.*;
          import java.text.DateFormat;
          import org.apache.poi.hdf.extractor.WordDocument;
          import java.io.InputStream;
          import java.io.StringWriter;
          import java.io.PrintWriter;
          import java.io.FileInputStream;
          import java.io.*;
          import org.textmining.text.extraction.WordExtractor;

          /**
           * 給某個目錄下的所有文件生成索引
           * <p>Title: </p>
           * <p>Description: </p>
           * <p>Copyright: Copyright (c) 2007</p>
           * <p>Company: </p>
           * @author not attributable
           * @version 1.0
           * 根據文件的不同,可以把索引文件創建到不同的文件夾下去,這樣可以分類保存索引信息。
           */

          /** Index all text files under a directory. */
          public class IndexFiles {

            private IndexFiles() {}

            static final File INDEX_DIR = new File("index");

            /** Index all text files under a directory. */
            public static void main(String[] args) {
              String usage = "java org.apache.lucene.demo.IndexFiles <root_directory>";
              //String[] arg = {"a","b"};
              //System.out.println(arg[0]);
              /*
                   if (args.length == 0) {
                System.err.println("Usage: " + usage);
                System.exit(1);
                   }*/
              /*
                  if (INDEX_DIR.exists()) {
                    System.out.println("Cannot save index to '" +INDEX_DIR+ "' directory, please delete it first");
                    System.exit(1);
                  }*/

              final File docDir = new File("a"); //需要生成索引的文件的文件夾
              if (!docDir.exists() || !docDir.canRead()) {
                System.out.println("Document directory '" + docDir.getAbsolutePath() +
                                   "' does not exist or is not readable, please check the path");
                System.exit(1);
              }

              Date start = new Date();
              try {
                IndexWriter writer = new IndexWriter(INDEX_DIR, new StandardAnalyzer(), true); //true-覆蓋原有的索引 false-不覆蓋原有的索引
                System.out.println("Indexing to directory '" + INDEX_DIR + "'...");
                indexDocs(writer, docDir);
                System.out.println("Optimizing...");
                writer.optimize();
                writer.close();

                Date end = new Date();
                System.out.println(end.getTime() - start.getTime() +
                                   " total milliseconds");

              }
              catch (IOException e) {
                System.out.println(" caught a " + e.getClass() +
                                   "\n with message: " + e.getMessage());
              }
            }

            static void indexDocs(IndexWriter writer, File file) throws IOException {
              // do not try to index files that cannot be read
              if (file.canRead()) {
                if (file.isDirectory()) {
                  String[] files = file.list();
                  // an IO error could occur
                  if (files != null) {
                    for (int i = 0; i < files.length; i++) {
                      indexDocs(writer, new File(file, files[i]));
                    }
                  }
                }
                else {
                  System.out.println("adding " + file);
                  try {

                    writer.addDocument(getDocument2(file, new FileInputStream(file)));
                    //writer.addDocument(parseFile(file));

                    //writer.addDocument(FileDocument.Document(file));//path 存放文件的相對路徑
                  }
                  // at least on windows, some temporary files raise this exception with an "access denied" message
                  // checking if the file can be read doesn't help
                  catch (Exception fnfe) {
                    ;
                  }
                }
              }
            }

            /**
             *@paramfile
             *
             *把File變成Document
             */
            static Document parseFile(File file) throws Exception {
              Document doc = new Document();
              doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                                Field.Index.UN_TOKENIZED)); //取文件的絕對路徑
              try {
                doc.add(new Field("contents", new FileReader(file))); //索引文件內容
                doc.add(new Field("title", file.getName(), Field.Store.YES,
                                  Field.Index.UN_TOKENIZED));
                //索引最后修改時間
                doc.add(new Field("modified",
                                  String.valueOf(DateFormat.
                                                 getDateTimeInstance().format(new
                    Date(file.lastModified()))), Field.Store.YES,
                                  Field.Index.UN_TOKENIZED));
                //doc.removeField("title");
              }
              catch (Exception e) {
                e.printStackTrace();
              }
              return doc;
            }

            /**
             *@paramfile
             *
             *轉換word文檔

                   static String changeWord(File file) throws Exception {
              String re = "";
              try {
                WordDocument wd = new WordDocument(is);
                  StringWriter docTextWriter = new StringWriter();
                  wd.writeAllText(new PrintWriter(docTextWriter));
                  docTextWriter.close();
                  bodyText = docTextWriter.toString();

              } catch (Exception e) {
                  e.printStackTrace();
              }
              return re;
                   }*/
            /**
             *@paramfile
             *
             *使用POI讀取word文檔
             */
            static Document getDocument(File file, FileInputStream is) throws Exception {

              String bodyText = null;

              try {

                //BufferedReader wt = new BufferedReader(new InputStreamReader(is));
                //bodyText = wt.readLine();
                //System.out.println("word ===="+bodyText);

                WordDocument wd = new WordDocument(is);
                StringWriter docTextWriter = new StringWriter();
                wd.writeAllText(new PrintWriter(docTextWriter));
                bodyText = docTextWriter.toString();
                docTextWriter.close();
                //   bodyText   =   new   WordExtractor().extractText(is);
                System.out.println("word content====" + bodyText);
              }
              catch (Exception e) {
                ;

              }

              if ( (bodyText != null)) {
                Document doc = new Document();
                doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                                  Field.Index.UN_TOKENIZED)); //取文件的絕對路徑
                doc.add(new Field("contents", bodyText, Field.Store.YES,
                                  Field.Index.TOKENIZED));

                return doc;
              }
              return null;
            }

            //Document   doc   =   getDocument(new   FileInputStream(new   File(file)));
            /**
             *@paramfile
             *
             *使用tm-extractors-0.4.jar讀取word文檔
             */
            static Document getDocument2(File file, FileInputStream is) throws Exception {

              String bodyText = null;

              try {

                //FileInputStream in = new FileInputStream("D:/lfy_programe/全文檢索/SearchFileExample/a/aa.doc");
                //  FileInputStream in = new FileInputStream ("D:/szqxjzhbase/技術測試/新建 Microsoft Word 文檔.doc");
                WordExtractor extractor = new WordExtractor();
                System.out.println(is.available());

                bodyText = extractor.extractText(is);

          //    System.out.println("the result length is"+str.length());
                System.out.println("word content===="+bodyText);

              }
              catch (Exception e) {
                ;

              }

              if ( (bodyText != null)) {
                Document doc = new Document();
                doc.add(new Field("path", file.getAbsolutePath(), Field.Store.YES,
                                  Field.Index.UN_TOKENIZED)); //取文件的絕對路徑
                doc.add(new Field("contents", bodyText, Field.Store.YES,
                                  Field.Index.TOKENIZED));

                return doc;
              }
              return null;
            }

          }


           

          package searchfileexample;


          import org.apache.lucene.analysis.Analyzer;
          import org.apache.lucene.analysis.standard.StandardAnalyzer;
          import org.apache.lucene.document.Document;
          import org.apache.lucene.index.FilterIndexReader;
          import org.apache.lucene.index.IndexReader;
          import org.apache.lucene.queryParser.QueryParser;
          import org.apache.lucene.search.Hits;
          import org.apache.lucene.search.IndexSearcher;
          import org.apache.lucene.search.Query;
          import org.apache.lucene.search.Searcher;


          import java.io.BufferedReader;
          import java.io.FileReader;
          import java.io.IOException;
          import java.io.InputStreamReader;
          import java.util.Date;
          import org.apache.lucene.analysis.SimpleAnalyzer;
          import org.apache.lucene.analysis.KeywordAnalyzer;
          import org.apache.lucene.analysis.WhitespaceAnalyzer;
          import org.apache.lucene.document.Fieldable;

          /** Simple command-line based search demo. */
          public class SearchFiles {

            /** Use the norms from one field for all fields.  Norms are read into memory,
             * using a byte of memory per document per searched field.  This can cause
             * search of large collections with a large number of fields to run out of
             * memory.  If all of the fields contain only a single token, then the norms
             * are all identical, then single norm vector may be shared. */
            private static class OneNormsReader extends FilterIndexReader {
              private String field;

              public OneNormsReader(IndexReader in, String field) {
                super(in);
                this.field = field;
              }

              public byte[] norms(String field) throws IOException {
                return in.norms(this.field);
              }
            }

            private SearchFiles() {}

            /** Simple command-line based search demo. */
            public static void main(String[] arg) throws Exception {
              String[] args = {"a","b"};
              String usage =
                "Usage: java org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field]";
              if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
                System.out.println(usage);
                System.exit(0);
              }

              String index = "index";//該值是用來存放生成的索引文件的文件夾的名稱,不能改動
              String field = "contents";//不能修改  field  的值
              String queries = null;//是用來存放需要檢索的關鍵字的一個文件。
              queries = "D:/lfy_programe/全文檢索/SearchFileExample/aa.txt";

              int repeat = 1;
              boolean raw = false;
              String normsField = null;

              for (int i = 0; i < args.length; i++) {
                if ("-index".equals(args[i])) {
                  index = args[i+1];
                  i++;
                } else if ("-field".equals(args[i])) {
                  field = args[i+1];
                  i++;
                } else if ("-queries".equals(args[i])) {
                  queries = args[i+1];
                  i++;
                } else if ("-repeat".equals(args[i])) {
                  repeat = Integer.parseInt(args[i+1]);
                  i++;
                } else if ("-raw".equals(args[i])) {
                  raw = true;
                } else if ("-norms".equals(args[i])) {
                  normsField = args[i+1];
                  i++;
                }
              }

              IndexReader reader = IndexReader.open(index);

              if (normsField != null)
                reader = new OneNormsReader(reader, normsField);

              Searcher searcher = new IndexSearcher(reader);//用來打開索引文件
              Analyzer analyzer = new StandardAnalyzer();//分析器
              //Analyzer analyzer = new StandardAnalyzer();

              BufferedReader in = null;
              if (queries != null) {
                in = new BufferedReader(new FileReader(queries));
              } else {
                in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
              }
                QueryParser parser = new QueryParser(field, analyzer);
              while (true) {
                if (queries == null)                        // prompt the user
                  System.out.println("Enter query: ");

                String line = in.readLine();//組成查詢關鍵字字符串
                System.out.println("查詢字符串==="+line);

                if (line == null || line.length() == -1)
                  break;

                line = line.trim();
                if (line.length() == 0)
                  break;

                Query query = parser.parse(line);
                System.out.println("Searching for: " + query.toString(field));//每個關鍵字

                Hits hits = searcher.search(query);

                if (repeat > 0) {                           // repeat & time as benchmark
                  Date start = new Date();
                  for (int i = 0; i < repeat; i++) {
                    hits = searcher.search(query);
                  }
                  Date end = new Date();
                  System.out.println("Time: "+(end.getTime()-start.getTime())+"ms");
                }

                System.out.println("查詢到:" + hits.length() + " 個含有 ["+query.toString(field)+"]的文檔");

                final int HITS_PER_PAGE = 10;//查詢返回的最大記錄數
                int currentNum = 2;//當前記錄數
                for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
                  //start = start + currentNum;
                  int end = Math.min(hits.length(), start + HITS_PER_PAGE);
                  for (int i = start; i < end; i++) {

                    //if (raw) {                              // output raw format
                      System.out.println("doc="+hits.id(i)+" score="+hits.score(i));//score是接近度的意思
                      //continue;
                    //}

                    Document doc = hits.doc(i);
                    String path = doc.get("path");


                    if (path != null) {
                      System.out.println((i+1) + ". " + path);
                      String title = doc.get("title");
                      System.out.println("   modified: " + doc.get("modified"));
                      if (title != null) {
                        System.out.println("   Title: " + doc.get("title"));
                      }
                    } else {
                      System.out.println((i+1) + ". " + "No path for this document");
                    }
                  }

                  if (queries != null)                      // non-interactive
                    break;

                  if (hits.length() > end) {
                    System.out.println("more (y/n) ? ");
                    line = in.readLine();
                    if (line.length() == 0 || line.charAt(0) == 'n')
                      break;
                  }
                }
              }
              reader.close();
            }
          }


           

          package searchfileexample;

          import javax.servlet.*;
          import javax.servlet.http.*;
          import java.io.*;
          import java.util.*;
          import org.textmining.text.extraction.WordExtractor;

          public class ReadWord extends HttpServlet {
            private static final String CONTENT_TYPE = "text/html; charset=GBK";

            //Initialize global variables
            public void init() throws ServletException {
            }

            //Process the HTTP Get request
            public void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
              response.setContentType(CONTENT_TYPE);
              FileInputStream in = new FileInputStream ("D:/lfy_programe/全文檢索/SearchFileExample/a/aa.doc");
                 //  FileInputStream in = new FileInputStream ("D:/szqxjzhbase/技術測試/新建 Microsoft Word 文檔.doc");
             WordExtractor extractor = new WordExtractor();
             System.out.println(in.available());
            String str = null;
            try {
              str = extractor.extractText(in);
            }
            catch (Exception ex) {
            }
          //    System.out.println("the result length is"+str.length());
             System.out.println(str);

            }

            //Clean up resources
            public void destroy() {
            }
          }

          1.英文的模糊查詢問題
          查詢時在關鍵字的後邊加上通配符 "*" 就可以了(例如 luc* 可以匹配 lucene)。

          2.IndexFiles.java
          用來索引文件的java類

          3.SearchFiles.java
          用來搜索的java類

          4.ReadWord.java
          使用tm-extractors-0.4.jar來讀取word文件


           

           

          posted on 2008-03-18 10:35 軒轅 閱讀(267) 評論(0)  編輯  收藏 所屬分類: java

          主站蜘蛛池模板: 奈曼旗| 襄城县| 萨迦县| 施秉县| 夹江县| 平谷区| 桦川县| 胶州市| 大荔县| 华池县| 江华| 五峰| 南和县| 林西县| 大城县| 苗栗市| 吉林市| 屯留县| 福贡县| 博白县| 桐柏县| 兰溪市| 科技| 龙川县| 阿城市| 措勤县| 远安县| 疏勒县| 肥西县| 香格里拉县| 黑水县| 迁西县| 全南县| 洛南县| 广宗县| 三河市| 屯留县| 鄂尔多斯市| 金寨县| 东明县| 南阳市|