CONAN ZONE

你越掙扎我就越興奮

BlogJava

管理

0 Posts :: 282 Stories :: 0 Comments :: 0 Trackbacks

留言簿(6)

文章分類(325)

文章檔案(282)

guy's blog

square's blog

搜索

積分與排名

積分 - 409952
排名 - 136

最新評論

使用SolrJ生成索引

代碼很簡單, 直接看就明白了, 可以在實際工作中借鑒, 原文在這里. 這個例子使用兩種方式來演示如何生成全量索引:
一個是從db中通過sql生成全量索引
一個是通過tika解析文件生成全量索引

package SolrJExample;

import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.sql.*;
import java.util.ArrayList;
import java.util.Collection;

/* Example class showing the skeleton of using Tika and
   Sql on the client to index documents from
   both structured documents and a SQL database.

   NOTE: The SQL example and the Tika example are entirely orthogonal.
   Both are included here to make a
   more interesting example, but you can omit either of them.

*/
public class SqlTikaExample {
  private StreamingUpdateSolrServer _server;
  private long _start = System.currentTimeMillis();
  private AutoDetectParser _autoParser;
  private int _totalTika = 0;
  private int _totalSql = 0;

  private Collection _docs = new ArrayList();

  public static void main(String[] args) {
    try {
      SqlTikaExample idxer = new SqlTikaExample("http://localhost:8983/solr");

      idxer.doTikaDocuments(new File("/Users/Erick/testdocs"));
      idxer.doSqlDocuments();

      idxer.endIndexing();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  private SqlTikaExample(String url) throws IOException, SolrServerException {
      // Create a multi-threaded communications channel to the Solr server.
      // Could be CommonsHttpSolrServer as well.
      //
    _server = new StreamingUpdateSolrServer(url, 10, 4);

    _server.setSoTimeout(1000);  // socket read timeout
    _server.setConnectionTimeout(1000);
    _server.setMaxRetries(1); // defaults to 0.  > 1 not recommended.
         // binary parser is used by default for responses
    _server.setParser(new XMLResponseParser());

      // One of the ways Tika can be used to attempt to parse arbitrary files.
    _autoParser = new AutoDetectParser();
  }

    // Just a convenient place to wrap things up.
  private void endIndexing() throws IOException, SolrServerException {
    if (_docs.size() > 0) { // Are there any documents left over?
      _server.add(_docs, 300000); // Commit within 5 minutes
    }
    _server.commit(); // Only needs to be done at the end,
                      // commitWithin should do the rest.
                      // Could even be omitted
                      // assuming commitWithin was specified.
    long endTime = System.currentTimeMillis();
    log("Total Time Taken: " + (endTime - _start) +
         " milliseconds to index " + _totalSql +
        " SQL rows and " + _totalTika + " documents");
  }

  // I hate writing System.out.println() everyplace,
  // besides this gives a central place to convert to true logging
  // in a production system.
  private static void log(String msg) {
    System.out.println(msg);
  }

  /**
   * ***************************Tika processing here
   */
  // Recursively traverse the filesystem, parsing everything found.
  private void doTikaDocuments(File root) throws IOException, SolrServerException {

    // Simple loop for recursively indexing all the files
    // in the root directory passed in.
    for (File file : root.listFiles()) {
      if (file.isDirectory()) {
        doTikaDocuments(file);
        continue;
      }
        // Get ready to parse the file.
      ContentHandler textHandler = new BodyContentHandler();
      Metadata metadata = new Metadata();
      ParseContext context = new ParseContext();

      InputStream input = new FileInputStream(file);

        // Try parsing the file. Note we haven't checked at all to
        // see whether this file is a good candidate.
      try {
        _autoParser.parse(input, textHandler, metadata, context);
      } catch (Exception e) {
          // Needs better logging of what went wrong in order to
          // track down "bad" documents.
        log(String.format("File %s failed", file.getCanonicalPath()));
        e.printStackTrace();
        continue;
      }
      // Just to show how much meta-data and what form it's in.
      dumpMetadata(file.getCanonicalPath(), metadata);

      // Index just a couple of the meta-data fields.
      SolrInputDocument doc = new SolrInputDocument();

      doc.addField("id", file.getCanonicalPath());

      // Crude way to get known meta-data fields.
      // Also possible to write a simple loop to examine all the
      // metadata returned and selectively index it and/or
      // just get a list of them.
      // One can also use the LucidWorks field mapping to
      // accomplish much the same thing.
      String author = metadata.get("Author");

      if (author != null) {
        doc.addField("author", author);
      }

      doc.addField("text", textHandler.toString());

      _docs.add(doc);
      ++_totalTika;

      // Completely arbitrary, just batch up more than one document
      // for throughput!
      if (_docs.size() >= 1000) {
          // Commit within 5 minutes.
        UpdateResponse resp = _server.add(_docs, 300000);
        if (resp.getStatus() != 0) {
          log("Some horrible error has occurred, status is: " +
                  resp.getStatus());
        }
        _docs.clear();
      }
    }
  }

    // Just to show all the metadata that's available.
  private void dumpMetadata(String fileName, Metadata metadata) {
    log("Dumping metadata for file: " + fileName);
    for (String name : metadata.names()) {
      log(name + ":" + metadata.get(name));
    }
    log("\n\n");
  }

  /**
   * ***************************SQL processing here
   */
  private void doSqlDocuments() throws SQLException {
    Connection con = null;
    try {
      Class.forName("com.mysql.jdbc.Driver").newInstance();
      log("Driver Loaded

");

      con = DriverManager.getConnection("jdbc:mysql://192.168.1.103:3306/test?"
                + "user=testuser&password=test123");

      Statement st = con.createStatement();
      ResultSet rs = st.executeQuery("select id,title,text from test");

      while (rs.next()) {
        // DO NOT move this outside the while loop
        // or be sure to call doc.clear()
        SolrInputDocument doc = new SolrInputDocument(); 
        String id = rs.getString("id");
        String title = rs.getString("title");
        String text = rs.getString("text");

        doc.addField("id", id);
        doc.addField("title", title);
        doc.addField("text", text);

        _docs.add(doc);
        ++_totalSql;

        // Completely arbitrary, just batch up more than one
        // document for throughput!
        if (_docs.size() > 1000) {
             // Commit within 5 minutes.
          UpdateResponse resp = _server.add(_docs, 300000);
          if (resp.getStatus() != 0) {
            log("Some horrible error has occurred, status is: " +
                  resp.getStatus());
          }
          _docs.clear();
        }
      }
    } catch (Exception ex) {
      ex.printStackTrace();
    } finally {
      if (con != null) {
        con.close();
      }
    }
  }
}

posted on 2012-05-30 14:43 CONAN 閱讀(1052) 評論(0) 編輯收藏所屬分類: Solr