1。用jacob.
其實jacob是一個bridge,連接java和com或者win32函數的一個中間件,jacob并不能直接抽取word,excel等文件,需要自己寫dll哦,不過已經有為你寫好的了,就是jacob的作者一并提供了。
jacob下載:
http://www.matrix.org.cn/down_view.asp?id=13
下載了jacob并放到指定的路徑之后(dll放到path,jar文件放到classpath),就可以寫你自己的抽取程序了,下面是一個例子:
import java.io.File;
import com.jacob.com.*;
import com.jacob.activeX.*;
public class FileExtracter
{
?????? public static void main(String[] args)
?????? {
????????????? ActiveXComponent app = new ActiveXComponent("Word.Application");
????????????? String inFile = "c:\\test.doc";
????????????? String tpFile = "c:\\temp.htm";
????????????? String otFile = "c:\\temp.xml";
????????????? boolean flag = false;
????????????? try
????????????? {
???????????????????? app.setProperty("Visible", new Variant(false));
???????????????????? Object docs = app.getProperty("document.").toDispatch();
???????????????????? Object doc = Dispatch
?????????????????????????????????? .invoke(docs, "Open", Dispatch.Method, new Object[]
?????????????????????????????????? {inFile, new Variant(false), new Variant(true)}, new int[1])
?????????????????????????????????? .toDispatch();
???????????????????? Dispatch.invoke(doc, "SaveAs", Dispatch.Method, new Object[]
???????????????????? {tpFile, new Variant(8)}, new int[1]);
???????????????????? Variant f = new Variant(false);
???????????????????? Dispatch.call(doc, "Close", f);
???????????????????? flag = true;
????????????? } catch (Exception e)
????????????? {
???????????????????? e.printStackTrace();
????????????? } finally
????????????? {
???????????????????? app.invoke("Quit", new Variant[]
???????????????????? {});
????????????? }
?????? }
}
2。用apache的poi來抽取word,excel。
poi是apache的一個項目,不過就算用poi你可能都覺得很煩,不過不要緊,這里提供了更加簡單的一個接口給你:
下載經過封裝后的poi包:
http://www.matrix.org.cn/down_view.asp?id=14
下載之后,放到你的classpath就可以了,下面是如何使用它的一個例子:
import java.io.*;
import org.textmining.text.extraction.WordExtractor;
/**
?* <p>
?* Title: pdf extraction
?* </p>
?* <p>
?* Description: email:chris@matrix.org.cn
?* </p>
?* <p>
?* Copyright: Matrix Copyright (c) 2003
?* </p>
?* <p>
?* Company: Matrix.org.cn
?* </p>
?*
?* @author chris
?* @version 1.0,who use this example pls remain the declare
?*/
public class PdfExtractor
{
?????? public PdfExtractor()
?????? {
?????? }
?????? public static void main(String args[]) throws Exception
?????? {
????????????? FileInputStream in = new FileInputStream("c:\\a.doc");
????????????? WordExtractor extractor = new WordExtractor();
????????????? String str = extractor.extractText(in);
????????????? System.out.println("the result length is" + str.length());
????????????? System.out.println("the result is" + str);
?????? }
}
3。pdfbox-用來抽取pdf文件
但是pdfbox對中文支持還不好,先下載pdfbox:
下面是一個如何使用pdfbox抽取pdf文件的例子:
?
import java.io.*;
import java.util.Date;

import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
/**
?* <p>
?* Title: pdf extraction
?* </p>
?* <p>
?* Description: email:chris@matrix.org.cn
?* </p>
?* <p>
?* Copyright: Matrix Copyright (c) 2003
?* </p>
?* <p>
?* Company: Matrix.org.cn
?* </p>
?*
?* @author chris
?* @version 1.0,who use this example pls remain the declare
?*/
public class PdfExtracter
{
?????? public PdfExtracter()
?????? {
?????? }
?????? public String GetTextFromPdf(String filename) throws Exception
?????? {
????????????? String temp=null;
????????????? PDdocument.nbsppdfdocument.null;
????????????? FileInputStream is=new FileInputStream(filename);
????????????? PDFParser parser = new PDFParser( is );
????????????? parser.parse();
????????????? pdfdocument.nbsp= parser.getPDdocument.);
????????????? ByteArrayOutputStream out = new ByteArrayOutputStream();
????????????? OutputStreamWriter writer = new OutputStreamWriter( out );
????????????? PDFTextStripper stripper = new PDFTextStripper();
????????????? stripper.writeText(pdfdocument.getdocument.), writer );
????????????? writer.close();
????????????? byte[] contents = out.toByteArray();
?????????????
????????????? String ts=new String(contents);
????????????? System.out.println("the string length is"+contents.length+"\n");
????????????? return ts;
?????? }
?????? public static void main(String args[])
?????? {
????????????? PdfExtracter pf=new PdfExtracter();
????????????? PDdocument.nbsppdfdocument.nbsp= null;
?????????????
????????????? try
????????????? {
???????????????????? String ts=pf.GetTextFromPdf("c:\\a.pdf");
???????????????????? System.out.println(ts);
????????????? }
????????????? catch(Exception e)
????????????? {
???????????????????? e.printStackTrace();
????????????? }
?????? }
}
4.抽取支持中文的pdf文件-xpdf
xpdf是一個開源項目,我們可以調用他的本地方法來實現抽取中文pdf文件。
下載xpdf函數包:
http://www.matrix.org.cn/down_view.asp?id=15
同時需要下載支持中文的補丁包:
http://www.matrix.org.cn/down_view.asp?id=16
按照readme放好中文的patch,就可以開始寫調用本地方法的java程序了
下面是一個如何調用的例子:
import java.io.*;
/**
 * <p>
 * Title: pdf extraction
 * </p>
 * <p>
 * Description: email:chris@matrix.org.cn
 * </p>
 * <p>
 * Copyright: Matrix Copyright (c) 2003
 * </p>
 * <p>
 * Company: Matrix.org.cn
 * </p>
 *
 * @author chris
 * @version 1.0,who use this example pls remain the declare
 */
public class PdfWin
{
    public PdfWin()
    {
    }

    /**
     * Runs xpdf's {@code pdftotext} on {@code c:\a.pdf}, captures the UTF-8
     * text it writes to standard output and prints the text and its length.
     *
     * @param args ignored
     * @throws Exception if the process cannot be started, its output cannot
     *                   be read, or the wait for its exit is interrupted
     */
    public static void main(String args[]) throws Exception
    {
        String pathToXpdf = "C:\\Program Files\\xpdf\\pdftotext.exe";
        String filename = "c:\\a.pdf";
        // "-enc UTF-8" selects the output encoding, "-q" suppresses messages,
        // and the trailing "-" sends the text to standard output.
        String[] cmd = new String[]
        {pathToXpdf, "-enc", "UTF-8", "-q", filename, "-"};
        Process p = Runtime.getRuntime().exec(cmd);

        String ts;
        InputStreamReader reader = new InputStreamReader(
                new BufferedInputStream(p.getInputStream()), "UTF-8");
        try
        {
            ts = readAll(reader);
        }
        finally
        {
            reader.close();
        }

        // Reap the child process and surface a failure instead of silently
        // printing an empty result.
        int exitCode = p.waitFor();
        if (exitCode != 0)
        {
            System.err.println("pdftotext exited with code " + exitCode);
        }

        System.out.println("the length is" + ts.length());
        System.out.println("the str is" + ts);
    }

    /**
     * Reads every character from {@code reader} until end of stream.
     *
     * <p>Each chunk is appended as it is read, so the result holds the whole
     * stream rather than only the contents of the last buffer fill.</p>
     *
     * @param reader source of characters; not closed by this method
     * @return all characters read, or an empty string for an empty stream
     * @throws IOException if reading fails
     */
    static String readAll(Reader reader) throws IOException
    {
        StringWriter out = new StringWriter();
        char[] buf = new char[10000];
        int len;
        while ((len = reader.read(buf)) >= 0)
        {
            out.write(buf, 0, len);
        }
        return out.toString();
    }
}