隨筆-8  評論-2  文章-24  trackbacks-0

              1。用jacob.
              其實jacob是一個bridge,連接java和com或者win32函數的一個中間件,jacob并不能直接抽取word,excel等文件,需要自己寫dll哦,不過已經有為你寫好的了,就是jacob的作者一并提供了。
              jacob下載:
          http://www.matrix.org.cn/down_view.asp?id=13
              下載了jacob并放到指定的路徑之后(dll放到path,jar文件放到classpath),就可以寫你自己的抽取程序了,下面是一個例子:

          import java.io.File;

          import com.jacob.com.*;

          import com.jacob.activeX.*;

          ?

          public class FileExtracter

          {

          ?????? public static void main(String[] args)

          ?????? {

          ????????????? ActiveXComponent app = new ActiveXComponent("Word.Application");

          ????????????? String inFile = "c:\\test.doc";

          ????????????? String tpFile = "c:\\temp.htm";

          ????????????? String otFile = "c:\\temp.xml";

          ????????????? boolean flag = false;

          ????????????? try

          ????????????? {

          ???????????????????? app.setProperty("Visible", new Variant(false));

          ???????????????????? Object docs = app.getProperty("document").toDispatch();

          ???????????????????? Object doc = Dispatch

          ?????????????????????????????????? .invoke(docs, "Open", Dispatch.Method, new Object[]

          ?????????????????????????????????? {inFile, new Variant(false), new Variant(true)}, new int[1])

          ?????????????????????????????????? .toDispatch();

          ???????????????????? Dispatch.invoke(doc, "SaveAs", Dispatch.Method, new Object[]

          ???????????????????? {tpFile, new Variant(8)}, new int[1]);

          ???????????????????? Variant f = new Variant(false);

          ???????????????????? Dispatch.call(doc, "Close", f);

          ???????????????????? flag = true;

          ????????????? } catch (Exception e)

          ????????????? {

          ???????????????????? e.printStackTrace();

          ????????????? } finally

          ????????????? {

          ???????????????????? app.invoke("Quit", new Variant[]

          ???????????????????? {});

          ????????????? }

          ?

          ?????? }

          }




              2。用apache的poi來抽取word,excel。
              poi是apache的一個項目,不過就算用poi你可能都覺得很煩,不過不要緊,這里提供了更加簡單的一個接口給你:
              下載經過封裝后的poi包:
          http://www.matrix.org.cn/down_view.asp?id=14
              下載之后,放到你的classpath就可以了,下面是如何使用它的一個例子:
          ???????

          import java.io.*;

          import org.textmining.text.extraction.WordExtractor;

          /**

          ?* <p>

          ?* Title: pdf extraction

          ?* </p>

          ?* <p>

          ?* Description: email:chris@matrix.org.cn

          ?* </p>

          ?* <p>

          ?* Copyright: Matrix Copyright (c) 2003

          ?* </p>

          ?* <p>

          ?* Company: Matrix.org.cn

          ?* </p>

          ?*

          ?* @author chris

          ?* @version 1.0,who use this example pls remain the declare

          ?*/

          ?

          public class PdfExtractor

          {

          ?????? public PdfExtractor()

          ?????? {

          ?????? }

          ?????? public static void main(String args[]) throws Exception

          ?????? {

          ????????????? FileInputStream in = new FileInputStream("c:\\a.doc");

          ????????????? WordExtractor extractor = new WordExtractor();

          ????????????? String str = extractor.extractText(in);

          ????????????? System.out.println("the result length is" + str.length());

          ????????????? System.out.println("the result is" + str);

          ?????? }

          }




              3。pdfbox-用來抽取pdf文件
              但是pdfbox對中文支持還不好,先下載pdfbox:

          http://www.matrix.org.cn/down_view.asp?id=12
          下面是一個如何使用pdfbox抽取pdf文件的例子:
          ?

          import java.io.*;
          import java.util.Date;

          import org.pdfbox.pdfparser.PDFParser;
          import org.pdfbox.pdmodel.PDDocument;
          import org.pdfbox.util.PDFTextStripper;

          /**

          ?* <p>

          ?* Title: pdf extraction

          ?* </p>

          ?* <p>

          ?* Description: email:chris@matrix.org.cn

          ?* </p>

          ?* <p>

          ?* Copyright: Matrix Copyright (c) 2003

          ?* </p>

          ?* <p>

          ?* Company: Matrix.org.cn

          ?* </p>

          ?*

          ?* @author chris

          ?* @version 1.0,who use this example pls remain the declare

          ?*/

          ?

          public class PdfExtracter

          {

          ?

          ?????? public PdfExtracter()

          ?????? {

          ?????? }

          ?????? public String GetTextFromPdf(String filename) throws Exception

          ?????? {

          ????????????? String temp=null;

          ????????????? PDdocumentnbsppdfdocumentnull;

          ????????????? FileInputStream is=new FileInputStream(filename);

          ????????????? PDFParser parser = new PDFParser( is );

          ????????????? parser.parse();

          ????????????? pdfdocumentnbsp= parser.getPDdocument);

          ????????????? ByteArrayOutputStream out = new ByteArrayOutputStream();

          ????????????? OutputStreamWriter writer = new OutputStreamWriter( out );

          ????????????? PDFTextStripper stripper = new PDFTextStripper();

          ????????????? stripper.writeText(pdfdocumentgetdocument), writer );

          ????????????? writer.close();

          ????????????? byte[] contents = out.toByteArray();

          ?????????????

          ????????????? String ts=new String(contents);

          ????????????? System.out.println("the string length is"+contents.length+"\n");

          ????????????? return ts;

          ?????? }

          ?????? public static void main(String args[])

          ?????? {

          ????????????? PdfExtracter pf=new PdfExtracter();

          ????????????? PDdocumentnbsppdfdocumentnbsp= null;

          ?????????????

          ????????????? try

          ????????????? {

          ???????????????????? String ts=pf.GetTextFromPdf("c:\\a.pdf");

          ???????????????????? System.out.println(ts);

          ????????????? }

          ????????????? catch(Exception e)

          ????????????? {

          ???????????????????? e.printStackTrace();

          ????????????? }

          ?????? }

          }


              4.抽取支持中文的pdf文件-xpdf
              xpdf是一個開源項目,我們可以調用他的本地方法來實現抽取中文pdf文件。
          下載xpdf函數包:
          http://www.matrix.org.cn/down_view.asp?id=15
          同時需要下載支持中文的補丁包:
          http://www.matrix.org.cn/down_view.asp?id=16
              按照readme放好中文的patch,就可以開始寫調用本地方法的java程序了
          下面是一個如何調用的例子:

          import java.io.*;

          /**
           * Example: extracts text from a PDF by running the external xpdf
           * {@code pdftotext} program and reading its standard output.
           *
           * <p>
           * Title: pdf extraction
           * </p>
           * <p>
           * Description: email:chris@matrix.org.cn
           * </p>
           * <p>
           * Copyright: Matrix Copyright (c) 2003
           * </p>
           * <p>
           * Company: Matrix.org.cn
           * </p>
           *
           * @author chris
           * @version 1.0,who use this example pls remain the declare
           */
          public class PdfWin
          {
              public PdfWin()
              {
              }

              /**
               * Runs pdftotext on c:\a.pdf with UTF-8 output sent to stdout, prints
               * the size of every chunk read and finally the complete extracted text.
               *
               * @param args ignored
               * @throws Exception if the process cannot be started or its output
               *         cannot be read
               */
              public static void main(String args[]) throws Exception
              {
                  String PATH_TO_XPDF = "C:\\Program Files\\xpdf\\pdftotext.exe";
                  String filename = "c:\\a.pdf";
                  // The trailing "-" makes pdftotext write the text to standard output.
                  String[] cmd = new String[]
                  {PATH_TO_XPDF, "-enc", "UTF-8", "-q", filename, "-"};

                  Process p = Runtime.getRuntime().exec(cmd);
                  BufferedInputStream bis = new BufferedInputStream(p.getInputStream());
                  InputStreamReader reader = new InputStreamReader(bis, "UTF-8");
                  StringWriter out = new StringWriter();
                  try
                  {
                      char[] buf = new char[10000];
                      int len;
                      while ((len = reader.read(buf)) >= 0)
                      {
                          // Accumulate every chunk; the buffer alone only ever holds
                          // the most recent read (plus stale characters).
                          out.write(buf, 0, len);
                          System.out.println("the length is" + len);
                      }
                  }
                  finally
                  {
                      reader.close();
                  }

                  String ts = out.toString();
                  System.out.println("the str is" + ts);
              }
          }

          ?
          posted on 2006-11-27 10:26 MyBox 閱讀(201) 評論(0)  編輯  收藏

          只有注冊用戶登錄后才能發表評論。


          網站導航:
           
          主站蜘蛛池模板: 大方县| 绍兴市| 吴旗县| 商河县| 浦东新区| 屯门区| 肇东市| 青海省| 舒城县| 台中县| 当涂县| 无棣县| 会泽县| 柯坪县| 峨边| 英吉沙县| 深泽县| 专栏| 吉安市| 镇赉县| 葫芦岛市| 宿迁市| 鲁甸县| 龙胜| 衡东县| 肇源县| 淳化县| 曲阜市| 永安市| 凤冈县| 商都县| 思南县| 台州市| 沅江市| 河间市| 乾安县| 威远县| 革吉县| 陇川县| 徐水县| 天津市|