工具包系列(2):imageSpider工具——可定制的圖像抓取
這個工具是一個可定制的圖像抓取工具
我希望這個小工具的功能點有以下幾項:1.給定頁面抓取頁面的圖片;2.給定頁面和過濾規則,抓取頁面的圖片并存到本地磁盤或內存;
主要的技術點不多:1.圖片鏈接的獲取(htmlparser搞定);2.圖片的讀寫(ImageIO搞定);3.規則的制定(來源于需求)
介于方法的多樣,第一個版本的spider只是很簡單的功能實現,未來希望加入的就是可擴展的規則對象
少廢話,上代碼:
主類:
1: /**
2: *
3: */
4: package com.taobao.cd.http.image;
5:
6: import java.io.IOException;
7: import java.io.InputStream;
8: import java.net.URL;
9: import java.net.URLConnection;
10: import java.util.HashSet;
11: import java.util.Set;
12: import java.util.concurrent.ExecutorService;
13: import java.util.concurrent.Executors;
14:
15: import javax.imageio.ImageIO;
16: import javax.imageio.ImageReader;
17: import javax.imageio.stream.ImageInputStream;
18:
19: import org.htmlparser.NodeFilter;
20: import org.htmlparser.Parser;
21: import org.htmlparser.filters.TagNameFilter;
22: import org.htmlparser.tags.ImageTag;
23: import org.htmlparser.util.NodeList;
24:
25: import com.taobao.cd.http.util.HttpUtil;
26: import com.taobao.cd.http.util.ImageReaderFactory;
27: import com.taobao.cd.http.util.ParserPool;
28:
29: /**
30: * 這是一個(gè)圖片抓取器,通過給定url抓取該頁面的所有img 可定制,過濾 ver 1.0: 只是初級(jí)實(shí)現(xiàn)圖片抓取
31: *
32: * @author zunyuan.jy
33: *
34: * @date 2011-11-2
35: */
36: public class ImageSpider {
37:
38: private Set<String> imgSet; // 用于記錄已經(jīng)下載過的圖像url
39:
40: private int customedSize; // 支持定制的圖像大小,單位是KB
41:
42: public ImageSpider() {
43: this(0);
44: }
45:
46: public ImageSpider(int s) {
47: this.customedSize = s;
48: imgSet = new HashSet<String>();
49: }
50:
51: /**
52: * 抓取指定url頁面的所有圖像數(shù)據(jù)
53: *
54: * @param url
55: * 頁面url
56: * @param path
57: * 要將圖片保存的路徑
58: * @throws Exception
59: */
60: public void crawl(String url, final String path) throws Exception {
61: URL u = new URL(url);
62: URLConnection con = (u.openConnection());
63: con.setRequestProperty("User-Agent", HttpUtil.UA);
64: org.htmlparser.scanners.ScriptScanner.STRICT = false;
65: org.htmlparser.lexer.Lexer.STRICT_REMARKS = false;
66: Parser parser = ParserPool.getInstance().borrowOne();
67: parser.setConnection(con);
68:
69: NodeFilter filter = new TagNameFilter("img");
70: NodeList nodes = parser.extractAllNodesThatMatch(filter);
71: ImageTag node = null;
72: String imgSrc;
73: String suffix;
74: if (nodes != null) {
75: for (int i = 0; i < nodes.size(); i++) {
76: node = (ImageTag) nodes.elementAt(i);
77: imgSrc = node.getImageURL();
78:
79: if (!imgSet.contains(imgSrc)) {
80: imgSet.add(imgSrc);
81: suffix = imgSrc.substring(imgSrc.lastIndexOf(".") + 1);
82: if (suffix.equalsIgnoreCase(ImageUtil.JPG)
83: || suffix.equalsIgnoreCase(ImageUtil.PNG)
84: || suffix.equalsIgnoreCase(ImageUtil.GIF)
85: || suffix.equalsIgnoreCase(ImageUtil.BMP)) {
86: URL uu = new URL(imgSrc);
87: if (customedSize == 0 || filterSize(uu, suffix)) {
88: ImageUtil.writeImg(uu, path, suffix);
89: }
90: } else {
91: System.err.println(suffix
92: + ":img format not supported!");
93: }
94: }
95: }
96: }
97: }
98:
99: private boolean filterSize(URL u, String suffix) throws IOException {
100: InputStream is = u.openStream();
101: ImageInputStream stream = ImageIO.createImageInputStream(is);
102: ImageReader ir = ImageReaderFactory.getInstance().createImageReader(
103: suffix);
104: if (ir != null) {
105: ir.setInput(stream, true, false);
106: int w = ir.getWidth(0);
107: int h = ir.getHeight(0);
108: if (w * h < customedSize * 1024 * 3 + 100) {
109: return true;
110: } else {
111: return false;
112: }
113: } else {
114: System.err.println(u.getFile() + ":read img header error!");
115: return false;
116: }
117: }
118:
119: /**
120: * @param args
121: */
122: public static void main(String[] args) {
123: // TODO Auto-generated method stub
124:
125: }
126: }
util方法片段:
1: /**
2: * 保存圖像
3: * @param u 圖像的url對(duì)象
4: * @param path 圖像待保存的位置
5: * @param fileSuffix 保存的圖像格式
6: * @throws IOException
7: */
8: public static void writeImg(URL u, String path, String fileSuffix)
9: throws IOException {
10:
11: BufferedImage bimg = ImageIO.read(u);
12: if (bimg != null) {
13: String cd = CommonUtils.formatCurrentDate();
14: if (!path.endsWith(FILE_SEP)) {
15: path += FILE_SEP;
16: }
17: String fileName = path + cd + "_" + System.currentTimeMillis()
18: + "." + fileSuffix;
19: ImageIO.write(bimg, fileSuffix, new File(fileName));
20: } else {
21: System.err.println("read img error!");
22: }
23: }
上個工具就提到的parserpool,這個工具也會用到,所以也放上代碼:
1: /**
2: *
3: */
4: package com.taobao.cd.http.util;
5:
6: import org.apache.commons.pool.ObjectPool;
7: import org.apache.commons.pool.impl.StackObjectPool;
8: import org.htmlparser.Parser;
9:
10: /**
11: * 這是一個(gè)對(duì)象池,只負(fù)責(zé)生成空的parser,并管理這些parser
12: * 當(dāng)用完一個(gè)parser后就返還給對(duì)象池,同時(shí)對(duì)象池負(fù)責(zé)清空這個(gè)parser
13: * @author zunyuan.jy
14: *
15: * @date 2011-10-28
16: */
17: public class ParserPool {
18: /*singleton*/
19: private static ParserPool parserPool = new ParserPool();
20: /**/
21:
22: private ObjectPool pool;
23:
24: private ParserPool(){
25: pool = new StackObjectPool(new ParserFactory());
26: }
27: public static synchronized ParserPool getInstance() {
28: if(parserPool==null)
29: return new ParserPool();
30: return parserPool;
31: }
32: public void returnOne(Parser parser) throws Exception {
33: pool.returnObject(parser);
34: }
35:
36: public Parser borrowOne() throws Exception {
37: return (Parser) pool.borrowObject();
38: }
39:
40: public void addOne(Parser parser) throws Exception {
41: pool.addObject();
42: }
43: }
最后附上測試代碼:
1: /**
2: *
3: */
4: package com.taobao.cd.http.image;
5:
6: import org.junit.Test;
7:
8: import junit.framework.TestCase;
9:
10: /**
11: * @author zunyuan.jy
12: *
13: * @date 2011-11-2
14: */
15: public class ImageSpiderTest extends TestCase {
16:
17: @Test
18: public void testCrawl() {
19: String path = "D:\\個(gè)人工具代碼\\img";
20: ImageSpider is = new ImageSpider(5);
21: try {
22: long start = System.nanoTime();
23: is.crawl("http://www.163.com", path);
24: long end = System.nanoTime();
25: System.out.println("time usage:"+(end-start));
26: } catch (Exception e) {
27: e.printStackTrace();
28: }
29: }
30: }
測試結果如下:
com/ad_cookies:img format not supported!
gif?a=&c=860010-0503010000:img format not supported!
time usage:8622310262
前兩行是兩個出錯的圖片
最后一行是執行時間(System.nanoTime()的差值,單位為納秒,約8.6秒)
加入了規則后可能時間長一點,因為有過濾規則檢查。另外就是過濾的規則我現在只加了一個大小的過濾,而且大小也算的不準,后續有什么好的計算圖像的大小的方法和工具歡迎推薦。我這里只是一個粗略的size*1024*3+100來計算BYTE數了~~見笑
P.S. 上面關于parser池的方法里還少一個ParserFactory的定義,這個可以看看ObjectPool的文檔或者看我以前的文章就明白了,使用非常簡單
P.S. 代碼里的所有錯誤都拋了異常,或者用System.err輸出了~需要的可以改進
代碼已上傳到git,有興趣的可以加入coding,
git@github.com:changedi/CDLib.git 這是git地址 posted on 2011-11-02 15:43 changedi 閱讀(2069) 評論(1) 編輯 收藏 所屬分類: Java技術