夢幻之旅

DEBUG - 天道酬勤

:: 管理 ::

671 隨筆 :: 6 文章 :: 256 評論 :: 0 Trackbacks

package com.roadway.test;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Downloads a web page over HTTP and lists the absolute {@code http://} image
 * URLs (jpg / png / gif) referenced by its {@code src} or {@code background}
 * attributes.
 */
public class TeskSRC {

    /**
     * Matches {@code src="..."} / {@code background="..."} attributes whose value is an
     * absolute {@code http://} URL ending in a jpg, png or gif file name.
     * Capture group 3 is the URL itself (without the surrounding quotes).
     *
     * A variant for site-relative paths would be:
     * (?x)(src|SRC|background|BACKGROUND)=('|\")/?(([\\w-]+/)*([\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")
     */
    private static final Pattern IMAGE_URL_PATTERN = Pattern.compile(
            "(?x)(src|SRC|background|BACKGROUND)=('|\")(http://([\\w-]+\\.)+[\\w-]+(:[0-9]+)*(/[\\w-]+)*(/[\\w-]+\\.(jpg|JPG|png|PNG|gif|GIF)))('|\")");

    /** Index of the capture group in {@link #IMAGE_URL_PATTERN} that holds the URL. */
    private static final int URL_GROUP = 3;

    /** Connect/read timeout so that a stalled server cannot block the caller forever. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Size of the chunk buffer used while reading the response body. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Fetches the body of the given HTTP URL as text.
     *
     * <p>The whole body is read as bytes first and decoded once, so multi-byte
     * characters that straddle a read boundary are not corrupted. The charset is
     * taken from the response's {@code Content-Type} header, falling back to UTF-8.
     *
     * @param httpUrl the address to download
     * @return the response body, or an empty string if the URL is malformed, is
     *         not an HTTP(S) URL, or the download fails (the failure is reported
     *         on {@code System.err}); never {@code null}
     */
    public String getHtmlCode(String httpUrl) {
        HttpURLConnection connection = null;
        try {
            URLConnection raw = new URL(httpUrl).openConnection();
            if (!(raw instanceof HttpURLConnection)) {
                // e.g. file: or ftp: URLs; this method only speaks HTTP.
                return "";
            }
            connection = (HttpURLConnection) raw;
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            connection.setRequestProperty("User-Agent", "Mozilla/4.0");
            connection.connect();

            Charset charset = charsetOf(connection.getContentType());
            ByteArrayOutputStream body = new ByteArrayOutputStream();
            try (InputStream in = connection.getInputStream()) {
                byte[] buffer = new byte[BUFFER_SIZE];
                int length;
                while ((length = in.read(buffer)) != -1) {
                    body.write(buffer, 0, length);
                }
            }
            return new String(body.toByteArray(), charset);
        } catch (IOException e) {
            // Best effort: callers get an empty page, but the cause is not lost.
            System.err.println("Failed to download " + httpUrl + ": " + e);
            return "";
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Returns every image URL matched by {@link #IMAGE_URL_PATTERN} in the given
     * HTML, in document order (duplicates are kept).
     *
     * @param html the HTML text to scan; {@code null} is treated as empty
     * @return the matched URLs; empty if there are none
     */
    public static List<String> extractImageUrls(String html) {
        List<String> urls = new ArrayList<String>();
        if (html == null) {
            return urls;
        }
        Matcher matcher = IMAGE_URL_PATTERN.matcher(html);
        while (matcher.find()) {
            urls.add(matcher.group(URL_GROUP));
        }
        return urls;
    }

    /**
     * Extracts the charset from a {@code Content-Type} header value such as
     * {@code "text/html; charset=GBK"}.
     *
     * @param contentType the header value, possibly {@code null}
     * @return the declared charset, or UTF-8 when none is declared or the
     *         declared name is not supported by this JVM
     */
    private static Charset charsetOf(String contentType) {
        final String key = "charset=";
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String param = part.trim();
                if (param.regionMatches(true, 0, key, 0, key.length())) {
                    String name = param.substring(key.length()).trim();
                    if (name.length() >= 2 && name.startsWith("\"") && name.endsWith("\"")) {
                        name = name.substring(1, name.length() - 1);
                    }
                    try {
                        return Charset.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // Illegal or unsupported charset name: fall through to the default.
                    }
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /**
     * Downloads http://www.163.com and prints each image URL found on it,
     * one per line.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        TeskSRC ts = new TeskSRC();
        String content = ts.getHtmlCode("http://www.163.com");
        for (String imageUrl : extractImageUrls(content)) {
            System.out.println(imageUrl);
        }
    }
}

posted on 2008-04-30 10:58 HUIKK 閱讀(3516) 評論(3) 編輯收藏所屬分類: Regular Exp

這個我以前實現過一個類似功能的, 不過是腳本(windows下用gnu win32)
grep -o "images/.*\.\(\(jpg\)\|\(gif\)\)" a.css | xargs -n1 echo http://www.haoting.com | tr " " "/" > pic.txt
wget -i pic.txt

ps: 用 java 實現時, 如果用多線程能不能提高效率?
我以前寫過一個程序, 用來下載一個網站的所有內容(就是給定一個 URI 下的所有子 URI 網頁和圖片等), 但是用多線程實現時發現, 線程稍微多一點(比如20個)時, 就會導致很多網頁下載到一半就沒了, 很郁悶啊 回復 更多評論

# re: 正則表達式抓取網頁面上所有圖片[未登錄] 2011-10-22 18:43 huang

我愛你，lz，我寫了一晚上的正則表達式還沒你的好用。回復更多評論

# re: 正則表達式抓取網頁面上所有圖片 2013-11-28 15:07 find you!

贊！！！！回復更多評論

新用戶注冊刷新評論列表


只有注冊用戶登錄后才能發表評論。




網站導航: 博客園 IT新聞 Chat2DB C++博客博問管理
相關文章: java 正則抓網頁正則表達式抓取網頁面上所有圖片

夢幻之旅

公告

常用鏈接

留言簿(21)

隨筆分類(644)

隨筆檔案(669)

文章檔案(6)

最新隨筆

積分與排名

最新評論

閱讀排行榜

評論排行榜

評論