夢幻之旅

DEBUG - 天道酬勤

:: 首頁 :: 新隨筆 :: 聯系 :: 聚合

:: 管理 ::

671 隨筆 :: 6 文章 :: 256 評論 :: 0 Trackbacks

抓網頁

今天晚上,幫我一個同門師兄,解決一下問題.
題目是,抓取一個網站的所有頁面,并抓下這些頁面中的所有網址.
代碼如下:

package com.hwp.test;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal single-site crawler.
 *
 * <p>Starting from one page, it follows every link that lies under a given base
 * URL and ends in a known page extension, and records for each page found the
 * list of all absolute {@code http://} URLs that occur in that page's source.
 *
 * <p>Instances are not thread-safe: the result map is a plain {@link HashMap}
 * mutated by {@link #test(String, String)}.
 */
public class SearchEngine
{
    /** Matches any absolute {@code http://} URL appearing in a page. Compiled once. */
    private static final Pattern ANY_URL_PATTERN =
            Pattern.compile("http://([\\w-]+\\.)+[\\w-]+(/[\\w- ./?%&=]*)?");

    /** Path of a crawlable page: optional directories, then a file with a page extension. */
    private static final String PAGE_PATH_REGEX =
            "(/[\\w-]+)*(/[\\w-]+\\.(htm|html|xhtml|jsp|asp|php))";

    /** Upper bound for establishing a connection, so one dead host cannot stall the crawl. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10000;

    /** Upper bound for a single blocking read from the server. */
    private static final int READ_TIMEOUT_MILLIS = 30000;

    /** Discovered page URL -> distinct absolute URLs found in that page, in order of appearance. */
    private final Map<String, List<String>> pageNameUrls;

    /** Creates a crawler with an empty result map. */
    public SearchEngine()
    {
        pageNameUrls = new HashMap<String, List<String>>();
    }

    /**
     * Downloads the body of {@code httpUrl} and decodes it with the platform default charset.
     *
     * <p>This is best-effort: any I/O problem (malformed URL, unreachable host, timeout,
     * HTTP error) is reported on {@code System.err} and whatever was read so far is
     * returned, which is the empty string if nothing was read.
     *
     * @param httpUrl address of the page to download; only http/https is fetched
     * @return the page source, possibly partial or empty; never {@code null}
     */
    private String getContent(String httpUrl)
    {
        // Bytes are collected first and decoded once, so a multi-byte character that
        // straddles two read() calls is not corrupted.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        InputStream in = null;
        try
        {
            URL url = new URL(httpUrl);
            String protocol = url.getProtocol();
            if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol))
            {
                // Other schemes (file:, ftp:, ...) do not yield an HttpURLConnection.
                System.err.println("Skipping non-HTTP URL: " + httpUrl);
                return "";
            }

            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            connection.setReadTimeout(READ_TIMEOUT_MILLIS);
            connection.setRequestProperty("User-Agent", "Mozilla/4.0");
            connection.connect();

            in = connection.getInputStream();
            byte[] buffer = new byte[4096];
            int length;
            while ((length = in.read(buffer)) != -1)
            {
                bytes.write(buffer, 0, length);
            }
        }
        catch (IOException e)
        {
            // Keep crawling: one broken page must not abort the whole run.
            System.err.println("Failed to fetch " + httpUrl + ": " + e);
        }
        finally
        {
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (IOException ignored)
                {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
        return bytes.toString();
    }

    /**
     * Extracts every distinct absolute {@code http://} URL from {@code content}.
     *
     * @param content page source to scan
     * @return the distinct URLs in order of first appearance; empty if none
     */
    private List<String> extractUrls(String content)
    {
        // LinkedHashSet de-duplicates in O(1) per URL while keeping first-seen order.
        Set<String> unique = new LinkedHashSet<String>();
        Matcher matcher = ANY_URL_PATTERN.matcher(content);
        while (matcher.find())
        {
            unique.add(matcher.group());
        }
        return new ArrayList<String>(unique);
    }

    /**
     * Queues every page link in {@code content} that has not been recorded yet.
     *
     * <p>Each new link is registered in the result map immediately (with an empty
     * placeholder list) so the same page is never queued twice.
     *
     * @param content     page source to scan
     * @param pagePattern pattern matching crawlable page URLs
     * @param pending     work list receiving the newly discovered page URLs
     */
    private void collectNewPages(String content, Pattern pagePattern, List<String> pending)
    {
        Matcher matcher = pagePattern.matcher(content);
        while (matcher.find())
        {
            String link = matcher.group();
            if (!pageNameUrls.containsKey(link))
            {
                pageNameUrls.put(link, new ArrayList<String>());
                pending.add(link);
            }
        }
    }

    /**
     * Crawls the site starting at {@code url}.
     *
     * <p>Every link of the form {@code baseUrl/dir/.../name.ext} (ext being one of
     * htm, html, xhtml, jsp, asp, php) that is reachable from the start page is
     * fetched once and stored in the result map together with all URLs it contains.
     * The start page itself is recorded only if some crawled page links to it.
     *
     * <p>The crawl uses an explicit work list instead of recursion, so long link
     * chains cannot overflow the call stack.
     *
     * @param url     page at which the crawl starts
     * @param baseUrl literal URL prefix a link must start with to be followed
     * @throws IllegalArgumentException if {@code url} or {@code baseUrl} is {@code null}
     */
    public void test(String url, String baseUrl)
    {
        if (url == null || baseUrl == null)
        {
            throw new IllegalArgumentException("url and baseUrl must not be null");
        }

        // baseUrl is quoted so that '.' and other regex metacharacters match literally.
        Pattern pagePattern = Pattern.compile("(" + Pattern.quote(baseUrl) + PAGE_PATH_REGEX + ")");

        List<String> pending = new ArrayList<String>();
        collectNewPages(getContent(url), pagePattern, pending);

        while (!pending.isEmpty())
        {
            String page = pending.remove(pending.size() - 1);
            // Download once and reuse the source for both the URL list and link discovery.
            String content = getContent(page);
            pageNameUrls.put(page, extractUrls(content));
            collectNewPages(content, pagePattern, pending);
        }
    }

    /**
     * Crawls http://www.aygfsteel.com and prints each discovered page followed by
     * the list of URLs found in it.
     *
     * @param args ignored
     */
    public static void main(String[] args)
    {
        String url = "http://www.aygfsteel.com";
        String baseUrl = "http://www.aygfsteel.com";

        SearchEngine se = new SearchEngine();
        se.test(url, baseUrl);

        for (Map.Entry<String, List<String>> entry : se.pageNameUrls.entrySet())
        {
            System.out.println(entry.getKey());
            System.out.println(entry.getValue());
        }
    }
}

posted on 2008-07-14 23:24 HUIKK 閱讀(405) 評論(0) 編輯收藏所屬分類: Regular Exp

新用戶注冊刷新評論列表


只有注冊用戶登錄后才能發表評論。




網站導航: 博客園 IT新聞 Chat2DB C++博客博問管理
相關文章: java 正則抓網頁正則表達式抓取網頁面上所有圖片

夢幻之旅

公告

常用鏈接

留言簿(21)

隨筆分類(644)

隨筆檔案(669)

文章檔案(6)

最新隨筆

積分與排名

最新評論

閱讀排行榜

評論排行榜