日本久久久久久,亚洲精品在线看,亚洲永久一区二区三区在线

甜菜侯爵

用正則表達式提取網頁中的鏈接

個人感覺效率肯定還能進一步提高。。。。
不過實在是對正則不是太熟悉，只好暫時這樣了。

代碼如下：

1

/** The regex for search link with the tag "a" */
2

private final String A_REGEX = "<a.*?/a>";
3

/** The regex for search url with the tag "href" */
4

private final String HREF_REGEX = "href=\".*?\"";
5

/** The pattern for linke with the tag "a" */
6

private final Pattern A_PATTERN = Pattern.compile(A_REGEX);
7

/** The pattern for url with the tag "href" */
8

private final Pattern HREF_PATTERN = Pattern.compile(HREF_REGEX);
9

/**
10

* Get url address from the url and the content of the url
11

* @param url the url need to be get links
12

* @param content the content of the given url
13

* @return a list with the url address of the links
14

*/
15

public List<String> getLinkList( URL url, String content )
16

{
17

List<String> linkList = new LinkedList<String>();
18

final Matcher a_matcher = A_PATTERN.matcher(content);
19

while (a_matcher.find())
20

{
21

//JUST FOR TEST!
22

// System.out.println(a_matcher.group());
23

//get url address
24

final Matcher myurl = HREF_PATTERN.matcher(a_matcher.group());
25

while (myurl.find())
26

{
27

String urlAddress = myurl.group().replaceAll("href=|>|\"|\"", "");
28

if( urlAddress.startsWith("http") )
29

{
30

linkList.add(urlAddress);
31

}
32

else if( urlAddress.startsWith("/") || urlAddress.startsWith("\\") )
33

{
34

linkList.add(url.getPath()+urlAddress);
35

}
36

else
37

{
38

String fullUrl = url.toString();
39

//the length of the url without the current page
40

int lastSlash = fullUrl.lastIndexOf("/") + 1;
41

linkList.add(fullUrl.substring(0,lastSlash) + urlAddress);
42

}
43

}
44

}
45

return linkList;
46

}

posted on 2009-11-05 03:00 甜菜侯爵閱讀(455) 評論(0) 編輯收藏

新用戶注冊刷新評論列表


只有注冊用戶登錄后才能發表評論。




網站導航: 博客園 IT新聞 Chat2DB C++博客博問管理

<

2009年11月

>

日

一

二

三

四

五

六

25

26

27

28

29

30

31

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

1

2

3

4

5

統(tǒng)計

隨筆 - 5
文章 - 0
評論 - 3
引用 - 0

用正則表達式提取網頁中的鏈接

導航

統(tǒng)計

常用鏈接

留言簿

隨筆檔案

搜索

最新評論

閱讀排行榜

評論排行榜