如果有問題,請與我聯繫。參考資料:java.util.regex 的幫助文檔。
import java.io.*;
import java.net.*;
import java.util.regex.*;
import java.net.*;
import java.util.regex.*;
/**
 * Prints every {@code <a href=...>} opening tag found in a web page.
 *
 * <p>The page at a fixed URL is downloaded into memory, scanned with a
 * case-insensitive regular expression, and each matching tag is written to
 * standard output prefixed with a running number.
 *
 * by jgyang 2005-11-18
 */
public class HerfMatch
{
    /**
     * Downloads the page and prints its anchor tags.
     *
     * <p>I/O failures and regex syntax errors are reported via
     * {@code printStackTrace()}; nothing is rethrown.
     *
     * @param args command-line arguments (ignored)
     */
    public static void main(String[] args)
    {
        try
        {
            String urlString = "http://www.tom.com";
            // NOTE(review): the reader decodes with the platform default
            // charset; the page's real encoding is not known here - confirm
            // before relying on non-ASCII output.
            Reader in = new BufferedReader(new InputStreamReader(
                new URL(urlString).openStream()));
            StringBuilder input = new StringBuilder();
            try
            {
                int ch;
                while ((ch = in.read()) != -1)
                {
                    input.append((char) ch);
                }
            }
            finally
            {
                // Release the underlying network stream even if reading fails.
                in.close();
            }
            printAnchorTags(input, System.out);
        }
        catch (IOException exception)
        {
            exception.printStackTrace();
        }
        catch (PatternSyntaxException exception)
        {
            exception.printStackTrace();
        }
    }

    /**
     * Finds every {@code <a href=...>} opening tag in {@code html} and prints
     * each one as {@code "<n> : <tag>"}, numbering from 1.
     *
     * <p>Recognised forms: a quoted href value, a quoted href value followed
     * by exactly one further attribute, and an unquoted href value.
     * Package-private so it can be exercised without network access.
     *
     * @param html the text to scan
     * @param out  where the numbered matches are written
     * @return the number of tags printed
     * @throws PatternSyntaxException if the pattern fails to compile
     */
    static int printAnchorTags(CharSequence html, PrintStream out)
    {
        // Alternatives inside the outer group, tried in this order:
        //   1. "quoted"                         href="x"
        //   2. "quoted" + one more attribute    href="x" target="y"
        //   3. unquoted token                   href=index.html
        // The unquoted forms use '+' so values longer than one character
        // match; the unquoted alternative comes last so a quoted value is
        // never consumed as a bare token first.
        String patternString
            = "<a\\s+href\\s*=\\s*(\"[^\"]*\"|\"[^\"]*\"\\s*+[^>\"]*\\s*=\\s*(\"[^\"]*\"|[^\\s>]+)|[^\\s>]+)\\s*>";
        Pattern pattern = Pattern.compile(patternString,
            Pattern.CASE_INSENSITIVE);
        Matcher matcher = pattern.matcher(html);
        int count = 0;
        while (matcher.find())
        {
            out.println(++count + " : " + matcher.group());
        }
        return count;
    }
}
太感謝了.
不過我想問一下:如果要得到下面的結果,又該怎麼做呢?
<A href="http://www.aygfsteel.com/jgyang/archive/2005/11/23/21081.html">從網頁中提取URL的java程序</A>
小弟求知中......
得到的結果與原文件上的鏈接根本不對呀!!!
href 有幾種格式,你可能需要做些特殊處理。
<a class=p3 href='../../a.htm' onclick='click();'><font color=#000000 style="font=2"><b>春天不會來</b></a>
就像下面這樣
<a http://www.163.com/tes.html >這是內容</a>
提取如下:
URL:http://www.163.com/tes.html
內容:這是內容