A Multi-threaded Java Web Crawler

Below is a Java crawler program. Starting from a specified home page, it fetches pages under that site's domain up to a specified depth and maintains a simple index of the saved files.

Parameters: private int webDepth = 2; // crawl depth. The home page counts as depth 1, and pages beyond the configured depth are not fetched. private int intThreadNum = 10; // number of worker threads to start.

While running, the program also writes a report.txt file in the directory it is run from to record the crawler's progress, and after the crawl finishes it writes a fileindex.txt file that indexes the saved page files.
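For reference, each fileindex.txt entry follows the pattern assembled in getWebByHomePage(); an entry with made-up values (the file number and URL here are only illustrative) would look like:

    Web depth:2 Filepath: web/web5.htm url:http://www.example.com/news/1.html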

The program exercises multithreading (shared state with synchronized methods), generics, file operations, the URL and URLConnection classes, Hashtable as an associative array, and regular expressions with their related classes.

The program is run with command-line arguments. The first argument must be a valid URL string beginning with http:// and is used as the crawler's home page. The second, optional argument is a string that can be converted to an int (i.e. one that Integer.parseInt(String s) accepts, such as 3) and sets the crawl depth; if it is omitted, the depth defaults to 2.
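For example, assuming the JDK's javac and java tools are on the PATH and using a placeholder site address (not one from the original post), compiling and launching the crawler could look like:

    javac GetWeb.java
    java GetWeb http://www.example.com 3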

The program's known shortcomings: it only handles the three cases href=, href=' and href=" followed by an absolute URL (URL forms in page source can be complicated, so extraction sometimes goes wrong), while relative URLs and window.open(' links are not handled, and the exception handling is only rudimentary. Readers with improvements are warmly invited to post their modified source; a rough sketch of one possible fix for relative URLs follows below.
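As one hedged sketch of how relative URLs might be handled (the class and method names LinkResolver and resolveLink below are assumptions for illustration, not part of the original program), java.net.URL can resolve a relative href against the URL of the page it appears on:

    import java.net.MalformedURLException;
    import java.net.URL;

    public class LinkResolver {

        // Resolve a possibly relative href against the URL of the page it was found on.
        // Returns the absolute URL string, or null if the combination is not a valid URL.
        public static String resolveLink(String pageUrl, String href) {
            try {
                URL base = new URL(pageUrl);
                URL absolute = new URL(base, href); // URL(URL context, String spec) performs relative resolution
                return absolute.toString();
            } catch (MalformedURLException e) {
                return null;
            }
        }
    }

Links extracted with a more permissive pattern could then be passed through such a helper before being added to arrUrls, so that relative paths reach the rest of the program in the same absolute form it already expects.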

The full source code follows (save it as GetWeb.java):

import java.io.File;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.Hashtable;

public class GetWeb {

    private int webDepth = 2;            // crawl depth
    private int intThreadNum = 10;       // number of worker threads
    private String strHomePage = "";     // home page URL
    private String myDomain;             // domain name
    private String fPath = "web";        // directory for the saved page files
    private ArrayList<String> arrUrls = new ArrayList<String>();  // URLs still to be processed
    private ArrayList<String> arrUrl = new ArrayList<String>();   // all URLs, kept for building the index
    private Hashtable<String, Integer> allUrls = new Hashtable<String, Integer>();   // page number of every URL
    private Hashtable<String, Integer> deepUrls = new Hashtable<String, Integer>();  // depth of every URL
    private int intWebIndex = 0;         // file index of a page, starting from 0
    private String charset = "GB2312";
    private String report = "";
    private long startTime;
    private int webSuccessed = 0;
    private int webFailed = 0;

    public GetWeb(String s) {
        this.strHomePage = s;
    }

    public GetWeb(String s, int i) {
        this.strHomePage = s;
        this.webDepth = i;
    }

    public synchronized void addWebSuccessed() {
        webSuccessed++;
    }

    public synchronized void addWebFailed() {
        webFailed++;
    }

    // Append to the in-memory report and rewrite report.txt with the full text so far
    public synchronized void addReport(String s) {
        try {
            report += s;
            PrintWriter pwReport = new PrintWriter(new FileOutputStream(
                    "report.txt"));
            pwReport.println(report);
            pwReport.close();
        } catch (Exception e) {
            System.out.println("Failed to generate report file!");
        }
    }

    // Take the next unprocessed URL off the work queue
    public synchronized String getAUrl() {
        String tmpAUrl = arrUrls.get(0);
        arrUrls.remove(0);
        return tmpAUrl;
    }

    // Take the next URL off the index list
    public synchronized String getUrl() {
        String tmpUrl = arrUrl.get(0);
        arrUrl.remove(0);
        return tmpUrl;
    }

    // Next file index for a newly discovered page
    public synchronized Integer getIntWebIndex() {
        intWebIndex++;
        return intWebIndex;
    }

    /**
     * @param args
     */
    public static void main(String[] args) {
        if (args.length == 0 || args[0].equals("")) {
            System.out.println("No input!");
            System.exit(1);
        } else if (args.length == 1) {
            GetWeb gw = new GetWeb(args[0]);
            gw.getWebByHomePage();
        } else {
            GetWeb gw = new GetWeb(args[0], Integer.parseInt(args[1]));
            gw.getWebByHomePage();
        }
    }

    // Seed the queues with the home page, start the worker threads,
    // then wait for the crawl to finish and write the index file
    public void getWebByHomePage() {
        startTime = System.currentTimeMillis();
        this.myDomain = getDomain();
        if (myDomain == null) {
            System.out.println("Wrong input!");
            // System.exit(1);
            return;
        }
        System.out.println("Homepage = " + strHomePage);
        addReport("Homepage = " + strHomePage + "!\n");
        System.out.println("Domain = " + myDomain);
        addReport("Domain = " + myDomain + "!\n");
        arrUrls.add(strHomePage);
        arrUrl.add(strHomePage);
        allUrls.put(strHomePage, 0);
        deepUrls.put(strHomePage, 1);
        File fDir = new File(fPath);
        if (!fDir.exists()) {
            fDir.mkdir();
        }
        System.out.println("Start!");
        this.addReport("Start!\n");
        String tmp = getAUrl();
        this.getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
        int i = 0;
        for (i = 0; i < intThreadNum; i++) {
            new Thread(new Processer(this)).start();
        }
        // Busy-wait until the work queue is empty and only the main thread is left
        while (true) {
            if (arrUrls.isEmpty() && Thread.activeCount() == 1) {
                long finishTime = System.currentTimeMillis();
                long costTime = finishTime - startTime;
                System.out.println("\n\n\n\n\nFinished!");
                addReport("\n\n\n\n\nFinished!\n");
                System.out.println("Start time = " + startTime + " "
                        + "Finish time = " + finishTime + " "
                        + "Cost time = " + costTime + "ms");
                addReport("Start time = " + startTime + " "
                        + "Finish time = " + finishTime + " "
                        + "Cost time = " + costTime + "ms" + "\n");
                System.out.println("Total url number = "
                        + (webSuccessed + webFailed) + " Successed: "
                        + webSuccessed + " Failed: " + webFailed);
                addReport("Total url number = " + (webSuccessed + webFailed)
                        + " Successed: " + webSuccessed + " Failed: "
                        + webFailed + "\n");
                String strIndex = "";
                String tmpUrl = "";
                while (!arrUrl.isEmpty()) {
                    tmpUrl = getUrl();
                    strIndex += "Web depth:" + deepUrls.get(tmpUrl)
                            + " Filepath: " + fPath + "/web"
                            + allUrls.get(tmpUrl) + ".htm" + " url:" + tmpUrl
                            + "\n\n";
                }
                System.out.println(strIndex);
                try {
                    PrintWriter pwIndex = new PrintWriter(new FileOutputStream(
                            "fileindex.txt"));
                    pwIndex.println(strIndex);
                    pwIndex.close();
                } catch (Exception e) {
                    System.out.println("Failed to generate index file!");
                }
                break;
            }
        }
    }

    // Download one page, save it to a local .htm file, and scan each line for further links
    public void getWebByUrl(String strUrl, String charset, String fileIndex) {
        try {
            // if(charset==null||"".equals(charset))charset="utf-8";
            System.out.println("Getting web by url: " + strUrl);
            addReport("Getting web by url: " + strUrl + "\n");
            URL url = new URL(strUrl);
            URLConnection conn = url.openConnection();
            conn.setDoOutput(true);
            InputStream is = null;
            is = url.openStream();
            String filePath = fPath + "/web" + fileIndex + ".htm";
            PrintWriter pw = null;
            FileOutputStream fos = new FileOutputStream(filePath);
            OutputStreamWriter writer = new OutputStreamWriter(fos);
            pw = new PrintWriter(writer);
            BufferedReader bReader = new BufferedReader(new InputStreamReader(
                    is));
            StringBuffer sb = new StringBuffer();
            String rLine = null;
            String tmp_rLine = null;
            while ((rLine = bReader.readLine()) != null) {
                tmp_rLine = rLine;
                int str_len = tmp_rLine.length();
                if (str_len > 0) {
                    sb.append("\n" + tmp_rLine);
                    pw.println(tmp_rLine);
                    pw.flush();
                    if (deepUrls.get(strUrl) < webDepth)
                        getUrlByString(tmp_rLine, strUrl);
                }
                tmp_rLine = null;
            }
            is.close();
            pw.close();
            System.out.println("Get web successfully! " + strUrl);
            addReport("Get web successfully! " + strUrl + "\n");
            addWebSuccessed();
        } catch (Exception e) {
            System.out.println("Get web failed! " + strUrl);
            addReport("Get web failed! " + strUrl + "\n");
            addWebFailed();
        }
    }

    // Extract the site's domain from the home page URL
    public String getDomain() {
        String reg = "(?<=http\\://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv)";
        Pattern p = Pattern.compile(reg, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(strHomePage);
        boolean blnp = m.find();
        if (blnp == true) {
            return m.group(0);
        }
        return null;
    }

    // Find absolute links to the same domain in one line of HTML and queue any new ones.
    // Note: [http://] is a character class, not a literal "http://", so the match is
    // approximate, as the author acknowledges above.
    public void getUrlByString(String inputArgs, String strUrl) {
        String tmpStr = inputArgs;
        String regUrl = "(?<=(href=)[\"]?[\']?)[http://][^\\s\"\'\\?]*("
                + myDomain + ")[^\\s\"\'>]*";
        Pattern p = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(tmpStr);
        boolean blnp = m.find();
        // int i = 0;
        while (blnp == true) {
            if (!allUrls.containsKey(m.group(0))) {
                System.out.println("Find a new url,depth:"
                        + (deepUrls.get(strUrl) + 1) + " " + m.group(0));
                addReport("Find a new url,depth:" + (deepUrls.get(strUrl) + 1)
                        + " " + m.group(0) + "\n");
                arrUrls.add(m.group(0));
                arrUrl.add(m.group(0));
                allUrls.put(m.group(0), getIntWebIndex());
                deepUrls.put(m.group(0), (deepUrls.get(strUrl) + 1));
            }
            tmpStr = tmpStr.substring(m.end(), tmpStr.length());
            m = p.matcher(tmpStr);
            blnp = m.find();
        }
    }

    class Processer implements Runnable {

        GetWeb gw;

        public Processer(GetWeb g) {
            this.gw = g;
        }

        public void run() {
            // Thread.sleep(5000);
            // The empty check and getAUrl() are not atomic together, so the queue
            // can be drained by another thread in between; the resulting exception
            // simply ends this worker thread.
            while (!arrUrls.isEmpty()) {
                String tmp = getAUrl();
                getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
            }
        }
    }
}

posted on 2013-10-12 17:38 by 好不容易

