march alex's blog
          Hello, I am march alex
          posts - 52,comments - 7,trackbacks - 0
          Crawler類能夠通過寬度優先搜索不斷地抓取網站上的url。
          這裡需要用到FileHelper類的writeFile方法來寫入文件,以及URLAnalysis類的getContent方法來獲取網頁內容。
          代碼如下:
          import java.util.HashMap;
          import java.util.HashSet;
          import java.util.Iterator;
          import java.util.LinkedHashSet;
          import java.util.LinkedList;
          import java.util.Queue;
          import java.util.Set;


          public class Crawler {
              
              private static HashMap<String, Integer> map = new HashMap<String, Integer>();
              private static int count = 0;
              private static int max_count = 200000;
              
              public static String[] getLinks(String content) {
                  HashMap<String, Integer> map = new HashMap<String, Integer>();
                  int len = content.length();
                  
                  for(int i=0;i+9 < len;i++) {
                      if(content.substring(i, i+8).equals("\"http://") || content.substring(i, i+9).equals("\"https://")) {
                          String ss = new String();
                          for(int j=i+1;j<len && content.charAt(j) != '\"';j++) ss += content.charAt(j);
                          if(map.containsKey(ss)) continue;
                          map.put(ss, new Integer(1));
                      }
                  }
                  int N = map.size();
                  String[] ans = new String[N];
                  Iterator<String> iter = map.keySet().iterator();
                  int cnt = 0;
                  while (iter.hasNext()) {
                      String key = iter.next();
                      ans[cnt++] = key;
                  }
                  return ans;
              }
              
              private static boolean isPictureUrl(String url) {
                  int len = url.length();
                  if(url.substring(len-4, len).equals(".jpg") 
                          || url.substring(len-4, len).equals(".png") 
                          || url.substring(len-4, len).equals(".gif"))
                      return true;
                  return false;
              }
              
              public static void bfs(String u, String filename) {
                  String ans = "";
                  Queue<String> queue = new LinkedList<String>();
                  map.put(u, new Integer(1));
                  count ++;
                  queue.offer(u);
                  while ((u = queue.poll()) != null) {
                      System.out.println("digging in " + u);
                      System.out.println("have digged " + count + " pages now");
                      String content;
                      try {
                          content = URLAnalysis.getContent(u);
                          String[] res = getLinks(content);
                          int n = res.length;
                          for (int i = 0; i < n; i++) {
                              String v = res[i];
                              if (map.containsKey(v))
                                  continue;
                              count ++;
                              ans += v + "\n";
                              map.put(v, new Integer(1));
                              if(false == isPictureUrl(v))
                                  queue.offer(v);
                          }
                          if(count >= max_count) break;
                      } catch (Exception e) {
                          e.printStackTrace();
                      }
                  }
                  try {
                      FileHelper.writeFile(ans, filename);
                  } catch (Exception e) {
                      e.printStackTrace();
                  }
              }
              
              
              
              public static void main(String[] args) {
                  bfs("http://www.163.com", "D:\\test321\\urls.txt");
              }
          }

          下面是部分輸出內容:
          http://
          http://focus.news.163.com/15/0319/10/AL2INPO400011SM9.html
          http://lady.163.com/15/0317/14/AKTR681900264IJ2.html
          http://dajia.163.com/article/147.html#AL1GT1GU0095004J
          http://xf.house.163.com/qhd/search/0-0-0-0-0-0-0-0-1.html
          http://rd.da.netease.com/redirect?t=mwGQ3t&p=EA7B9E&target=http%3A%2F%2Fwww.kaola.com
          http://tech.163.com/15/0321/07/AL7C7U3R000915BF.html
          http://yuedu.163.com/book_reader/b39efe40b81843a8ac4eabdd3b756d92_4/cd59ff87a38e48eba21b312c4d26f2c7_4?utm_campaign=163ad&utm_source=163home&utm_medium=tab_1_2_7
          http://v.163.com/special/opencourse/financialmarkets.html
          http://paopao.163.com/schedule/show?pageId=4050&utm_source=163&utm_medium=wytab01&utm_campaign=warmup
          http://xf.house.163.com/zz/search/0-0-0-0-0-0-0-0-1.html
          http://sports.163.com/15/0321/10/AL7MA69F00052UUC.html
          http://ent.163.com/15/0321/01/AL6NG0GI00031H2L.html
          http://img2.cache.netease.com/lady/2014/3/1/201403012352473e66b.jpg
          http://love.163.com/?vendor=163.navi.icon&utm_source=163.com&utm_campaign=163navi
          http://caipiao.163.com/#from=www
          http://money.163.com/15/0321/08/AL7GDD1L00253B0H.html
          http://yichuangqingshu.lofter.com/post/21d053_641bd4b?act=qbwysylofer_20150101_01
          http://img4.cache.netease.com/tech/2015/3/21/20150321095714dd3c3.jpg
          http://m.163.com/iphone/index.html
          http://yuanst.blog.163.com/blog/static/186229043201522084612809/
          http://lady.163.com/15/0320/00/AL42J3UD00264OCL.html
          http://w.163.com/15/0320/15/AL5MBP6J00314C3U.html
          http://vhouse.163.com/1421889369882.html
          http://img2.cache.netease.com/edu/2015/3/20/2015032017293274fa5.jpg
          posted on 2015-03-21 16:59 marchalex 閱讀(409) 評論(0)  編輯  收藏 所屬分類: java小程序
          主站蜘蛛池模板: 仪征市| 鹿邑县| 临颍县| 肃北| 奉新县| 大厂| 朝阳区| 新郑市| 台江县| 湾仔区| 油尖旺区| 比如县| 青河县| 宜章县| 郴州市| 井陉县| 石屏县| 尼玛县| 开原市| 玛纳斯县| 金乡县| 彰化县| 平江县| 大港区| 莱州市| 广元市| 河曲县| 太和县| 玉溪市| 新宾| 宣武区| 乌拉特后旗| 迁西县| 陵川县| 汾阳市| 锡林浩特市| 兰西县| 伊春市| 砀山县| 东乡| 名山县|