ï»??xml version="1.0" encoding="utf-8" standalone="yes"?>欧美劲爆第一页,日韩成人影音,久久最新资源网http://www.aygfsteel.com/willpower88/category/37510.html对JAVA有点理解äº?ji¨£n)…â€?/description>zh-cnMon, 09 Feb 2009 08:36:13 GMTMon, 09 Feb 2009 08:36:13 GMT60lucene2.0+heritrix½CÞZ¾‹è¡¥å……http://www.aygfsteel.com/willpower88/archive/2009/02/09/253914.html一å‡?/dc:creator>一å‡?/author>Mon, 09 Feb 2009 07:44:00 GMThttp://www.aygfsteel.com/willpower88/archive/2009/02/09/253914.htmlhttp://www.aygfsteel.com/willpower88/comments/253914.htmlhttp://www.aygfsteel.com/willpower88/archive/2009/02/09/253914.html#Feedback0http://www.aygfsteel.com/willpower88/comments/commentRss/253914.htmlhttp://www.aygfsteel.com/willpower88/services/trackbacks/253914.html searchçš„Extractor代码如下åQŒï¼ˆåˆ«å’Œä¹¦ä¸Šå®žä¾‹ç›¸åŒåQ‰ä¾›å¤§å®¶å‚考:(x¨¬)附äšg里有完整代码
package com.luceneheritrixbook.extractor.younet;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Date;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;

import com.luceneheritrixbook.extractor.Extractor;
import com.luceneheritrixbook.util.StringUtils;

/**
 * <p></p>
 * 
@author cnyqiao@hotmail.com
 * @date   Feb 6, 2009 
 
*/

public class ExtractYounetMoblie extends Extractor {

    @Override
    
public void extract() {
        BufferedWriter bw 
= null;
        NodeFilter title_filter 
= new AndFilter(new TagNameFilter("div"), new HasAttributeFilter("class""mo_tit"));
        NodeFilter attribute_filter 
= new AndFilter(new TagNameFilter("p"), new HasChildFilter(new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class""gn_sp1 blue1"))));
        NodeFilter img_filter 
= new AndFilter(new TagNameFilter("span"), new HasChildFilter(new TagNameFilter("img")));
        
        
//提取标题?sh¨´)¿¡æ?/span>
        try {
            
//Parseræ ÒŽ(gu¨©)®˜q‡æ×o(h¨´)器返回所有满­‘Œ™¿‡æ»¤æ¡ä»¶çš„节点
            
// ˜q­ä»£é€æ¸æŸ¥æ‰¾
            NodeList nodeList=this.getParser().parse(title_filter);
            NodeIterator it 
= nodeList.elements();
            StringBuffer title 
= new StringBuffer();
            
while (it.hasMoreNodes()) {
                Node node 
= (Node) it.nextNode();
                String[] names 
= node.toPlainTextString().split(" ");
                
for(int i = 0; i < names.length; i++)
                    title.append(names[i]).append(
"-");
                title.append(
new Date().getTime());
                
//创徏要生成的文äšg
                bw = new BufferedWriter(new FileWriter(new File(this.getOutputPath() + title + ".txt")));
                
//获取当前提取™å늚„完整URL地址
                int startPos = this.getInuputFilePath().indexOf("mirror"+ 6;
                String url_seg 
= this.getInuputFilePath().substring(startPos);
                url_seg 
= url_seg.replaceAll("\\\\""/");
                String url 
= "http:/" + url_seg;
                
//写入当前提取™å늚„完整URL地址
                bw.write(url + NEWLINE);
                bw.write(names[
0+ NEWLINE);
                bw.write(names[
1+ NEWLINE);
                
            }
            
// é‡ç½®Parser
            this.getParser().reset();
            Parser attNameParser 
= null;
            Parser attValueParser 
= null;
            
//Parser parser=new Parser("http://www.sina.com.cn");
            NodeFilter attributeName_filter = new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class""gn_sp1 blue1"));
            NodeFilter attributeValue_filter 
= new AndFilter(new TagNameFilter("span"), new HasAttributeFilter("class""gn_sp2"));
            String attName 
= "";
            String attValue 
= "";
            
// ˜q­ä»£é€æ¸æŸ¥æ‰¾
            nodeList=this.getParser().parse(attribute_filter);
            it 
= nodeList.elements();
            
while (it.hasMoreNodes()) {                
                Node node 
= (Node) it.nextNode();
                attNameParser 
= new Parser();
                attNameParser.setEncoding(
"GB2312");
                attNameParser.setInputHTML(node.toHtml());
                NodeList attNameNodeList 
= attNameParser.parse(attributeName_filter);
                attName 
= attNameNodeList.elements().nextNode().toPlainTextString();
                
                attValueParser 
= new Parser();
                attValueParser.setEncoding(
"GB2312");
                attValueParser.setInputHTML(node.toHtml());
                NodeList attValueNodeList 
= attValueParser.parse(attributeValue_filter);
                attValue 
= attValueNodeList.elements().nextNode().toPlainTextString();
                bw.write(attName.trim() 
+ attValue.trim());
                bw.newLine();
            }
            
// é‡ç½®Parser
            this.getParser().reset();
            String imgUrl 
= "";
            String fileType 
="";
            
// ˜q­ä»£é€æ¸æŸ¥æ‰¾
            nodeList=this.getParser().parse(img_filter);
            it 
= nodeList.elements();
            
while (it.hasMoreNodes()) {                
                Node node 
= (Node) it.nextNode();
                
                ImageTag imgNode 
= (ImageTag)node.getChildren().elements().nextNode();
                imgUrl 
= imgNode.getAttribute("src");                
                fileType 
= imgUrl.trim().substring(imgUrl
                        .lastIndexOf(
"."+ 1);
                
//生成新的囄¡‰‡çš„æ–‡ä»¶å
                String new_iamge_file = StringUtils.encodePassword(imgUrl, HASH_ALGORITHM) + "." + fileType;
                
//imgUrl = new HtmlPaserFilterTest().replace(new_iamge_file, "+", " ");
                
//利用miorr目录下的囄¡‰‡ç”Ÿæˆçš„æ–°çš„图ç‰?/span>
                this.copyImage(imgUrl, new_iamge_file);
                bw.write(SEPARATOR 
+ NEWLINE);
                bw.write(new_iamge_file 
+ NEWLINE);
            }
            
            
        } 
catch(Exception e) {
            e.printStackTrace();
        } 
finally {
            
try{
                
if (bw != null)
                    bw.close();
            }
catch(IOException e){
                e.printStackTrace();
            }
        }
        
    }
}
˜qè¡Œä¹¦ä¸Šçš„heritrix实例åQŒåƈ按书上的默认讄¡½®˜q›è¡ŒæŠ“取如下åQµï¼²åQ©ï¼š(x¨¬)åQˆè¯·è‡ªå·±åˆ†æžæ•´ç†åQ?br />
http://mobile.younet.com/files/list_1.html
http://mobile.younet.com/files/list_2.html
http://mobile.younet.com/files/list_3.html



]]>
Ö÷Õ¾Ö©Öë³ØÄ£°å£º Õò°ÍÏØ| ÇàÁú| ÕýÑôÏØ| Îä¶¨ÏØ| ÀÖ¶«| ·ï»ËÏØ| Ðû»¯ÏØ| ÕÄÆÖÏØ| ¸ß°²ÊÐ| Ñ×ÁêÏØ| ÈéɽÊÐ| ÄþÏçÏØ| »ªÈÝÏØ| Áù°²ÊÐ| ʯʨÊÐ| ÓÒÓñÏØ| »á²ýÏØ| µÂ»¯ÏØ| ¹ÛÌÁÇø| ¾ÅÕ¯¹µÏØ| ÑÓ½òÏØ| ÇúÎÖÏØ| ÁÙÌ¶ÏØ| ¿ÂÆºÏØ| ´óÀíÊÐ| Ñ·¿ËÏØ| Ù¤Ê¦ÏØ| ÐÂÃÜÊÐ| ¤ÄÏÊÐ| ÇìÔªÏØ| Á¹É½| Æô¶«ÊÐ| Õ´ÒæÏØ| ¸£ÖÝÊÐ| ÁéÉ½ÏØ| ÔÆÁúÏØ| ÎÚËÕÊÐ| ¾¸Ô¶ÏØ| ¦·³ÏØ| Ó¦Óñر¸| ÌÁ¹ÁÇø|