import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;

import org.apache.html.dom.HTMLDocumentImpl;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
???? /**
?????*?從html中抽取純文本
?????*?
?????*? @param ?content
?????*? @return
?????*? @throws ?UnsupportedEncodingException
????? */
???? public ?String?extractTextFromHTML(String?content)
???????????? throws ?UnsupportedEncodingException?{
????????DOMFragmentParser?parser? = ? new ?DOMFragmentParser();
????????DocumentFragment?node? = ? new ?HTMLDocumentImpl().createDocumentFragment();
????????InputStream?is? = ? new ?ByteArrayInputStream(content.getBytes());
???????? try ?{
????????????parser.parse( new ?InputSource(is),?node);
????????}? catch ?(IOException?e)?{
????????????e.printStackTrace();
????????}? catch ?(SAXException?se)?{
????????????se.printStackTrace();
????????}
????????StringBuffer?newContent? = ? new ?StringBuffer();
???????? this .getText(newContent,?node);
????????String?str? = ?( new ?String(
????????????????newContent.toString().getBytes( " Windows-1252 " ),? " GBK " ));
???????? return ?str;
????}
???? private ? void ?getText(StringBuffer?sb,?Node?node)?{
???????? if ?(node.getNodeType()? == ?Node.TEXT_NODE)?{
????????????sb.append(node.getNodeValue());
????????}
????????NodeList?children? = ?node.getChildNodes();
???????? if ?(children? != ? null )?{
???????????? int ?len? = ?children.getLength();
???????????? for ?( int ?i? = ? 0 ;?i? < ?len;?i ++ )?{
????????????????getText(sb,?children.item(i));
????????????}
????????}
????}
???? import org.apache.html.dom.HTMLDocumentImpl;
???? import ?org.w3c.dom.DocumentFragment;
???? import ?org.w3c.dom.Node;
???? import ?org.w3c.dom.NodeList;
???? import ?org.xml.sax.InputSource;
???? import ?org.xml.sax.SAXException;
???? /**
?????*?從html中抽取純文本
?????*?
?????*? @param ?content
?????*? @return
?????*? @throws ?UnsupportedEncodingException
????? */
???? public ?String?extractTextFromHTML(String?content)
???????????? throws ?UnsupportedEncodingException?{
????????DOMFragmentParser?parser? = ? new ?DOMFragmentParser();
????????DocumentFragment?node? = ? new ?HTMLDocumentImpl().createDocumentFragment();
????????InputStream?is? = ? new ?ByteArrayInputStream(content.getBytes());
???????? try ?{
????????????parser.parse( new ?InputSource(is),?node);
????????}? catch ?(IOException?e)?{
????????????e.printStackTrace();
????????}? catch ?(SAXException?se)?{
????????????se.printStackTrace();
????????}
????????StringBuffer?newContent? = ? new ?StringBuffer();
???????? this .getText(newContent,?node);
????????String?str? = ?( new ?String(
????????????????newContent.toString().getBytes( " Windows-1252 " ),? " GBK " ));
???????? return ?str;
????}
???? private ? void ?getText(StringBuffer?sb,?Node?node)?{
???????? if ?(node.getNodeType()? == ?Node.TEXT_NODE)?{
????????????sb.append(node.getNodeValue());
????????}
????????NodeList?children? = ?node.getChildNodes();
???????? if ?(children? != ? null )?{
???????????? int ?len? = ?children.getLength();
???????????? for ?( int ?i? = ? 0 ;?i? < ?len;?i ++ )?{
????????????????getText(sb,?children.item(i));
????????????}
????????}
????}
1, nekohtml 1.9.6.1 版本用到了 JDK 5 的 Arrays.hashCode 等方法;為兼容 JDK 1.4,
   故采用 nekohtml 1.9.6 版本。
2,需要xerces.jar支持
3, 參考資料:
   http://hi.baidu.com/walkandsing/blog/item/f5743634c6ba2e3a5bb5f5e5.html
   http://blog.csdn.net/zhou2002/archive/2008/01/19/2053911.aspx
   http://playfish.javaeye.com/blog/150184
4,
python解析html
http://lenciel.cn/docs/python-parser-of-xml/
http://hi.baidu.com/javalang/blog/item/84bac4bf731fb80f18d81fe1.html
ruby用hpricot