MDA/MDD/TDD/DDD/DDDDDDD
          posts - 536, comments - 111, trackbacks - 0, articles - 0
            BlogJava :: 首頁 :: 新隨筆 :: 聯系 :: 聚合  :: 管理

          html的解析以及nekohtml的使用

          Posted on 2008-02-21 18:29 leekiang 閱讀(2898) 評論(0)  編輯  收藏 所屬分類: 文件處理
          import java.io.IOException;
          import java.io.StringReader;
          import java.io.UnsupportedEncodingException;

          import org.apache.html.dom.HTMLDocumentImpl;
          import org.cyberneko.html.parsers.DOMFragmentParser;
          import org.w3c.dom.DocumentFragment;
          import org.w3c.dom.Node;
          import org.w3c.dom.NodeList;
          import org.xml.sax.InputSource;
          import org.xml.sax.SAXException;

          ????
          /**
          ?????*?從html中抽取純文本
          ?????*?
          ?????*?
          @param ?content
          ?????*?
          @return
          ?????*?
          @throws ?UnsupportedEncodingException
          ?????
          */
          ????
          public ?String?extractTextFromHTML(String?content)
          ????????????
          throws ?UnsupportedEncodingException?{
          ????????DOMFragmentParser?parser?
          = ? new ?DOMFragmentParser();
          ????????DocumentFragment?node?
          = ? new ?HTMLDocumentImpl().createDocumentFragment();
          ????????InputStream?is?
          = ? new ?ByteArrayInputStream(content.getBytes());
          ????????
          try ?{
          ????????????parser.parse(
          new ?InputSource(is),?node);
          ????????}?
          catch ?(IOException?e)?{
          ????????????e.printStackTrace();
          ????????}?
          catch ?(SAXException?se)?{
          ????????????se.printStackTrace();
          ????????}

          ????????StringBuffer?newContent?
          = ? new ?StringBuffer();
          ????????
          this .getText(newContent,?node);

          ????????String?str?
          = ?( new ?String(
          ????????????????newContent.toString().getBytes(
          " Windows-1252 " ),? " GBK " ));
          ????????
          return ?str;
          ????}

          ????
          private ? void ?getText(StringBuffer?sb,?Node?node)?{
          ????????
          if ?(node.getNodeType()? == ?Node.TEXT_NODE)?{
          ????????????sb.append(node.getNodeValue());
          ????????}
          ????????NodeList?children?
          = ?node.getChildNodes();
          ????????
          if ?(children? != ? null )?{
          ????????????
          int ?len? = ?children.getLength();
          ????????????
          for ?( int ?i? = ? 0 ;?i? < ?len;?i ++ )?{
          ????????????????getText(sb,?children.item(i));
          ????????????}
          ????????}
          ????}

          1,nekohtml1.9.6.1版本用到了jdk5的Arrays.hashCode等方法,為兼容jdk1.4,
            故采用nekohtml1.9.6版本
          2,需要xerces.jar支持
          3,參考資料:
             http://hi.baidu.com/walkandsing/blog/item/f5743634c6ba2e3a5bb5f5e5.html
             http://blog.csdn.net/zhou2002/archive/2008/01/19/2053911.aspx
             http://playfish.javaeye.com/blog/150184

          4,
          python解析html
          http://lenciel.cn/docs/python-parser-of-xml/
          http://hi.baidu.com/javalang/blog/item/84bac4bf731fb80f18d81fe1.html
          ruby用hpricot

          主站蜘蛛池模板: 婺源县| 陆河县| 瑞金市| 兴化市| 错那县| 义马市| 宁都县| 宜宾市| 华宁县| 石景山区| 光泽县| 昭平县| 贺州市| 个旧市| 阜康市| 通道| 阿拉善右旗| 彝良县| 邵阳县| 湖北省| 蓬溪县| 从化市| 治县。| 江陵县| 陵水| 通海县| 东港市| 岐山县| 肥西县| 昂仁县| 浙江省| 共和县| 常德市| 仁布县| 元阳县| 砚山县| 高雄市| 延吉市| 祁阳县| 延安市| 黎平县|