lycong

使用 SAX 根據具體需求對 XML 進行標籤過濾和長度截取

需要解決的問題是：從一個輸入流讀取一段 XML 內容，然後對其進行過濾和截取，最後寫回輸出流中。具體說明如下：
1. 對 XML 根據特定需求過濾標籤（如 SCRIPT、FRAME 等非標準 HTML 標籤）和屬性（如 onclick、onblur 等）。
2. 對 XML 進行長度截取，具體做法如下：
（1）對 start 標籤的處理：若加上 start 標籤的長度後超過最大允許長度，則去除該標籤，同時去除其後與該標籤同一層級的所有標籤。
（2）對 text 內容的處理：若加上 text 內容的長度後超過最大允許長度，則只截取能容納的那部分 text，並在末尾加上省略號「......」。
（3）對 end 標籤的處理：不做長度截取，並且要自動補齊 end 標籤。

有關 SAX 的詳細介紹，請參閱最好的參考資料 http://www.saxproject.org/ 。其中有一個很重要的類 DefaultHandler，該類中的 startElement、endElement、characters 這 3 個方法尤為重要。為解決上述問題，需要設計 2 個類：HTMLWriter 和 HTMLFilter。HTMLWriter 繼承了 DefaultHandler，HTMLFilter 則是 HTMLWriter 的子類；其中最關鍵的是要重寫上述 3 個方法。

一.HTMLWriter類的代碼：
這個類主要用於寫操作，最重要的是理解變量 strippedElementLevel 的用法。上面問題的具體業務邏輯處理（標籤的過濾和長度截取）將在子類 HTMLFilter 中解決。

package org.util.sax.html

import openxml.parser.HTMLdtd;

import openxml.parser.HTMLSAXParser;

import org.xml.sax.Attributes;

import org.xml.sax.SAXException;

import org.xml.sax.InputSource;

import org.xml.sax.XMLReader;

import org.xml.sax.ErrorHandler;

import org.xml.sax.ext.LexicalHandler;

import org.xml.sax.helpers.DefaultHandler;

import java.io.*;

public class HTMLWriter extends DefaultHandler implements LexicalHandler {

private ErrorHandler errorHandler;

private Writer out;

private int strippedElementLevel = 0; //用來作為start標簽和end標簽成對出現的標記（極為重要），具體算法思路類似于堆棧

private boolean inRawElement;

public void filter(String htmlContent) throws IOException, SAXException {

filter(new StringReader(htmlContent));

}

public void filter(Reader in) throws IOException, SAXException {

filter(new InputSource(in));

}

public void filter(InputSource in) throws IOException, SAXException {

HTMLSAXParser parser = new HTMLSAXParser(errorHandler, false);

parser.setLexicalHandler(this);

XMLReader htmlReader = new HTMLParserAdapter(parser);

htmlReader.setFeature("http://xml.org/sax/features/namespaces", false);

htmlReader.setContentHandler(this);

prepare();

htmlReader.parse(in);

}

protected void prepare() {

if (out == null) {

out = new StringWriter();

}

public void setErrorHandler(ErrorHandler errorHandler) {

this.errorHandler = errorHandler;

}

public void setOut(Writer out) {

this.out = out;

}

public Writer getOut() {

return out;

}

public String getResultAsString() {

if (out instanceof StringWriter) {

return out.toString();

}

throw new IllegalStateException("Not a buffered target");

}

@Override

public void startDocument() throws SAXException {

prepare();

}

@Override

public final void startElement(String namespaceURI,

String localName,

String qName,

Attributes attrs) throws SAXException {

if (strippedElementLevel > 0) {

strippedElementLevel++;

return;

}

// features/namespace is false

if (!startTag(qName, attrs)) {

strippedElementLevel = 1;

}

@Override

public final void endElement(String namespaceURI,

String localName,

String qName) throws SAXException {

if (strippedElementLevel > 0) {

strippedElementLevel--;

return;

}

// features/namespace is false

endTag(qName);

}

protected boolean startTag(String tagName, Attributes attrs) throws SAXException {

String tagUpper = tagName.toUpperCase();

inRawElement = "SCRIPT".equals(tagUpper) || "STYLE".equals(tagUpper);

write('<');

write(tagName);

for (int i = 0; i < attrs.getLength(); i++) {

// features/namespace is false

String attrName = attrs.getQName(i);

attribute(tagUpper, attrName.toLowerCase(), attrName, attrs.getValue(i));

}

write('>');

return true;

}

protected void endTag(String tagName) throws SAXException {

inRawElement = false;

if (!isEmptyTag(tagName.toUpperCase())) {

write("</");

write(tagName);

write('>');

}

@Override

public void characters(char[] ch, int start, int length) throws SAXException {

if (strippedElementLevel != 0) {

return;

}

if (inRawElement) {

write(ch, start, length);

return;

}

text(ch, start, length);

}

protected void text(char[] ch, int start, int length) throws SAXException {

writeText(ch, start, length);

}

public void startDTD(String tagName, String publicId, String systemId) throws SAXException {

write("<!DOCTYPE ");

write(tagName);

write(" PUBLIC ");

write('"');

write(publicId);

write('"');

write('>');

}

public void endDTD() {}

public void startEntity(String name) {}

public void endEntity(String name) {}

public void startCDATA() {}

public void endCDATA() {}

public void comment(char ch[], int start, int length) throws SAXException {

/*

if (strippedElementLevel == 0) {

write("<!--");

write(ch, start, length);

write("-->");

}

*/

}

@Override

public void ignorableWhitespace(char ch[], int start, int length) throws SAXException {

if (strippedElementLevel == 0) {

write(ch, start, length);

}

protected void attribute(final String tagUpper, // 規范化的 TAG 名稱 - 使用大寫字母

final String attrLower, // 規范化的屬性名稱 - 使用小寫字母

String attrName,

String attrValue) throws SAXException {

write(' ');

write(attrName);

if (!isBoolean(attrLower, tagUpper)) {

write('=');

write('"');

for (int i = 0; i < attrValue.length(); i++) {

writeEncoded(attrValue.charAt(i), true);

}

write('"');

}

protected final void writeText(char[] ch, int start, int length) throws SAXException {

writeTextWithEnd(ch, start, start + length);

}

protected final void writeTextWithEnd(char[] ch, int begin, int end) throws SAXException {

for (int i = begin; i < end; i++) {

writeEncoded(ch[i], false);

}

protected void writeEncoded(char c, boolean isAttr) throws SAXException {

switch (c) {

case '<':

write("<");

break;

case '>':

write(">");

break;

case '&':

write("&");

break;

case 0xa0: // NBSP

// 暫時只特殊處理特殊字符 NBSP

// 當組信 NBSP 在轉換到純文本時可變成空格

// 但其它特殊字符沒有簡單的Ascii字符可替代, 因而這里也不執行替代

write(" ");

break;

case '"':

if (isAttr) {

write(""");

break;

}

default:

write(c);

}

protected void write(char c) throws SAXException {

try {

out.write(c);

} catch (IOException e) {

throw new SAXException(e);

}

protected void write(char ch[], int start, int length) throws SAXException {

try {

out.write(ch, start, length);

} catch (IOException e) {

throw new SAXException(e);

}

protected void write(String s) throws SAXException {

try {

out.write(s);

} catch (IOException e) {

throw new SAXException(e);

}

private static boolean isBoolean(String attrLower, String tagUpper) {

return HTMLdtd.isBoolean(attrLower, tagUpper);

}

private static boolean isEmptyTag(String tagUpper) {

return HTMLdtd.isEmptyTag(tagUpper);

}

二. HTMLFilter 類的代碼：
主要解決兩個問題：一是標籤過濾，即哪些標籤和屬性需要過濾；二是長度截取，即截斷點出現在 startTag、text、endTag 時分別應該如何處理。
主要需要理解重寫父類 HTMLWriter 的幾個方法：startTag()、characters()、comment()、attribute()。另外需要一個成員變量 currentLen 記錄當前已寫入的長度，每次執行 write() 方法時都要對 currentLen 進行累加。

package org.util.sax.html;

import org.xml.sax.Attributes;

import org.xml.sax.SAXException;

import java.util.Map;

import java.io.Writer;

import java.io.CharArrayWriter;

import java.io.IOException;

public class HTMLFilter extends HTMLWriter {

ConfigManager conf = CM.getConfig();

Map<String, String> cidMap; //cid 和正文內容圖片 filename的映射

private int currentLen; //當前已經寫入out的長度

private int maxLen; //允許push的最大長度

private boolean ignore=false; //當出現要截取時，就設為 true ,意味著如果ignore 為true時，就以后的內容都要忽略。

public HTMLFilter(Map<String,String> map,int allowMessage_BodyLen) {

//super.setAllowContentLen(allowMessage_BodyLen);

this.maxLen=allowMessage_BodyLen;

this.cidMap=map;

}

@Override

protected boolean startTag(String tagName, Attributes attrs) throws SAXException {

if (!isTagAllowed(tagName, attrs)) {

return false;

}

if (ignore) {

return false;

}

Writer originalOutput = getOut();

int remainChars = getRemainChars();

if(remainChars == 0){

ignore = true;

write("

");

return false;

}

CharArrayWriter capturedOutput = new CharArrayWriter();

setOut(capturedOutput);

try {

if (super.startTag(tagName, attrs)) {

if (capturedOutput.toCharArray().length < remainChars) {

try {

originalOutput.write(capturedOutput.toCharArray());

return true;

} catch (IOException e) {

throw new SAXException(e);

}

} finally {

setOut(originalOutput);

}

ignore = true;

write("

");

return false;

}

@Override

public void characters(char[] ch, int start, int length) throws SAXException {

if (ignore) { //如果長度已經超出限制，則不寫

return;

}

int remainChars = getRemainChars();

if (remainChars == 0) {

ignore = true;

write("

");

return;

}

if (remainChars < length) { //當將要寫入的 text 長度大于 remainChars 時，就寫入所能夠寫入的字符，然后添加省略號

ignore = true;

super.characters(ch, start, remainChars);

write("

");

} else {

super.characters(ch, start, length);

}

@Override

protected void endTag(String tagName) throws SAXException {

super.endTag(tagName);

}

public void comment(char ch[], int start, int length) throws SAXException{

if(ignore){

return;

}

int remainChars = getRemainChars();

if (remainChars == 0) {

ignore = true;

write("

");

return;

}

if (remainChars < length) {

ignore=true;

super.comment(ch, start, remainChars);

} else {

super.comment(ch, start, length);

}

@Override

protected void attribute(final String tagUpper,

final String attrLower,

final String attrName,

String attrValue) throws SAXException {

if (attrLower.startsWith("on")) {

return;

}

if (tagUpper.equalsIgnoreCase("IMG") && attrLower.equalsIgnoreCase("src") && attrValue.trim().indexOf("cid:") != -1) {

attrValue=attrValue.trim();

int cid_idx = attrValue.indexOf("cid:");

String cid = attrValue.substring(cid_idx + 4);

// System.out.println("cid is: "+ cid);

String photoName = cidMap.get(cid);

// System.out.println("photoName is: "+ photoName);

if (photoName != null) {

super.attribute(tagUpper, attrLower, attrName, "#{" + photoName + "}");

} else{

super.attribute(tagUpper, attrLower, attrName, "#{" + " " + "}");

}

} else {

attrValue = transformScript(attrValue);

super.attribute(tagUpper, attrLower, attrName, attrValue);

}

private String transformScript(final String data) {

if (true) {

final String trimedData = data.trim();

final String scriptData = mySubstringAfterIgnoreCase(trimedData, "javascript:");

if (scriptData != null) {

return "";

}

return data;

}

protected boolean isTagAllowed(String tagName, Attributes attrs) {

if (tagName.equalsIgnoreCase("SCRIPT")) {

return false;

}

if(tagName.equalsIgnoreCase("A")){ //超鏈接標簽不push

return false;

}

if (tagName.equalsIgnoreCase("PARAM")) {

String name = getAttrIgnoreCase(attrs, "name");

if ("movie".equalsIgnoreCase(name) || "src".equalsIgnoreCase(name)) {

return false;

}

/*

if (tagName.equalsIgnoreCase("STYLE")) {

return false;

}

*/

if (tagName.equalsIgnoreCase("LINK") &&

"stylesheet".equalsIgnoreCase(getAttrIgnoreCase(attrs, "rel"))) {

return false;

}

if (tagName.equals("FRAME") || tagName.equals("FRAMESET")) {

return false;

}

return true;

}

private static String getAttrIgnoreCase(Attributes attrs, String name) {

for (int i = 0, len = attrs.getLength(); i < len; i++) {

if (name.equalsIgnoreCase(attrs.getQName(i))) {

return attrs.getValue(i);

}

return null;

}

/**

* 忽略控制字符后, 判斷是否以某字符串開始, 并返回匹配后的截取部分.

* <p/>

* 注: 忽略控制字符是為了對付IE的安全漏洞

*

* @param source 源字符串

* @param prefix 要匹配的前綴字符串

* @return 如果測試成功, 返回截取后的字符串; 否則, 返回 null;

*/

static String mySubstringAfterIgnoreCase(String source, String prefix) {

int sourceLength = source.length();

int targetLength = prefix.length();

if (sourceLength < targetLength) {

return null;

}

int sourceOffset = 0;

int targetOffset = 0;

char targetChar = Character.toUpperCase(prefix.charAt(targetOffset));

for (; sourceOffset < sourceLength; sourceOffset++) {

char c = source.charAt(sourceOffset);

if (c < ' ') {

// 忽略控制字符

continue;

}

if (Character.toUpperCase(c) != targetChar) {

break;

}

targetOffset++;

if (targetOffset == targetLength) {

return source.substring(sourceOffset + 1);

}

targetChar = Character.toUpperCase(prefix.charAt(targetOffset));

}

return null;

}

protected void write(char c) throws SAXException {

super.write(c);

currentLen++;

}

protected void write(char ch[], int start, int length) throws SAXException {

super.write(ch, start, length);

currentLen += length;

}

protected void write(String s) throws SAXException {

super.write(s);

currentLen += s.length();

}

protected int getRemainChars(){ //求出還剩多少個字符可以寫入

return (maxLen - currentLen);

}

posted on 2008-09-01 21:26 cong 閱讀(930) 評論(0) 編輯收藏所屬分類: JAVA

新用戶注冊刷新評論列表


只有注冊用戶登錄后才能發表評論。




網站導航: 博客園 IT新聞 Chat2DB C++博客博問管理

lycong

使用SAX對XML根據具體需求過濾標簽和長度截取

My Links

Blog Stats

常用鏈接

留言簿(1)

隨筆分類

隨筆檔案

搜索

最新評論

閱讀排行榜

評論排行榜