使用SAX對XML根據具體需求過濾標簽和長度截取
需要解決的問題是:從輸入流讀取一段XML內容,然後對其進行過濾和截取,最後寫回輸出流中。具體說明如下:
1.對XML根據特定需求,過濾標籤(如SCRIPT、FRAME等不安全或不需要的HTML標籤),過濾屬性(如onclick、onblur等事件屬性)
2.對XML進行長度截取,具體做法如下:
(1)對start標簽的處理: 若加上start標簽長度后超過最大允許長度,則去除該標簽,且同時去除后面和該標簽同一等級的所有標簽。
(2)對text內容的處理:若加上text內容的長度后超過最大允許的長度,則從中截取text長度,并加上省略號......
(3)對end標簽內容的處理:不做長度截取,且要做到自動補齊end標簽。
有關SAX的詳細介紹,請查看最好的參考資料 http://www.saxproject.org/ 。其中有一個很重要的類 DefaultHandler, 該類中的startElement, endElement, characters 3個方法尤為重要。 為解決上述問題,需要設計2個類:HTMLWriter, HTMLFilter, 其中HTMLFilter是HTMLWriter的子類,HTMLWriter繼承了DefaultHandler,其中最為關鍵的是要重寫上述3個關鍵方法。
一.HTMLWriter類的代碼:
這個類主要用于寫操作,最重要是理解變量strippedElementLevel 的用法。上面問題的具體業務邏輯處理(標簽的過濾和長度截取)將在子類HTMLFilter 解決。
package org.util.sax.html;

import openxml.parser.HTMLdtd;
import openxml.parser.HTMLSAXParser;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.InputSource;
import org.xml.sax.XMLReader;
import org.xml.sax.ErrorHandler;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.DefaultHandler;

import java.io.*;



public class HTMLWriter extends DefaultHandler implements LexicalHandler {

private ErrorHandler errorHandler;

private Writer out;

private int strippedElementLevel = 0; //用來作為start標簽和end標簽成對出現的標記(極為重要),具體算法思路類似于堆棧
private boolean inRawElement;


public void filter(String htmlContent) throws IOException, SAXException {
filter(new StringReader(htmlContent));
}

public void filter(Reader in) throws IOException, SAXException {
filter(new InputSource(in));
}

public void filter(InputSource in) throws IOException, SAXException {
HTMLSAXParser parser = new HTMLSAXParser(errorHandler, false);
parser.setLexicalHandler(this);

XMLReader htmlReader = new HTMLParserAdapter(parser);
htmlReader.setFeature("http://xml.org/sax/features/namespaces", false);
htmlReader.setContentHandler(this);

prepare();
htmlReader.parse(in);
}


protected void prepare() {
if (out == null) {
out = new StringWriter();
}
}


public void setErrorHandler(ErrorHandler errorHandler) {
this.errorHandler = errorHandler;
}

public void setOut(Writer out) {
this.out = out;
}

public Writer getOut() {
return out;
}

public String getResultAsString() {
if (out instanceof StringWriter) {
return out.toString();
}
throw new IllegalStateException("Not a buffered target");
}


@Override
public void startDocument() throws SAXException {
prepare();
}


@Override
public final void startElement(String namespaceURI,
String localName,
String qName,
Attributes attrs) throws SAXException {
if (strippedElementLevel > 0) {
strippedElementLevel++;
return;
}

// features/namespace is false
if (!startTag(qName, attrs)) {
strippedElementLevel = 1;
}
}


@Override
public final void endElement(String namespaceURI,
String localName,
String qName) throws SAXException {
if (strippedElementLevel > 0) {
strippedElementLevel--;
return;
}

// features/namespace is false
endTag(qName);
}


protected boolean startTag(String tagName, Attributes attrs) throws SAXException {

String tagUpper = tagName.toUpperCase();

inRawElement = "SCRIPT".equals(tagUpper) || "STYLE".equals(tagUpper);

write('<');
write(tagName);
for (int i = 0; i < attrs.getLength(); i++) {
// features/namespace is false
String attrName = attrs.getQName(i);
attribute(tagUpper, attrName.toLowerCase(), attrName, attrs.getValue(i));
}
write('>');

return true;
}


protected void endTag(String tagName) throws SAXException {
inRawElement = false;
if (!isEmptyTag(tagName.toUpperCase())) {
write("</");
write(tagName);
write('>');
}
}


@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (strippedElementLevel != 0) {
return;
}

if (inRawElement) {
write(ch, start, length);
return;
}

text(ch, start, length);
}


protected void text(char[] ch, int start, int length) throws SAXException {
writeText(ch, start, length);
}


public void startDTD(String tagName, String publicId, String systemId) throws SAXException {
write("<!DOCTYPE ");
write(tagName);
write(" PUBLIC ");
write('"');
write(publicId);
write('"');
write('>');
}


public void endDTD() {}
public void startEntity(String name) {}
public void endEntity(String name) {}
public void startCDATA() {}
public void endCDATA() {}

public void comment(char ch[], int start, int length) throws SAXException {
/*
if (strippedElementLevel == 0) {
write("<!--");
write(ch, start, length);
write("-->");
}
*/
}


@Override
public void ignorableWhitespace(char ch[], int start, int length) throws SAXException {
if (strippedElementLevel == 0) {
write(ch, start, length);
}
}


protected void attribute(final String tagUpper, // 規范化的 TAG 名稱 - 使用大寫字母
final String attrLower, // 規范化的 屬性 名稱 - 使用小寫字母
String attrName,
String attrValue) throws SAXException {
write(' ');
write(attrName);
if (!isBoolean(attrLower, tagUpper)) {
write('=');
write('"');
for (int i = 0; i < attrValue.length(); i++) {
writeEncoded(attrValue.charAt(i), true);
}
write('"');
}
}


protected final void writeText(char[] ch, int start, int length) throws SAXException {
writeTextWithEnd(ch, start, start + length);
}


protected final void writeTextWithEnd(char[] ch, int begin, int end) throws SAXException {
for (int i = begin; i < end; i++) {
writeEncoded(ch[i], false);
}
}


protected void writeEncoded(char c, boolean isAttr) throws SAXException {
switch (c) {
case '<':
write("<");
break;
case '>':
write(">");
break;
case '&':
write("&");
break;
case 0xa0: // NBSP
// 暫時只特殊處理特殊字符 NBSP
// 當組信 NBSP 在轉換到純文本時可變成空格
// 但其它特殊字符沒有簡單的Ascii字符可替代, 因而這里也不執行替代
write(" ");
break;
case '"':
if (isAttr) {
write(""");
break;
}
default:
write(c);
}
}

protected void write(char c) throws SAXException {
try {
out.write(c);
} catch (IOException e) {
throw new SAXException(e);
}
}


protected void write(char ch[], int start, int length) throws SAXException {
try {
out.write(ch, start, length);
} catch (IOException e) {
throw new SAXException(e);
}
}

protected void write(String s) throws SAXException {
try {
out.write(s);
} catch (IOException e) {
throw new SAXException(e);
}
}


private static boolean isBoolean(String attrLower, String tagUpper) {
return HTMLdtd.isBoolean(attrLower, tagUpper);
}

private static boolean isEmptyTag(String tagUpper) {
return HTMLdtd.isEmptyTag(tagUpper);
}

}
package org.util.sax.html;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

import java.util.Map;
import java.io.Writer;
import java.io.CharArrayWriter;
import java.io.IOException;

public class HTMLFilter extends HTMLWriter {

ConfigManager conf = CM.getConfig();
Map<String, String> cidMap; //cid 和 正文內容圖片 filename的 映射
private int currentLen; //當前已經寫入out的長度
private int maxLen; //允許push的最大長度
private boolean ignore=false; //當出現要截取時,就設為 true ,意味著如果ignore 為true時, 就以后的內容都要忽略。

public HTMLFilter(Map<String,String> map,int allowMessage_BodyLen) {
//super.setAllowContentLen(allowMessage_BodyLen);
this.maxLen=allowMessage_BodyLen;
this.cidMap=map;
}

@Override
protected boolean startTag(String tagName, Attributes attrs) throws SAXException {
if (!isTagAllowed(tagName, attrs)) {
return false;
}

if (ignore) {
return false;
}

Writer originalOutput = getOut();
int remainChars = getRemainChars();

if(remainChars == 0){
ignore = true;
write("
");
return false;
}

CharArrayWriter capturedOutput = new CharArrayWriter();
setOut(capturedOutput);

try {
if (super.startTag(tagName, attrs)) {
if (capturedOutput.toCharArray().length < remainChars) {
try {
originalOutput.write(capturedOutput.toCharArray());
return true;
} catch (IOException e) {
throw new SAXException(e);
}
}
}
} finally {
setOut(originalOutput);
}

ignore = true;
write("
");
return false;
}


@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (ignore) { //如果長度已經超出限制,則不寫
return;
}
int remainChars = getRemainChars();

if (remainChars == 0) {
ignore = true;
write("
");
return;
}

if (remainChars < length) { //當將要寫入的 text 長度 大于 remainChars 時, 就寫入所能夠寫入的字符,然后添加省略號

ignore = true;
super.characters(ch, start, remainChars);
write("
");
} else {
super.characters(ch, start, length);
}
}

@Override
protected void endTag(String tagName) throws SAXException {
super.endTag(tagName);
}

public void comment(char ch[], int start, int length) throws SAXException{
if(ignore){
return;
}
int remainChars = getRemainChars();

if (remainChars == 0) {
ignore = true;
write("
");
return;
}
if (remainChars < length) {
ignore=true;
super.comment(ch, start, remainChars);
} else {
super.comment(ch, start, length);
}

}

@Override
protected void attribute(final String tagUpper,
final String attrLower,
final String attrName,
String attrValue) throws SAXException {

if (attrLower.startsWith("on")) {
return;
}
if (tagUpper.equalsIgnoreCase("IMG") && attrLower.equalsIgnoreCase("src") && attrValue.trim().indexOf("cid:") != -1) {
attrValue=attrValue.trim();
int cid_idx = attrValue.indexOf("cid:");
String cid = attrValue.substring(cid_idx + 4);
// System.out.println("cid is: "+ cid);
String photoName = cidMap.get(cid);
// System.out.println("photoName is: "+ photoName);
if (photoName != null) {
super.attribute(tagUpper, attrLower, attrName, "#{" + photoName + "}");
} else{
super.attribute(tagUpper, attrLower, attrName, "#{" + " " + "}");
}


} else {
attrValue = transformScript(attrValue);
super.attribute(tagUpper, attrLower, attrName, attrValue);
}
}

private String transformScript(final String data) {
if (true) {
final String trimedData = data.trim();
final String scriptData = mySubstringAfterIgnoreCase(trimedData, "javascript:");
if (scriptData != null) {
return "";
}
}
return data;
}

protected boolean isTagAllowed(String tagName, Attributes attrs) {
if (tagName.equalsIgnoreCase("SCRIPT")) {
return false;
}
if(tagName.equalsIgnoreCase("A")){ //超鏈接標簽不push
return false;
}
if (tagName.equalsIgnoreCase("PARAM")) {
String name = getAttrIgnoreCase(attrs, "name");
if ("movie".equalsIgnoreCase(name) || "src".equalsIgnoreCase(name)) {
return false;
}
}
/*
if (tagName.equalsIgnoreCase("STYLE")) {
return false;
}
*/
if (tagName.equalsIgnoreCase("LINK") &&
"stylesheet".equalsIgnoreCase(getAttrIgnoreCase(attrs, "rel"))) {
return false;
}
if (tagName.equals("FRAME") || tagName.equals("FRAMESET")) {
return false;
}
return true;
}


private static String getAttrIgnoreCase(Attributes attrs, String name) {
for (int i = 0, len = attrs.getLength(); i < len; i++) {
if (name.equalsIgnoreCase(attrs.getQName(i))) {
return attrs.getValue(i);
}
}
return null;
}


/**
* 忽略控制字符后, 判斷是否以某字符串開始, 并返回匹配后的截取部分.
* <p/>
* <p/>
* 注: 忽略控制字符是為了對付IE的安全漏洞
*
* @param source 源字符串
* @param prefix 要匹配的前綴字符串
* @return 如果測試成功, 返回截取后的字符串; 否則, 返回 null;
*/
static String mySubstringAfterIgnoreCase(String source, String prefix) {
int sourceLength = source.length();
int targetLength = prefix.length();

if (sourceLength < targetLength) {
return null;
}

int sourceOffset = 0;
int targetOffset = 0;
char targetChar = Character.toUpperCase(prefix.charAt(targetOffset));

for (; sourceOffset < sourceLength; sourceOffset++) {
char c = source.charAt(sourceOffset);
if (c < ' ') {
// 忽略控制字符
continue;
}

if (Character.toUpperCase(c) != targetChar) {
break;
}

targetOffset++;
if (targetOffset == targetLength) {
return source.substring(sourceOffset + 1);
}

targetChar = Character.toUpperCase(prefix.charAt(targetOffset));
}

return null;
}

protected void write(char c) throws SAXException {
super.write(c);
currentLen++;
}

protected void write(char ch[], int start, int length) throws SAXException {
super.write(ch, start, length);
currentLen += length;
}

protected void write(String s) throws SAXException {
super.write(s);
currentLen += s.length();
}

protected int getRemainChars(){ //求出還剩多少個字符可以寫入
return (maxLen - currentLen);
}


}
1.對XML根據特定需求,過濾標籤(如SCRIPT、FRAME等不安全或不需要的HTML標籤),過濾屬性(如onclick、onblur等事件屬性)
2.對XML進行長度截取,具體做法如下:
(1)對start標簽的處理: 若加上start標簽長度后超過最大允許長度,則去除該標簽,且同時去除后面和該標簽同一等級的所有標簽。
(2)對text內容的處理:若加上text內容的長度后超過最大允許的長度,則從中截取text長度,并加上省略號......
(3)對end標簽內容的處理:不做長度截取,且要做到自動補齊end標簽。
有關SAX的詳細介紹,請查看最好的參考資料 http://www.saxproject.org/ 。其中有一個很重要的類 DefaultHandler, 該類中的startElement, endElement, characters 3個方法尤為重要。 為解決上述問題,需要設計2個類:HTMLWriter, HTMLFilter, 其中HTMLFilter是HTMLWriter的子類,HTMLWriter繼承了DefaultHandler,其中最為關鍵的是要重寫上述3個關鍵方法。
一.HTMLWriter類的代碼:
這個類主要用于寫操作,最重要是理解變量strippedElementLevel 的用法。上面問題的具體業務邏輯處理(標簽的過濾和長度截取)將在子類HTMLFilter 解決。
































































































































































































































































































二. HTMLFilter 類的代碼:
主要解決標簽過濾,即哪些標簽和屬性需要過濾,解決長度截取問題,即斷點出現在startTag,text,endTag的情況應該如何解決。
主要理解重寫父類HTMLWriter的幾個方法:startTag(),characters(),comment(),attribute(), 另外需要一個成員變量currentLen記錄當前寫入的長度,在進行write()方法時要對currentLen變量進行疊加。




















































































































































































































































































posted on 2008-09-01 21:26 cong 閱讀(930) 評論(0) 編輯 收藏 所屬分類: JAVA