Donnerstag Mai 28, 2009
HTML Inhalte säubern
Ich musste heute eine Lösung finden, um sicherzustellen, dass Inhalte, die auf öffentlich zugänglichen Formularen erstellt werden und dann öffentlich sichtbar werden, nicht irgendwelche bösen HTML-Hacks enthalten können. Zunächst wollte ich die Sache 'händisch' mit ein paar Regexen lösen, aber dann dachte ich mir, dass das ineffizient wäre, und habe ein bisschen recherchiert. Dabei sind mir zwei Tools in die Hände gefallen:
- htmlcleaner und
- ein alter Bekannter: NekoHtml
import static org.apache.commons.lang.StringUtils.isBlank;

import java.io.StringReader;
import java.io.StringWriter;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.cyberneko.html.filters.ElementRemover;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.InputSource;
…
/**
 * Cleans up a (user-provided) HTML fragment so that no dangerous markup is left.
 *
 * <p>Allowed elements are b, p, br, i, ol, ul, li, a (attributes href and title) and
 * img (attributes src, width, height and title). All other elements are removed while
 * their text content is left intact. script and iframe elements are removed entirely,
 * including their textual content.
 *
 * <p>After parsing, href and src attributes are dropped unless their value is a relative
 * URL or uses the http, https or mailto scheme, so that javascript:, vbscript: or data:
 * URLs cannot slip through the element/attribute whitelist.
 *
 * <p>The cleaned fragment is serialized with a JAXP {@link Transformer} with indentation
 * enabled and the XML declaration omitted.
 *
 * @param input the HTML fragment to clean; may be {@code null} or blank
 * @return the cleaned markup, or an empty string if {@code input} is blank or the
 *         fragment could not be parsed or serialized (the failure is logged as a warning)
 */
public static String cleanupHtmlFragment(String input) {
    if (isBlank(input)) {
        return "";
    }
    try {
        // create element remover filter
        ElementRemover remover = new ElementRemover();
        // set which elements (and which of their attributes) to accept
        remover.acceptElement("b", null);
        remover.acceptElement("p", null);
        remover.acceptElement("br", null);
        remover.acceptElement("i", null);
        remover.acceptElement("ol", null);
        remover.acceptElement("ul", null);
        remover.acceptElement("li", null);
        remover.acceptElement("a", new String[] { "href", "title" });
        remover.acceptElement("img", new String[] { "src", "width", "height", "title" });
        // completely remove script and iframe elements
        remover.removeElement("iframe");
        remover.removeElement("script");

        // Only the remover is needed in the filter chain. The NekoHTML Writer filter that
        // used to follow it was default-constructed and therefore echoed every cleaned
        // fragment to standard out; the actual result is serialized from the DOM below.
        XMLDocumentFilter[] filters = { remover };

        DOMFragmentParser parser = new DOMFragmentParser();
        parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
        parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "UTF-8");
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/doctype/pubid",
                "-//W3C//DTD XHTML 1.0 Transitional//EN");

        HTMLDocument document = new HTMLDocumentImpl();
        DocumentFragment fragment = document.createDocumentFragment();
        parser.parse(new InputSource(new StringReader(input)), fragment);

        // The whitelist above only looks at attribute names; check the URL values too.
        removeUnsafeUrlAttributes(fragment);

        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.setOutputProperty(OutputKeys.INDENT, "yes");
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        StringWriter output = new StringWriter();
        transformer.transform(new DOMSource(fragment), new StreamResult(output));
        return output.toString();
    } catch (Exception e) {
        // deliberate best effort: anything that cannot be cleaned is discarded completely
        log.warn("Couldn't parse string fragment with nekohtml", e);
        return "";
    }
}

/**
 * Removes href and src attributes with an unsafe URL from every element below {@code root}.
 * Walks the tree iteratively so that deeply nested input cannot exhaust the call stack.
 */
private static void removeUnsafeUrlAttributes(Node root) {
    Node node = root.getFirstChild();
    while (node != null) {
        if (node.getNodeType() == Node.ELEMENT_NODE) {
            Element element = (Element) node;
            if (element.hasAttribute("href") && !isSafeUrl(element.getAttribute("href"))) {
                element.removeAttribute("href");
            }
            if (element.hasAttribute("src") && !isSafeUrl(element.getAttribute("src"))) {
                element.removeAttribute("src");
            }
        }
        // depth-first: descend first, otherwise move to the next sibling of the
        // nearest ancestor (below root) that still has one
        Node next = node.getFirstChild();
        while (next == null && node != null && node != root) {
            next = node.getNextSibling();
            if (next == null) {
                node = node.getParentNode();
            }
        }
        node = next;
    }
}

/**
 * Tells whether {@code url} is relative or uses one of the schemes http, https or mailto.
 */
private static boolean isSafeUrl(String url) {
    // Drop whitespace and control characters and lower-case the rest before looking at
    // the scheme, so that variants such as "java\tscript:" or "JavaScript:" are caught.
    StringBuilder normalized = new StringBuilder(url.length());
    for (int i = 0; i < url.length(); i++) {
        char c = url.charAt(i);
        if (c > ' ' && c != 0x7f) {
            normalized.append(Character.toLowerCase(c));
        }
    }
    String value = normalized.toString();
    int colon = value.indexOf(':');
    if (colon < 0) {
        return true; // no scheme at all: relative URL
    }
    for (int i = 0; i < colon; i++) {
        char c = value.charAt(i);
        if (c == '/' || c == '?' || c == '#') {
            return true; // the colon belongs to path, query or fragment, not to a scheme
        }
    }
    String scheme = value.substring(0, colon);
    return "http".equals(scheme) || "https".equals(scheme) || "mailto".equals(scheme);
}
Posted at 04:07PM Mai 28, 2009 by joerg in Allgemein | Kommentare[0]
Kommentare: