View Javadoc

1   /*
2    * Copyright 2010-2013 Capgemini
3    * Licensed under the Apache License, Version 2.0 (the "License"); 
4    * you may not use this file except in compliance with the License. 
5    * You may obtain a copy of the License at 
6    * 
7    * http://www.apache.org/licenses/LICENSE-2.0 
8    * 
9    * Unless required by applicable law or agreed to in writing, software 
10   * distributed under the License is distributed on an "AS IS" BASIS, 
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12   * See the License for the specific language governing permissions and 
13   * limitations under the License. 
14   * 
15   */
16  package org.xmlfield.core.impl.dom.cleanup;
17  
18  import java.util.regex.Matcher;
19  import java.util.regex.Pattern;
20  
21  /**
22   * Utility class for handling invalid XML entities and characters.
23   * <p>
24   * Invalid characters/entity are replaced by the "unknown" character (\uFFFD).
25   * 
26   * @author Nicolas Richeton
27   */
public class InputSanitizer {

	/**
	 * Matches any single code point that is NOT allowed by the XML 1.0
	 * {@code Char} production: everything except TAB, LF, CR, U+0020-U+D7FF,
	 * U+E000-U+FFFD and the supplementary range U+10000-U+10FFFF (the latter is
	 * written in the regex as surrogate pairs).
	 */
	static final Pattern INVALID_XML_CHARS = Pattern
			.compile("[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\uD800\uDC00-\uDBFF\uDFFF]");

	/**
	 * Matches a decimal numeric character reference such as {@code &#38;}.
	 * Group 1 holds the digits. Hexadecimal references ({@code &#x26;}) are not
	 * matched by this pattern.
	 */
	static final Pattern XML_ENTITY = Pattern.compile("&#([0-9]+);");

	/** Decimal value of the replacement character U+FFFD. */
	private static final String REPLACEMENT_ENTITY_VALUE = "65533";

	/**
	 * Replace invalid characters by the unknown character (U+FFFD).
	 * 
	 * @param s
	 *            text to sanitize, may be <code>null</code>.
	 * @return sanitized text, or <code>null</code> if the input was
	 *         <code>null</code>.
	 */
	public static String sanitizeText(String s) {
		if (s == null) {
			return null;
		}
		return INVALID_XML_CHARS.matcher(s).replaceAll("\uFFFD");
	}

	/**
	 * Replace invalid decimal character references by the reference
	 * corresponding to the unknown character (<code>&amp;#65533;</code>).
	 * <p>
	 * A reference is valid when its value is a legal XML 1.0 character: 9, 10,
	 * 13, 0x20-0xD7FF, 0xE000-0xFFFD or 0x10000-0x10FFFF. Valid references are
	 * copied unchanged (leading zeros included). References whose value does
	 * not fit in an <code>int</code> are treated as invalid. Hexadecimal
	 * references are left untouched because they are not inspected.
	 * 
	 * @param xml
	 *            XML input to sanitize, may be <code>null</code>.
	 * @return sanitized XML input, or <code>null</code> if the input was
	 *         <code>null</code>.
	 */
	public static String sanitizeXml(String xml) {
		if (xml == null) {
			return null;
		}
		Matcher m = XML_ENTITY.matcher(xml);
		// StringBuffer (not StringBuilder) keeps compatibility with
		// Matcher.appendReplacement on pre-Java 9 runtimes.
		StringBuffer sb = new StringBuffer();
		while (m.find()) {
			String entityValue = m.group(1);
			if (!isValidEntityValue(entityValue)) {
				entityValue = REPLACEMENT_ENTITY_VALUE;
			}
			// entityValue only contains ASCII digits, so the replacement
			// string needs no escaping of '$' or '\'.
			m.appendReplacement(sb, "&#" + entityValue + ";");
		}
		m.appendTail(sb);

		return sb.toString();
	}

	/**
	 * Tells whether a string of decimal digits designates a legal XML
	 * character.
	 */
	private static boolean isValidEntityValue(String digits) {
		try {
			return isValidXmlCodePoint(Integer.parseInt(digits));
		} catch (NumberFormatException e) {
			// The pattern guarantees digits only, so the sole failure cause is
			// int overflow: the value is far beyond U+10FFFF, hence invalid.
			return false;
		}
	}

	/** Tells whether a code point is allowed by the XML 1.0 Char production. */
	private static boolean isValidXmlCodePoint(int codePoint) {
		return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD
				|| (codePoint >= 0x20 && codePoint <= 0xD7FF)
				|| (codePoint >= 0xE000 && codePoint <= 0xFFFD)
				|| (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
	}
}