1 /*
2 * Copyright 2010-2013 Capgemini
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 *
15 */
16 package org.xmlfield.core.impl.dom.cleanup;
17
18 import java.util.regex.Matcher;
19 import java.util.regex.Pattern;
20
/**
 * Utility class for handling invalid XML entities and characters.
 * <p>
 * Invalid characters and invalid decimal character references are replaced by
 * the Unicode replacement character (U+FFFD).
 * <p>
 * A code point is considered valid when it matches the XML 1.0 {@code Char}
 * production: U+0009, U+000A, U+000D, U+0020 to U+D7FF, U+E000 to U+FFFD and
 * U+10000 to U+10FFFF.
 *
 * @author Nicolas Richeton
 */
public class InputSanitizer {

    /**
     * Matches any single code point that is not a legal XML 1.0 character.
     * The trailing surrogate pairs in the literal are read by the regex
     * engine as the supplementary range U+10000 to U+10FFFF.
     */
    static final Pattern INVALID_XML_CHARS = Pattern
            .compile("[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\uD800\uDC00-\uDBFF\uDFFF]");

    /** Matches a decimal character reference; group 1 holds the digits. */
    static final Pattern XML_ENTITY = Pattern.compile("&#([0-9]+);");

    /** Decimal value of U+FFFD, used to replace invalid references. */
    private static final String REPLACEMENT_CODE_POINT = "65533";

    /**
     * Replace invalid characters by the unknown character (U+FFFD).
     *
     * @param s
     *            text to sanitize, may be <code>null</code>.
     * @return sanitized text, or <code>null</code> if the input was
     *         <code>null</code>.
     */
    public static String sanitizeText(String s) {
        if (s == null) {
            return null;
        }
        return INVALID_XML_CHARS.matcher(s).replaceAll("\uFFFD");
    }

    /**
     * Replace invalid decimal character references by the reference to the
     * unknown character ({@code &#65533;}).
     * <p>
     * Only decimal references of the form {@code &#NNN;} are examined;
     * hexadecimal references ({@code &#xHHH;}) and named entities are left
     * untouched. References whose value does not fit in an <code>int</code>
     * are treated as invalid instead of raising an exception.
     *
     * @param xml
     *            XML input to sanitize, may be <code>null</code>.
     * @return sanitized XML input, or <code>null</code> if the input was
     *         <code>null</code>.
     */
    public static String sanitizeXml(String xml) {
        if (xml == null) {
            return null;
        }
        Matcher m = XML_ENTITY.matcher(xml);
        // StringBuffer is kept because Matcher.appendReplacement only accepts
        // a StringBuilder on newer Java versions.
        StringBuffer sb = new StringBuffer(xml.length());
        while (m.find()) {
            String entityValue = m.group(1);
            if (!isValidCharacterReference(entityValue)) {
                entityValue = REPLACEMENT_CODE_POINT;
            }
            // entityValue contains digits only, so it cannot hold the '$' or
            // '\' characters that appendReplacement treats specially.
            m.appendReplacement(sb, "&#" + entityValue + ";");
        }
        m.appendTail(sb);

        return sb.toString();
    }

    /** Tells whether a string of decimal digits designates a legal XML 1.0 character. */
    private static boolean isValidCharacterReference(String digits) {
        int codePoint;
        try {
            codePoint = Integer.parseInt(digits);
        } catch (NumberFormatException e) {
            // Too large for an int, hence far beyond the last code point.
            return false;
        }
        return isValidXmlCodePoint(codePoint);
    }

    /** Tells whether a code point matches the XML 1.0 Char production. */
    private static boolean isValidXmlCodePoint(int codePoint) {
        return codePoint == 0x9
                || codePoint == 0xA
                || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }
}