1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package org.xmlfield.core.impl.dom.cleanup;
17
18 import java.util.regex.Matcher;
19 import java.util.regex.Pattern;
20
21
22
23
24
25
26
27
/**
 * Static helpers that neutralise characters which are not allowed in an XML
 * 1.0 document.
 * <p>
 * The set of permitted characters is the XML 1.0 {@code Char} production:
 * U+0009, U+000A, U+000D, U+0020 to U+D7FF, U+E000 to U+FFFD and U+10000 to
 * U+10FFFF. Anything else is replaced by U+FFFD (REPLACEMENT CHARACTER).
 */
public class InputSanitizer {

    /**
     * Matches any single code point that lies outside the XML 1.0 character
     * set. The trailing surrogate pairs in the character class are read by
     * {@link Pattern} as the code point range U+10000 to U+10FFFF.
     */
    static final Pattern INVALID_XML_CHARS = Pattern
            .compile("[^\\u0009\\u000A\\u000D\\u0020-\\uD7FF\\uE000-\\uFFFD\uD800\uDC00-\uDBFF\uDFFF]");

    /**
     * Matches a decimal numeric character reference; group 1 holds the digits.
     */
    static final Pattern XML_ENTITY = Pattern.compile("&#([0-9]+);");

    /** Decimal value of U+FFFD, substituted for every invalid reference. */
    private static final String REPLACEMENT_CODE_POINT = "65533";

    /**
     * Replaces every character of a plain text value that is not allowed in
     * XML 1.0 with U+FFFD.
     *
     * @param s
     *            the text to clean, may be {@code null}
     * @return the cleaned text, or {@code null} when {@code s} is {@code null}
     */
    public static String sanitizeText(String s) {
        if (s == null) {
            return null;
        }
        return INVALID_XML_CHARS.matcher(s).replaceAll("\uFFFD");
    }

    /**
     * Rewrites decimal numeric character references ({@code &#NNN;}) that
     * designate a code point not allowed in XML 1.0 so that they reference
     * U+FFFD ({@code &#65533;}) instead.
     * <p>
     * Valid references are copied through with their original digits. A
     * reference whose number does not fit in an {@code int} is treated as
     * invalid. Hexadecimal references ({@code &#xHHH;}) and all other content
     * are left untouched.
     *
     * @param xml
     *            the XML string to clean, may be {@code null}
     * @return the cleaned XML, or {@code null} when {@code xml} is
     *         {@code null}
     */
    public static String sanitizeXml(String xml) {
        if (xml == null) {
            return null;
        }
        Matcher m = XML_ENTITY.matcher(xml);
        // StringBuffer is kept on purpose: the StringBuilder overload of
        // appendReplacement only exists from Java 9 onwards.
        StringBuffer sb = new StringBuffer(xml.length());
        while (m.find()) {
            String entityValue = m.group(1);
            if (!isValidXmlCodePoint(parseCodePoint(entityValue))) {
                entityValue = REPLACEMENT_CODE_POINT;
            }
            // The replacement holds only "&#", digits and ";", so it needs no
            // escaping for appendReplacement.
            m.appendReplacement(sb, "&#" + entityValue + ";");
        }
        m.appendTail(sb);

        return sb.toString();
    }

    /**
     * Parses the digits of a character reference.
     *
     * @return the parsed value, or {@code -1} when it exceeds the {@code int}
     *         range (and therefore cannot be a valid code point)
     */
    private static int parseCodePoint(String digits) {
        try {
            return Integer.parseInt(digits);
        } catch (NumberFormatException tooLarge) {
            // The regex guarantees ASCII digits only, so the sole cause is
            // overflow.
            return -1;
        }
    }

    /**
     * Tells whether a code point belongs to the XML 1.0 {@code Char}
     * production.
     */
    private static boolean isValidXmlCodePoint(int codePoint) {
        return codePoint == 0x9
                || codePoint == 0xA
                || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }
}