View Javadoc

1   /*
2    * Copyright 2010-2013 Capgemini
3    * Licensed under the Apache License, Version 2.0 (the "License"); 
4    * you may not use this file except in compliance with the License. 
5    * You may obtain a copy of the License at 
6    * 
7    * http://www.apache.org/licenses/LICENSE-2.0 
8    * 
9    * Unless required by applicable law or agreed to in writing, software 
10   * distributed under the License is distributed on an "AS IS" BASIS, 
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12   * See the License for the specific language governing permissions and 
13   * limitations under the License. 
14   * 
15   */
16  package org.xmlfield.core.impl.dom.cleanup;
17  
18  import java.io.IOException;
19  import java.io.InputStream;
20  
/**
 * An input stream which removes invalid numeric character references (such as
 * <code>&amp;#0;</code> or <code>&amp;#x1F;</code>) from an XML input stream.
 * Invalid references are replaced by a reference to the 'unknown character'
 * (<code>&amp;#65533;</code>, U+FFFD).
 * <p>
 * A reference is considered valid when its value matches the XML 1.0
 * <code>Char</code> production: #x9, #xA, #xD, #x20-#xD7FF, #xE000-#xFFFD and
 * #x10000-#x10FFFF. Both decimal (<code>&amp;#NNN;</code>) and hexadecimal
 * (<code>&amp;#xHHH;</code>) references are checked.
 * <p>
 * This input stream does not try to understand encoding and processes only
 * ASCII values. As a result, it should work with UTF-8 and ASCII streams but
 * will not work with UTF-16 streams.
 * <p>
 * Known limitations: a reference longer than {@link #MAX_ENTITY_SIZE} bytes,
 * or a reference whose body is not a plain number (for instance because
 * another <code>&amp;</code> appears before the closing <code>;</code>), is
 * passed through unchanged.
 * 
 * @author Nicolas Richeton
 * 
 */
public class EntitySanitizingInputStream extends InputStream {
	/**
	 * Maximum length for read ahead.
	 */
	static int MAX_ENTITY_SIZE = 15;
	/**
	 * the generic unknown entity buffer.
	 */
	static int[] readHeadUndefined = new int[] { '&', '#', '6', '5', '5', '3',
			'3', ';' };

	/**
	 * Current buffer. Contains null if no character has been read ahead of the
	 * stream.
	 */
	int[] readAheadBuffer = null;
	/**
	 * Number of valid values in {@link #readAheadBuffer}; 0 when nothing is
	 * buffered.
	 */
	int readAheadCount = 0;
	/**
	 * Index of the next value of {@link #readAheadBuffer} to return.
	 */
	int readAheadPosition = 0;

	private final InputStream wrappedInputstream;

	/**
	 * Wrap an input stream with XML entity sanitizing.
	 * 
	 * @param xmlStream
	 *            The original xml input stream
	 */
	public EntitySanitizingInputStream(InputStream xmlStream) {
		this.wrappedInputstream = xmlStream;
	}

	/**
	 * Returns the number of values still pending in the read-ahead buffer plus
	 * the estimate reported by the wrapped stream.
	 * 
	 * @return an estimate of the number of bytes that can be read
	 * @throws IOException
	 *             if the wrapped stream fails
	 */
	@Override
	public int available() throws IOException {
		// When nothing is buffered both counters are 0, so this reduces to
		// the wrapped stream's own estimate.
		return readAheadCount - readAheadPosition
				+ wrappedInputstream.available();
	}

	/**
	 * Closes the wrapped stream.
	 * 
	 * @throws IOException
	 *             if the wrapped stream fails to close
	 */
	@Override
	public void close() throws IOException {
		wrappedInputstream.close();
	}

	/**
	 * Reads the next byte. When a <code>&amp;</code> is met, the stream reads
	 * ahead up to the end of the entity (or up to {@link #MAX_ENTITY_SIZE}
	 * bytes), replaces the entity if it is an invalid numeric character
	 * reference, and then serves the buffered bytes one at a time.
	 * 
	 * @return the next byte, or -1 at the end of the stream
	 * @throws IOException
	 *             if the wrapped stream fails
	 */
	@Override
	public int read() throws IOException {
		if (readAheadCount == 0) {
			int current = wrappedInputstream.read();
			if (current != '&') {
				return current;
			}
			// Beginning of an entity: buffer it before returning anything.
			readAhead();
		}
		return nextBuffered();
	}

	/**
	 * Resets the wrapped stream and discards any data read ahead, since the
	 * buffered values no longer match the position of the wrapped stream.
	 * This class does not override <code>mark</code>; the position restored
	 * is whatever the wrapped stream's own <code>reset()</code> provides.
	 * 
	 * @throws IOException
	 *             if the wrapped stream cannot be reset
	 */
	@Override
	public synchronized void reset() throws IOException {
		wrappedInputstream.reset();
		clearReadAhead();
	}

	/**
	 * Returns the next value of the read-ahead buffer and releases the buffer
	 * once its last value has been returned.
	 */
	private int nextBuffered() {
		int current = readAheadBuffer[readAheadPosition];
		readAheadPosition++;
		if (readAheadPosition >= readAheadCount) {
			clearReadAhead();
		}
		return current;
	}

	/**
	 * Drops the read-ahead buffer and returns to the pass-through behavior.
	 */
	private void clearReadAhead() {
		readAheadBuffer = null;
		readAheadCount = 0;
		readAheadPosition = 0;
	}

	/**
	 * Fills the read-ahead buffer after a <code>&amp;</code> has been read
	 * from the wrapped stream, and replaces its content by the 'unknown
	 * character' reference when it holds an invalid numeric reference.
	 */
	private void readAhead() throws IOException {
		readAheadBuffer = new int[MAX_ENTITY_SIZE];
		readAheadBuffer[0] = '&';
		readAheadCount = 1;
		readAheadPosition = 0;

		// Read until buffer is full / no more data available / end of the
		// entity. An end-of-stream marker (-1) is stored like any other value
		// so that it is returned in order.
		int current = 0;
		while (readAheadCount < readAheadBuffer.length && current != -1
				&& current != ';') {
			current = wrappedInputstream.read();
			readAheadBuffer[readAheadCount] = current;
			readAheadCount++;

			// Not a numeric reference ? -> stop reading, output as is.
			if (readAheadBuffer[1] != '#') {
				return;
			}
		}

		// A complete "&#...;" with a non-empty body was buffered.
		if (readAheadCount > 3 && current == ';'
				&& isInvalidNumericReference(readAheadBuffer, readAheadCount)) {
			// Copy so the shared static array is never exposed as instance
			// state.
			readAheadBuffer = readHeadUndefined.clone();
			readAheadCount = readAheadBuffer.length;
		}
	}

	/**
	 * Tells whether the buffer holds a numeric character reference whose value
	 * is not an allowed XML character.
	 * 
	 * @param buffer
	 *            buffer starting with <code>&amp;#</code> and whose last valid
	 *            value is <code>;</code>
	 * @param count
	 *            number of valid values in the buffer
	 * @return true if the reference is a number outside the allowed ranges;
	 *         false if it is valid or if its body is not a number (the XML is
	 *         then probably invalid and is left untouched)
	 */
	private static boolean isInvalidNumericReference(int[] buffer, int count) {
		int start = 2;
		int radix = 10;
		// "&#x...;" is the hexadecimal form.
		if (buffer[start] == 'x') {
			start++;
			radix = 16;
		}

		StringBuilder digits = new StringBuilder();
		for (int i = start; i < count - 1; i++) {
			digits.append((char) buffer[i]);
		}

		try {
			// Parsed as long so that values overflowing an int are detected
			// as out of range instead of being reported as "not a number".
			return !isXmlChar(Long.parseLong(digits.toString(), radix));
		} catch (NumberFormatException ignored) {
			// this was not a number, the XML is probably invalid.
			// Nothing to do, the buffer is output unchanged.
			return false;
		}
	}

	/**
	 * Tells whether a code point matches the XML 1.0 <code>Char</code>
	 * production.
	 */
	private static boolean isXmlChar(long codePoint) {
		return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD
				|| (codePoint >= 0x20 && codePoint <= 0xD7FF)
				|| (codePoint >= 0xE000 && codePoint <= 0xFFFD)
				|| (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
	}

}