jdk-sandbox: jaxp/src/java.xml/share/classes/com/sun/org/apache/xerces/internal/xinclude/XIncludeTextReader.java@4a6b2e733c0d (annotated)

6 7f561c08de6b Initial load duke parents: diff changeset	1	/*
7f561c08de6b Initial load duke parents: diff changeset	2	* reserved comment block
7f561c08de6b Initial load duke parents: diff changeset	3	* DO NOT REMOVE OR ALTER!
7f561c08de6b Initial load duke parents: diff changeset	4	*/
7f561c08de6b Initial load duke parents: diff changeset	5	/*
7f561c08de6b Initial load duke parents: diff changeset	6	* Copyright 2003-2005 The Apache Software Foundation.
7f561c08de6b Initial load duke parents: diff changeset	7	*
7f561c08de6b Initial load duke parents: diff changeset	8	* Licensed under the Apache License, Version 2.0 (the "License");
7f561c08de6b Initial load duke parents: diff changeset	9	* you may not use this file except in compliance with the License.
7f561c08de6b Initial load duke parents: diff changeset	10	* You may obtain a copy of the License at
7f561c08de6b Initial load duke parents: diff changeset	11	*
7f561c08de6b Initial load duke parents: diff changeset	12	* http://www.apache.org/licenses/LICENSE-2.0
7f561c08de6b Initial load duke parents: diff changeset	13	*
7f561c08de6b Initial load duke parents: diff changeset	14	* Unless required by applicable law or agreed to in writing, software
7f561c08de6b Initial load duke parents: diff changeset	15	* distributed under the License is distributed on an "AS IS" BASIS,
7f561c08de6b Initial load duke parents: diff changeset	16	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
7f561c08de6b Initial load duke parents: diff changeset	17	* See the License for the specific language governing permissions and
7f561c08de6b Initial load duke parents: diff changeset	18	* limitations under the License.
7f561c08de6b Initial load duke parents: diff changeset	19	*/
7f561c08de6b Initial load duke parents: diff changeset	20	package com.sun.org.apache.xerces.internal.xinclude;
7f561c08de6b Initial load duke parents: diff changeset	21
7f561c08de6b Initial load duke parents: diff changeset	22	import java.io.BufferedInputStream;
7f561c08de6b Initial load duke parents: diff changeset	23	import java.io.IOException;
7f561c08de6b Initial load duke parents: diff changeset	24	import java.io.InputStream;
7f561c08de6b Initial load duke parents: diff changeset	25	import java.io.InputStreamReader;
7f561c08de6b Initial load duke parents: diff changeset	26	import java.io.Reader;
7f561c08de6b Initial load duke parents: diff changeset	27	import java.net.HttpURLConnection;
7f561c08de6b Initial load duke parents: diff changeset	28	import java.net.URL;
7f561c08de6b Initial load duke parents: diff changeset	29	import java.net.URLConnection;
7f561c08de6b Initial load duke parents: diff changeset	30	import java.util.Iterator;
7f561c08de6b Initial load duke parents: diff changeset	31	import java.util.Locale;
7f561c08de6b Initial load duke parents: diff changeset	32	import java.util.Map;
7f561c08de6b Initial load duke parents: diff changeset	33
7f561c08de6b Initial load duke parents: diff changeset	34	import com.sun.org.apache.xerces.internal.impl.XMLEntityManager;
7f561c08de6b Initial load duke parents: diff changeset	35	import com.sun.org.apache.xerces.internal.impl.XMLErrorReporter;
7f561c08de6b Initial load duke parents: diff changeset	36	import com.sun.org.apache.xerces.internal.impl.io.ASCIIReader;
7f561c08de6b Initial load duke parents: diff changeset	37	import com.sun.org.apache.xerces.internal.impl.io.UTF8Reader;
7f561c08de6b Initial load duke parents: diff changeset	38	import com.sun.org.apache.xerces.internal.impl.msg.XMLMessageFormatter;
7f561c08de6b Initial load duke parents: diff changeset	39	import com.sun.org.apache.xerces.internal.util.EncodingMap;
7f561c08de6b Initial load duke parents: diff changeset	40	import com.sun.org.apache.xerces.internal.util.HTTPInputSource;
7f561c08de6b Initial load duke parents: diff changeset	41	import com.sun.org.apache.xerces.internal.util.MessageFormatter;
7f561c08de6b Initial load duke parents: diff changeset	42	import com.sun.org.apache.xerces.internal.util.XMLChar;
7f561c08de6b Initial load duke parents: diff changeset	43	import com.sun.org.apache.xerces.internal.xni.XMLString;
7f561c08de6b Initial load duke parents: diff changeset	44	import com.sun.org.apache.xerces.internal.xni.parser.XMLInputSource;
7f561c08de6b Initial load duke parents: diff changeset	45
7f561c08de6b Initial load duke parents: diff changeset	46	/**
7f561c08de6b Initial load duke parents: diff changeset	47	* This class is used for reading resources requested in {@code <include>} elements,
7f561c08de6b Initial load duke parents: diff changeset	48	* when the parse attribute of the {@code <include>} element is "text". Using this
7f561c08de6b Initial load duke parents: diff changeset	49	* class will open the location, detect the encoding, and discard the byte order
7f561c08de6b Initial load duke parents: diff changeset	50	* mark, if applicable.
7f561c08de6b Initial load duke parents: diff changeset	51	*
7f561c08de6b Initial load duke parents: diff changeset	52	* REVISIT:
7f561c08de6b Initial load duke parents: diff changeset	53	* Much of the code in this class is taken from XMLEntityManager. It would be nice
7f561c08de6b Initial load duke parents: diff changeset	54	* if this code could be shared in some way. However, since XMLEntityManager is used
7f561c08de6b Initial load duke parents: diff changeset	55	* for reading files as XML, and this needs to read files as text, there would need
7f561c08de6b Initial load duke parents: diff changeset	56	* to be some refactoring done.
7f561c08de6b Initial load duke parents: diff changeset	57	*
7f561c08de6b Initial load duke parents: diff changeset	58	* @author Michael Glavassevich, IBM
7f561c08de6b Initial load duke parents: diff changeset	59	* @author Peter McCracken, IBM
7f561c08de6b Initial load duke parents: diff changeset	60	* @author Ankit Pasricha, IBM
7f561c08de6b Initial load duke parents: diff changeset	61	* @author Arun Yadav, Sun Microsystems Inc.
7f561c08de6b Initial load duke parents: diff changeset	62	*
7f561c08de6b Initial load duke parents: diff changeset	63	*
7f561c08de6b Initial load duke parents: diff changeset	64	* @see XIncludeHandler
7f561c08de6b Initial load duke parents: diff changeset	65	*/
7f561c08de6b Initial load duke parents: diff changeset	66	public class XIncludeTextReader {
7f561c08de6b Initial load duke parents: diff changeset	67
// Reader field; it is not assigned anywhere in the part of the class visible here.
7f561c08de6b Initial load duke parents: diff changeset	68	private Reader fReader;
// Handler passed to the constructor and stored as is.
7f561c08de6b Initial load duke parents: diff changeset	69	private XIncludeHandler fHandler;
// Input source passed to the constructor and stored as is.
7f561c08de6b Initial load duke parents: diff changeset	70	private XMLInputSource fSource;
// Set through setErrorReporter(); getReader() asks it for the message formatter and locale.
7f561c08de6b Initial load duke parents: diff changeset	71	private XMLErrorReporter fErrorReporter;
// The constructor replaces this initial value with a buffer of bufferSize + 1 chars;
// getReader() uses the buffer length to size its streams and readers.
7f561c08de6b Initial load duke parents: diff changeset	72	private XMLString fTempString = new XMLString();
7f561c08de6b Initial load duke parents: diff changeset	73
7f561c08de6b Initial load duke parents: diff changeset	74	/**
7f561c08de6b Initial load duke parents: diff changeset	75	* Construct the XIncludeReader using the XMLInputSource and XIncludeHandler.
7f561c08de6b Initial load duke parents: diff changeset	76	*
7f561c08de6b Initial load duke parents: diff changeset	77	* @param source The XMLInputSource to use.
7f561c08de6b Initial load duke parents: diff changeset	78	* @param handler The XIncludeHandler to use.
7f561c08de6b Initial load duke parents: diff changeset	79	* @param bufferSize The size of this text reader's buffer.
7f561c08de6b Initial load duke parents: diff changeset	80	*/
7f561c08de6b Initial load duke parents: diff changeset	81	public XIncludeTextReader(XMLInputSource source, XIncludeHandler handler, int bufferSize)
7f561c08de6b Initial load duke parents: diff changeset	82	throws IOException {
7f561c08de6b Initial load duke parents: diff changeset	83	fHandler = handler;
7f561c08de6b Initial load duke parents: diff changeset	84	fSource = source;
7f561c08de6b Initial load duke parents: diff changeset	85	fTempString = new XMLString(new char[bufferSize + 1], 0, 0);
7f561c08de6b Initial load duke parents: diff changeset	86	}
7f561c08de6b Initial load duke parents: diff changeset	87
7f561c08de6b Initial load duke parents: diff changeset	88	/**
7f561c08de6b Initial load duke parents: diff changeset	89	* Sets the XMLErrorReporter used for reporting errors while
7f561c08de6b Initial load duke parents: diff changeset	90	* reading the text include.
7f561c08de6b Initial load duke parents: diff changeset	91	*
7f561c08de6b Initial load duke parents: diff changeset	92	* @param errorReporter the XMLErrorReporter to be used for
7f561c08de6b Initial load duke parents: diff changeset	93	* reporting errors.
7f561c08de6b Initial load duke parents: diff changeset	94	*/
7f561c08de6b Initial load duke parents: diff changeset	95	public void setErrorReporter(XMLErrorReporter errorReporter) {
7f561c08de6b Initial load duke parents: diff changeset	96	fErrorReporter = errorReporter;
7f561c08de6b Initial load duke parents: diff changeset	97	}
7f561c08de6b Initial load duke parents: diff changeset	98
7f561c08de6b Initial load duke parents: diff changeset	99	/**
7f561c08de6b Initial load duke parents: diff changeset	100	* Return the Reader for given XMLInputSource.
7f561c08de6b Initial load duke parents: diff changeset	101	*
7f561c08de6b Initial load duke parents: diff changeset	102	* @param source The XMLInputSource to use.
7f561c08de6b Initial load duke parents: diff changeset	103	*/
7f561c08de6b Initial load duke parents: diff changeset	104	protected Reader getReader(XMLInputSource source) throws IOException {
7f561c08de6b Initial load duke parents: diff changeset	105	if (source.getCharacterStream() != null) {
7f561c08de6b Initial load duke parents: diff changeset	106	return source.getCharacterStream();
7f561c08de6b Initial load duke parents: diff changeset	107	}
7f561c08de6b Initial load duke parents: diff changeset	108	else {
7f561c08de6b Initial load duke parents: diff changeset	109	InputStream stream = null;
7f561c08de6b Initial load duke parents: diff changeset	110
7f561c08de6b Initial load duke parents: diff changeset	111	String encoding = source.getEncoding();
7f561c08de6b Initial load duke parents: diff changeset	112	if (encoding == null) {
7f561c08de6b Initial load duke parents: diff changeset	113	encoding = "UTF-8";
7f561c08de6b Initial load duke parents: diff changeset	114	}
7f561c08de6b Initial load duke parents: diff changeset	115	if (source.getByteStream() != null) {
7f561c08de6b Initial load duke parents: diff changeset	116	stream = source.getByteStream();
7f561c08de6b Initial load duke parents: diff changeset	117	// Wrap the InputStream so that it is possible to rewind it.
7f561c08de6b Initial load duke parents: diff changeset	118	if (!(stream instanceof BufferedInputStream)) {
7f561c08de6b Initial load duke parents: diff changeset	119	stream = new BufferedInputStream(stream, fTempString.ch.length);
7f561c08de6b Initial load duke parents: diff changeset	120	}
7f561c08de6b Initial load duke parents: diff changeset	121	}
7f561c08de6b Initial load duke parents: diff changeset	122	else {
7f561c08de6b Initial load duke parents: diff changeset	123	String expandedSystemId = XMLEntityManager.expandSystemId(source.getSystemId(), source.getBaseSystemId(), false);
7f561c08de6b Initial load duke parents: diff changeset	124
7f561c08de6b Initial load duke parents: diff changeset	125	URL url = new URL(expandedSystemId);
7f561c08de6b Initial load duke parents: diff changeset	126	URLConnection urlCon = url.openConnection();
7f561c08de6b Initial load duke parents: diff changeset	127
7f561c08de6b Initial load duke parents: diff changeset	128	// If this is an HTTP connection attach any request properties to the request.
7f561c08de6b Initial load duke parents: diff changeset	129	if (urlCon instanceof HttpURLConnection && source instanceof HTTPInputSource) {
7f561c08de6b Initial load duke parents: diff changeset	130	final HttpURLConnection urlConnection = (HttpURLConnection) urlCon;
7f561c08de6b Initial load duke parents: diff changeset	131	final HTTPInputSource httpInputSource = (HTTPInputSource) source;
7f561c08de6b Initial load duke parents: diff changeset	132
7f561c08de6b Initial load duke parents: diff changeset	133	// set request properties
7f561c08de6b Initial load duke parents: diff changeset	134	Iterator propIter = httpInputSource.getHTTPRequestProperties();
7f561c08de6b Initial load duke parents: diff changeset	135	while (propIter.hasNext()) {
7f561c08de6b Initial load duke parents: diff changeset	136	Map.Entry entry = (Map.Entry) propIter.next();
7f561c08de6b Initial load duke parents: diff changeset	137	urlConnection.setRequestProperty((String) entry.getKey(), (String) entry.getValue());
7f561c08de6b Initial load duke parents: diff changeset	138	}
7f561c08de6b Initial load duke parents: diff changeset	139
7f561c08de6b Initial load duke parents: diff changeset	140	// set preference for redirection
7f561c08de6b Initial load duke parents: diff changeset	141	boolean followRedirects = httpInputSource.getFollowHTTPRedirects();
7f561c08de6b Initial load duke parents: diff changeset	142	if (!followRedirects) {
31497 4a6b2e733c0d 8130051: Cleanup usage of reflection in jaxp dfuchs parents: 25868 diff changeset	143	urlConnection.setInstanceFollowRedirects(followRedirects);
6 7f561c08de6b Initial load duke parents: diff changeset	144	}
7f561c08de6b Initial load duke parents: diff changeset	145	}
7f561c08de6b Initial load duke parents: diff changeset	146
7f561c08de6b Initial load duke parents: diff changeset	147	// Wrap the InputStream so that it is possible to rewind it.
7f561c08de6b Initial load duke parents: diff changeset	148	stream = new BufferedInputStream(urlCon.getInputStream());
7f561c08de6b Initial load duke parents: diff changeset	149
7f561c08de6b Initial load duke parents: diff changeset	150	// content type will be string like "text/xml; charset=UTF-8" or "text/xml"
7f561c08de6b Initial load duke parents: diff changeset	151	String rawContentType = urlCon.getContentType();
7f561c08de6b Initial load duke parents: diff changeset	152
7f561c08de6b Initial load duke parents: diff changeset	153	// text/xml and application/xml offer only one optional parameter
7f561c08de6b Initial load duke parents: diff changeset	154	int index = (rawContentType != null) ? rawContentType.indexOf(';') : -1;
7f561c08de6b Initial load duke parents: diff changeset	155
7f561c08de6b Initial load duke parents: diff changeset	156	String contentType = null;
7f561c08de6b Initial load duke parents: diff changeset	157	String charset = null;
7f561c08de6b Initial load duke parents: diff changeset	158	if (index != -1) {
7f561c08de6b Initial load duke parents: diff changeset	159	// this should be something like "text/xml"
7f561c08de6b Initial load duke parents: diff changeset	160	contentType = rawContentType.substring(0, index).trim();
7f561c08de6b Initial load duke parents: diff changeset	161
7f561c08de6b Initial load duke parents: diff changeset	162	// this should be something like "charset=UTF-8", but we want to
7f561c08de6b Initial load duke parents: diff changeset	163	// strip it down to just "UTF-8"
7f561c08de6b Initial load duke parents: diff changeset	164	charset = rawContentType.substring(index + 1).trim();
7f561c08de6b Initial load duke parents: diff changeset	165	if (charset.startsWith("charset=")) {
7f561c08de6b Initial load duke parents: diff changeset	166	// 8 is the length of "charset="
7f561c08de6b Initial load duke parents: diff changeset	167	charset = charset.substring(8).trim();
7f561c08de6b Initial load duke parents: diff changeset	168	// strip quotes, if present
7f561c08de6b Initial load duke parents: diff changeset	169	if ((charset.charAt(0) == '"'
7f561c08de6b Initial load duke parents: diff changeset	170	&& charset.charAt(charset.length() - 1) == '"')
7f561c08de6b Initial load duke parents: diff changeset	171	\|\| (charset.charAt(0) == '\''
7f561c08de6b Initial load duke parents: diff changeset	172	&& charset.charAt(charset.length() - 1)
7f561c08de6b Initial load duke parents: diff changeset	173	== '\'')) {
7f561c08de6b Initial load duke parents: diff changeset	174	charset =
7f561c08de6b Initial load duke parents: diff changeset	175	charset.substring(1, charset.length() - 1);
7f561c08de6b Initial load duke parents: diff changeset	176	}
7f561c08de6b Initial load duke parents: diff changeset	177	}
7f561c08de6b Initial load duke parents: diff changeset	178	else {
7f561c08de6b Initial load duke parents: diff changeset	179	charset = null;
7f561c08de6b Initial load duke parents: diff changeset	180	}
7f561c08de6b Initial load duke parents: diff changeset	181	}
7f561c08de6b Initial load duke parents: diff changeset	182	else {
7f561c08de6b Initial load duke parents: diff changeset	183	contentType = rawContentType.trim();
7f561c08de6b Initial load duke parents: diff changeset	184	}
7f561c08de6b Initial load duke parents: diff changeset	185
7f561c08de6b Initial load duke parents: diff changeset	186	String detectedEncoding = null;
7f561c08de6b Initial load duke parents: diff changeset	187	/** The encoding of such a resource is determined by:
7f561c08de6b Initial load duke parents: diff changeset	188	1 external encoding information, if available, otherwise
7f561c08de6b Initial load duke parents: diff changeset	189	-- the most common type of external information is the "charset" parameter of a MIME package
7f561c08de6b Initial load duke parents: diff changeset	190	2 if the media type of the resource is text/xml, application/xml, or matches the conventions text/+xml or application/+xml as described in XML Media Types [IETF RFC 3023], the encoding is recognized as specified in XML 1.0, otherwise
7f561c08de6b Initial load duke parents: diff changeset	191	3 the value of the encoding attribute if one exists, otherwise
7f561c08de6b Initial load duke parents: diff changeset	192	4 UTF-8.
7f561c08de6b Initial load duke parents: diff changeset	193	**/
7f561c08de6b Initial load duke parents: diff changeset	194	if (contentType.equals("text/xml")) {
7f561c08de6b Initial load duke parents: diff changeset	195	if (charset != null) {
7f561c08de6b Initial load duke parents: diff changeset	196	detectedEncoding = charset;
7f561c08de6b Initial load duke parents: diff changeset	197	}
7f561c08de6b Initial load duke parents: diff changeset	198	else {
7f561c08de6b Initial load duke parents: diff changeset	199	// see RFC2376 or 3023, section 3.1
7f561c08de6b Initial load duke parents: diff changeset	200	detectedEncoding = "US-ASCII";
7f561c08de6b Initial load duke parents: diff changeset	201	}
7f561c08de6b Initial load duke parents: diff changeset	202	}
7f561c08de6b Initial load duke parents: diff changeset	203	else if (contentType.equals("application/xml")) {
7f561c08de6b Initial load duke parents: diff changeset	204	if (charset != null) {
7f561c08de6b Initial load duke parents: diff changeset	205	detectedEncoding = charset;
7f561c08de6b Initial load duke parents: diff changeset	206	}
7f561c08de6b Initial load duke parents: diff changeset	207	else {
7f561c08de6b Initial load duke parents: diff changeset	208	// see RFC2376 or 3023, section 3.2
7f561c08de6b Initial load duke parents: diff changeset	209	detectedEncoding = getEncodingName(stream);
7f561c08de6b Initial load duke parents: diff changeset	210	}
7f561c08de6b Initial load duke parents: diff changeset	211	}
7f561c08de6b Initial load duke parents: diff changeset	212	else if (contentType.endsWith("+xml")) {
7f561c08de6b Initial load duke parents: diff changeset	213	detectedEncoding = getEncodingName(stream);
7f561c08de6b Initial load duke parents: diff changeset	214	}
7f561c08de6b Initial load duke parents: diff changeset	215
7f561c08de6b Initial load duke parents: diff changeset	216	if (detectedEncoding != null) {
7f561c08de6b Initial load duke parents: diff changeset	217	encoding = detectedEncoding;
7f561c08de6b Initial load duke parents: diff changeset	218	}
7f561c08de6b Initial load duke parents: diff changeset	219	// else 3 or 4.
7f561c08de6b Initial load duke parents: diff changeset	220	}
7f561c08de6b Initial load duke parents: diff changeset	221
7f561c08de6b Initial load duke parents: diff changeset	222	encoding = encoding.toUpperCase(Locale.ENGLISH);
7f561c08de6b Initial load duke parents: diff changeset	223
7f561c08de6b Initial load duke parents: diff changeset	224	// eat the Byte Order Mark
7f561c08de6b Initial load duke parents: diff changeset	225	encoding = consumeBOM(stream, encoding);
7f561c08de6b Initial load duke parents: diff changeset	226
7f561c08de6b Initial load duke parents: diff changeset	227	// If the document is UTF-8 or US-ASCII use
7f561c08de6b Initial load duke parents: diff changeset	228	// the Xerces readers for these encodings. For
7f561c08de6b Initial load duke parents: diff changeset	229	// US-ASCII consult the encoding map since
7f561c08de6b Initial load duke parents: diff changeset	230	// this encoding has many aliases.
7f561c08de6b Initial load duke parents: diff changeset	231	if (encoding.equals("UTF-8")) {
7f561c08de6b Initial load duke parents: diff changeset	232	return new UTF8Reader(stream,
7f561c08de6b Initial load duke parents: diff changeset	233	fTempString.ch.length,
7f561c08de6b Initial load duke parents: diff changeset	234	fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
7f561c08de6b Initial load duke parents: diff changeset	235	fErrorReporter.getLocale() );
7f561c08de6b Initial load duke parents: diff changeset	236	}
7f561c08de6b Initial load duke parents: diff changeset	237
7f561c08de6b Initial load duke parents: diff changeset	238	// Try to use a Java reader.
7f561c08de6b Initial load duke parents: diff changeset	239	String javaEncoding = EncodingMap.getIANA2JavaMapping(encoding);
7f561c08de6b Initial load duke parents: diff changeset	240
7f561c08de6b Initial load duke parents: diff changeset	241	// If the specified encoding wasn't a recognized IANA encoding throw an IOException.
7f561c08de6b Initial load duke parents: diff changeset	242	// The XIncludeHandler will report this as a ResourceError and then will
7f561c08de6b Initial load duke parents: diff changeset	243	// attempt to include a fallback if there is one.
7f561c08de6b Initial load duke parents: diff changeset	244	if (javaEncoding == null) {
7f561c08de6b Initial load duke parents: diff changeset	245	MessageFormatter aFormatter =
7f561c08de6b Initial load duke parents: diff changeset	246	fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN);
7f561c08de6b Initial load duke parents: diff changeset	247	Locale aLocale = fErrorReporter.getLocale();
7f561c08de6b Initial load duke parents: diff changeset	248	throw new IOException( aFormatter.formatMessage( aLocale,
7f561c08de6b Initial load duke parents: diff changeset	249	"EncodingDeclInvalid",
7f561c08de6b Initial load duke parents: diff changeset	250	new Object[] {encoding} ) );
7f561c08de6b Initial load duke parents: diff changeset	251	}
7f561c08de6b Initial load duke parents: diff changeset	252	else if (javaEncoding.equals("ASCII")) {
7f561c08de6b Initial load duke parents: diff changeset	253	return new ASCIIReader(stream,
7f561c08de6b Initial load duke parents: diff changeset	254	fTempString.ch.length,
7f561c08de6b Initial load duke parents: diff changeset	255	fErrorReporter.getMessageFormatter(XMLMessageFormatter.XML_DOMAIN),
7f561c08de6b Initial load duke parents: diff changeset	256	fErrorReporter.getLocale() );
7f561c08de6b Initial load duke parents: diff changeset	257	}
7f561c08de6b Initial load duke parents: diff changeset	258
7f561c08de6b Initial load duke parents: diff changeset	259	return new InputStreamReader(stream, javaEncoding);
7f561c08de6b Initial load duke parents: diff changeset	260	}
7f561c08de6b Initial load duke parents: diff changeset	261	}
7f561c08de6b Initial load duke parents: diff changeset	262
7f561c08de6b Initial load duke parents: diff changeset	263	/**
7f561c08de6b Initial load duke parents: diff changeset	264	* XMLEntityManager cares about endian-ness, since it creates its own optimized
7f561c08de6b Initial load duke parents: diff changeset	265	* readers. Since we're just using generic Java readers for now, we're not caring
7f561c08de6b Initial load duke parents: diff changeset	266	* about endian-ness. If this changes, even more code needs to be copied from
7f561c08de6b Initial load duke parents: diff changeset	267	* XMLEntity manager. -- PJM
7f561c08de6b Initial load duke parents: diff changeset	268	*/
7f561c08de6b Initial load duke parents: diff changeset	269	protected String getEncodingName(InputStream stream) throws IOException {
7f561c08de6b Initial load duke parents: diff changeset	270	final byte[] b4 = new byte[4];
7f561c08de6b Initial load duke parents: diff changeset	271	String encoding = null;
7f561c08de6b Initial load duke parents: diff changeset	272
7f561c08de6b Initial load duke parents: diff changeset	273	// this has the potential to throw an exception
7f561c08de6b Initial load duke parents: diff changeset	274	// it will be fixed when we ensure the stream is rewindable (see note above)
7f561c08de6b Initial load duke parents: diff changeset	275	stream.mark(4);
7f561c08de6b Initial load duke parents: diff changeset	276	int count = stream.read(b4, 0, 4);
7f561c08de6b Initial load duke parents: diff changeset	277	stream.reset();
7f561c08de6b Initial load duke parents: diff changeset	278	if (count == 4) {
7f561c08de6b Initial load duke parents: diff changeset	279	encoding = getEncodingName(b4);
7f561c08de6b Initial load duke parents: diff changeset	280	}
7f561c08de6b Initial load duke parents: diff changeset	281
7f561c08de6b Initial load duke parents: diff changeset	282	return encoding;
7f561c08de6b Initial load duke parents: diff changeset	283	}
7f561c08de6b Initial load duke parents: diff changeset	284
7f561c08de6b Initial load duke parents: diff changeset	285	/**
7f561c08de6b Initial load duke parents: diff changeset	286	* Removes the byte order mark from the stream, if
7f561c08de6b Initial load duke parents: diff changeset	287	* it exists and returns the encoding name.
7f561c08de6b Initial load duke parents: diff changeset	288	*
7f561c08de6b Initial load duke parents: diff changeset	289	* @param stream
7f561c08de6b Initial load duke parents: diff changeset	290	* @param encoding
7f561c08de6b Initial load duke parents: diff changeset	291	* @throws IOException
7f561c08de6b Initial load duke parents: diff changeset	292	*/
7f561c08de6b Initial load duke parents: diff changeset	293	protected String consumeBOM(InputStream stream, String encoding)
7f561c08de6b Initial load duke parents: diff changeset	294	throws IOException {
7f561c08de6b Initial load duke parents: diff changeset	295
7f561c08de6b Initial load duke parents: diff changeset	296	byte[] b = new byte[3];
7f561c08de6b Initial load duke parents: diff changeset	297	int count = 0;
7f561c08de6b Initial load duke parents: diff changeset	298	stream.mark(3);
7f561c08de6b Initial load duke parents: diff changeset	299	if (encoding.equals("UTF-8")) {
7f561c08de6b Initial load duke parents: diff changeset	300	count = stream.read(b, 0, 3);
7f561c08de6b Initial load duke parents: diff changeset	301	if (count == 3) {
7f561c08de6b Initial load duke parents: diff changeset	302	final int b0 = b[0] & 0xFF;
7f561c08de6b Initial load duke parents: diff changeset	303	final int b1 = b[1] & 0xFF;
7f561c08de6b Initial load duke parents: diff changeset	304	final int b2 = b[2] & 0xFF;
7f561c08de6b Initial load duke parents: diff changeset	305	if (b0 != 0xEF \|\| b1 != 0xBB \|\| b2 != 0xBF) {
7f561c08de6b Initial load duke parents: diff changeset	306	// First three bytes are not BOM, so reset.
7f561c08de6b Initial load duke parents: diff changeset	307	stream.reset();
7f561c08de6b Initial load duke parents: diff changeset	308	}
7f561c08de6b Initial load duke parents: diff changeset	309	}
7f561c08de6b Initial load duke parents: diff changeset	310	else {
7f561c08de6b Initial load duke parents: diff changeset	311	stream.reset();
7f561c08de6b Initial load duke parents: diff changeset	312	}
7f561c08de6b Initial load duke parents: diff changeset	313	}
7f561c08de6b Initial load duke parents: diff changeset	314	else if (encoding.startsWith("UTF-16")) {
7f561c08de6b Initial load duke parents: diff changeset	315	count = stream.read(b, 0, 2);
7f561c08de6b Initial load duke parents: diff changeset	316	if (count == 2) {
7f561c08de6b Initial load duke parents: diff changeset	317	final int b0 = b[0] & 0xFF;
7f561c08de6b Initial load duke parents: diff changeset	318	final int b1 = b[1] & 0xFF;
7f561c08de6b Initial load duke parents: diff changeset	319	if (b0 == 0xFE && b1 == 0xFF) {
7f561c08de6b Initial load duke parents: diff changeset	320	return "UTF-16BE";
7f561c08de6b Initial load duke parents: diff changeset	321	}
7f561c08de6b Initial load duke parents: diff changeset	322	else if (b0 == 0xFF && b1 == 0xFE) {
7f561c08de6b Initial load duke parents: diff changeset	323	return "UTF-16LE";
7f561c08de6b Initial load duke parents: diff changeset	324	}
7f561c08de6b Initial load duke parents: diff changeset	325	}
7f561c08de6b Initial load duke parents: diff changeset	326	// First two bytes are not BOM, so reset.
7f561c08de6b Initial load duke parents: diff changeset	327	stream.reset();
7f561c08de6b Initial load duke parents: diff changeset	328	}
7f561c08de6b Initial load duke parents: diff changeset	329	// We could do UTF-32, but since the getEncodingName() doesn't support that
7f561c08de6b Initial load duke parents: diff changeset	330	// we won't support it here.
7f561c08de6b Initial load duke parents: diff changeset	331	// To implement UTF-32, look for: 00 00 FE FF for big-endian
7f561c08de6b Initial load duke parents: diff changeset	332	// or FF FE 00 00 for little-endian
7f561c08de6b Initial load duke parents: diff changeset	333	return encoding;
7f561c08de6b Initial load duke parents: diff changeset	334	}
7f561c08de6b Initial load duke parents: diff changeset	335
7f561c08de6b Initial load duke parents: diff changeset	336	/**
7f561c08de6b Initial load duke parents: diff changeset	337	* REVISIT: This code is taken from com.sun.org.apache.xerces.internal.impl.XMLEntityManager.
7f561c08de6b Initial load duke parents: diff changeset	338	* Is there any way we can share the code, without having it implemented twice?
7f561c08de6b Initial load duke parents: diff changeset	339	* I think we should make it public and static in XMLEntityManager. --PJM
7f561c08de6b Initial load duke parents: diff changeset	340	*
7f561c08de6b Initial load duke parents: diff changeset	341	* Returns the IANA encoding name that is auto-detected from
7f561c08de6b Initial load duke parents: diff changeset	342	* the bytes specified, with the endian-ness of that encoding where appropriate.
7f561c08de6b Initial load duke parents: diff changeset	343	*
7f561c08de6b Initial load duke parents: diff changeset	344	* @param b4 The first four bytes of the input.
7f561c08de6b Initial load duke parents: diff changeset	345	* @return the encoding name, or null if no encoding could be detected
7f561c08de6b Initial load duke parents: diff changeset	346	*/
7f561c08de6b Initial load duke parents: diff changeset	347	protected String getEncodingName(byte[] b4) {
7f561c08de6b Initial load duke parents: diff changeset	348
7f561c08de6b Initial load duke parents: diff changeset	349	// UTF-16, with BOM
7f561c08de6b Initial load duke parents: diff changeset	350	int b0 = b4[0] & 0xFF;
7f561c08de6b Initial load duke parents: diff changeset	351	int b1 = b4[1] & 0xFF;
7f561c08de6b Initial load duke parents: diff changeset	352	if (b0 == 0xFE && b1 == 0xFF) {
7f561c08de6b Initial load duke parents: diff changeset	353	// UTF-16, big-endian
7f561c08de6b Initial load duke parents: diff changeset	354	return "UTF-16BE";
7f561c08de6b Initial load duke parents: diff changeset	355	}
7f561c08de6b Initial load duke parents: diff changeset	356	if (b0 == 0xFF && b1 == 0xFE) {
7f561c08de6b Initial load duke parents: diff changeset	357	// UTF-16, little-endian
7f561c08de6b Initial load duke parents: diff changeset	358	return "UTF-16LE";
7f561c08de6b Initial load duke parents: diff changeset	359	}
7f561c08de6b Initial load duke parents: diff changeset	360
7f561c08de6b Initial load duke parents: diff changeset	361	// UTF-8 with a BOM
7f561c08de6b Initial load duke parents: diff changeset	362	int b2 = b4[2] & 0xFF;
7f561c08de6b Initial load duke parents: diff changeset	363	if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) {
7f561c08de6b Initial load duke parents: diff changeset	364	return "UTF-8";
7f561c08de6b Initial load duke parents: diff changeset	365	}
7f561c08de6b Initial load duke parents: diff changeset	366
7f561c08de6b Initial load duke parents: diff changeset	367	// other encodings
7f561c08de6b Initial load duke parents: diff changeset	368	int b3 = b4[3] & 0xFF;
7f561c08de6b Initial load duke parents: diff changeset	369	if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) {
7f561c08de6b Initial load duke parents: diff changeset	370	// UCS-4, big endian (1234)
7f561c08de6b Initial load duke parents: diff changeset	371	return "ISO-10646-UCS-4";
7f561c08de6b Initial load duke parents: diff changeset	372	}
7f561c08de6b Initial load duke parents: diff changeset	373	if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) {
7f561c08de6b Initial load duke parents: diff changeset	374	// UCS-4, little endian (4321)
7f561c08de6b Initial load duke parents: diff changeset	375	return "ISO-10646-UCS-4";
7f561c08de6b Initial load duke parents: diff changeset	376	}
7f561c08de6b Initial load duke parents: diff changeset	377	if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) {
7f561c08de6b Initial load duke parents: diff changeset	378	// UCS-4, unusual octet order (2143)
7f561c08de6b Initial load duke parents: diff changeset	379	return "ISO-10646-UCS-4";
7f561c08de6b Initial load duke parents: diff changeset	380	}
7f561c08de6b Initial load duke parents: diff changeset	381	if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) {
7f561c08de6b Initial load duke parents: diff changeset	382	// UCS-4, unusual octect order (3412)
7f561c08de6b Initial load duke parents: diff changeset	383	return "ISO-10646-UCS-4";
7f561c08de6b Initial load duke parents: diff changeset	384	}
7f561c08de6b Initial load duke parents: diff changeset	385	if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) {
7f561c08de6b Initial load duke parents: diff changeset	386	// UTF-16, big-endian, no BOM
7f561c08de6b Initial load duke parents: diff changeset	387	// (or could turn out to be UCS-2...
7f561c08de6b Initial load duke parents: diff changeset	388	return "UTF-16BE";
7f561c08de6b Initial load duke parents: diff changeset	389	}
7f561c08de6b Initial load duke parents: diff changeset	390	if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) {
7f561c08de6b Initial load duke parents: diff changeset	391	// UTF-16, little-endian, no BOM
7f561c08de6b Initial load duke parents: diff changeset	392	// (or could turn out to be UCS-2...
7f561c08de6b Initial load duke parents: diff changeset	393	return "UTF-16LE";
7f561c08de6b Initial load duke parents: diff changeset	394	}
7f561c08de6b Initial load duke parents: diff changeset	395	if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) {
7f561c08de6b Initial load duke parents: diff changeset	396	// EBCDIC
7f561c08de6b Initial load duke parents: diff changeset	397	// a la xerces1, return CP037 instead of EBCDIC here
7f561c08de6b Initial load duke parents: diff changeset	398	return "CP037";
7f561c08de6b Initial load duke parents: diff changeset	399	}
7f561c08de6b Initial load duke parents: diff changeset	400
7f561c08de6b Initial load duke parents: diff changeset	401	// this signals us to use the value from the encoding attribute
7f561c08de6b Initial load duke parents: diff changeset	402	return null;
7f561c08de6b Initial load duke parents: diff changeset	403
7f561c08de6b Initial load duke parents: diff changeset	404	} // getEncodingName(byte[]):Object[]
7f561c08de6b Initial load duke parents: diff changeset	405
7f561c08de6b Initial load duke parents: diff changeset	406	/**
7f561c08de6b Initial load duke parents: diff changeset	407	* Read the input stream as text, and pass the text on to the XIncludeHandler
7f561c08de6b Initial load duke parents: diff changeset	408	* using calls to characters(). This will read all of the text it can from the
7f561c08de6b Initial load duke parents: diff changeset	409	* resource.
7f561c08de6b Initial load duke parents: diff changeset	410	*
7f561c08de6b Initial load duke parents: diff changeset	411	* @throws IOException
7f561c08de6b Initial load duke parents: diff changeset	412	*/
7f561c08de6b Initial load duke parents: diff changeset	413	public void parse() throws IOException {
7f561c08de6b Initial load duke parents: diff changeset	414
7f561c08de6b Initial load duke parents: diff changeset	415	fReader = getReader(fSource);
7f561c08de6b Initial load duke parents: diff changeset	416	fSource = null;
7f561c08de6b Initial load duke parents: diff changeset	417	int readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
7f561c08de6b Initial load duke parents: diff changeset	418	while (readSize != -1) {
7f561c08de6b Initial load duke parents: diff changeset	419	for (int i = 0; i < readSize; ++i) {
7f561c08de6b Initial load duke parents: diff changeset	420	char ch = fTempString.ch[i];
7f561c08de6b Initial load duke parents: diff changeset	421	if (!isValid(ch)) {
7f561c08de6b Initial load duke parents: diff changeset	422	if (XMLChar.isHighSurrogate(ch)) {
7f561c08de6b Initial load duke parents: diff changeset	423	int ch2;
7f561c08de6b Initial load duke parents: diff changeset	424	// retrieve next character
7f561c08de6b Initial load duke parents: diff changeset	425	if (++i < readSize) {
7f561c08de6b Initial load duke parents: diff changeset	426	ch2 = fTempString.ch[i];
7f561c08de6b Initial load duke parents: diff changeset	427	}
7f561c08de6b Initial load duke parents: diff changeset	428	// handle rare boundary case
7f561c08de6b Initial load duke parents: diff changeset	429	else {
7f561c08de6b Initial load duke parents: diff changeset	430	ch2 = fReader.read();
7f561c08de6b Initial load duke parents: diff changeset	431	if (ch2 != -1) {
7f561c08de6b Initial load duke parents: diff changeset	432	fTempString.ch[readSize++] = (char) ch2;
7f561c08de6b Initial load duke parents: diff changeset	433	}
7f561c08de6b Initial load duke parents: diff changeset	434	}
7f561c08de6b Initial load duke parents: diff changeset	435	if (XMLChar.isLowSurrogate(ch2)) {
7f561c08de6b Initial load duke parents: diff changeset	436	// convert surrogates to a supplemental character
7f561c08de6b Initial load duke parents: diff changeset	437	int sup = XMLChar.supplemental(ch, (char)ch2);
7f561c08de6b Initial load duke parents: diff changeset	438	if (!isValid(sup)) {
7f561c08de6b Initial load duke parents: diff changeset	439	fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
7f561c08de6b Initial load duke parents: diff changeset	440	"InvalidCharInContent",
7f561c08de6b Initial load duke parents: diff changeset	441	new Object[] { Integer.toString(sup, 16) },
7f561c08de6b Initial load duke parents: diff changeset	442	XMLErrorReporter.SEVERITY_FATAL_ERROR);
7f561c08de6b Initial load duke parents: diff changeset	443	}
7f561c08de6b Initial load duke parents: diff changeset	444	}
7f561c08de6b Initial load duke parents: diff changeset	445	else {
7f561c08de6b Initial load duke parents: diff changeset	446	fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
7f561c08de6b Initial load duke parents: diff changeset	447	"InvalidCharInContent",
7f561c08de6b Initial load duke parents: diff changeset	448	new Object[] { Integer.toString(ch2, 16) },
7f561c08de6b Initial load duke parents: diff changeset	449	XMLErrorReporter.SEVERITY_FATAL_ERROR);
7f561c08de6b Initial load duke parents: diff changeset	450	}
7f561c08de6b Initial load duke parents: diff changeset	451	}
7f561c08de6b Initial load duke parents: diff changeset	452	else {
7f561c08de6b Initial load duke parents: diff changeset	453	fErrorReporter.reportError(XMLMessageFormatter.XML_DOMAIN,
7f561c08de6b Initial load duke parents: diff changeset	454	"InvalidCharInContent",
7f561c08de6b Initial load duke parents: diff changeset	455	new Object[] { Integer.toString(ch, 16) },
7f561c08de6b Initial load duke parents: diff changeset	456	XMLErrorReporter.SEVERITY_FATAL_ERROR);
7f561c08de6b Initial load duke parents: diff changeset	457	}
7f561c08de6b Initial load duke parents: diff changeset	458	}
7f561c08de6b Initial load duke parents: diff changeset	459	}
7f561c08de6b Initial load duke parents: diff changeset	460	if (fHandler != null && readSize > 0) {
7f561c08de6b Initial load duke parents: diff changeset	461	fTempString.offset = 0;
7f561c08de6b Initial load duke parents: diff changeset	462	fTempString.length = readSize;
7f561c08de6b Initial load duke parents: diff changeset	463	fHandler.characters(
7f561c08de6b Initial load duke parents: diff changeset	464	fTempString,
7f561c08de6b Initial load duke parents: diff changeset	465	fHandler.modifyAugmentations(null, true));
7f561c08de6b Initial load duke parents: diff changeset	466	}
7f561c08de6b Initial load duke parents: diff changeset	467	readSize = fReader.read(fTempString.ch, 0, fTempString.ch.length - 1);
7f561c08de6b Initial load duke parents: diff changeset	468	}
7f561c08de6b Initial load duke parents: diff changeset	469
7f561c08de6b Initial load duke parents: diff changeset	470	}
7f561c08de6b Initial load duke parents: diff changeset	471
7f561c08de6b Initial load duke parents: diff changeset	472	/**
7f561c08de6b Initial load duke parents: diff changeset	473	* Sets the input source on this text reader.
7f561c08de6b Initial load duke parents: diff changeset	474	*
7f561c08de6b Initial load duke parents: diff changeset	475	* @param source The XMLInputSource to use.
7f561c08de6b Initial load duke parents: diff changeset	476	*/
7f561c08de6b Initial load duke parents: diff changeset	477	public void setInputSource(XMLInputSource source) {
7f561c08de6b Initial load duke parents: diff changeset	478	fSource = source;
7f561c08de6b Initial load duke parents: diff changeset	479	}
7f561c08de6b Initial load duke parents: diff changeset	480
7f561c08de6b Initial load duke parents: diff changeset	481	/**
7f561c08de6b Initial load duke parents: diff changeset	482	* Closes the stream. Call this after parse(), or when there is no longer any need
7f561c08de6b Initial load duke parents: diff changeset	483	* for this object.
7f561c08de6b Initial load duke parents: diff changeset	484	*
7f561c08de6b Initial load duke parents: diff changeset	485	* @throws IOException
7f561c08de6b Initial load duke parents: diff changeset	486	*/
7f561c08de6b Initial load duke parents: diff changeset	487	public void close() throws IOException {
7f561c08de6b Initial load duke parents: diff changeset	488	if (fReader != null) {
7f561c08de6b Initial load duke parents: diff changeset	489	fReader.close();
7f561c08de6b Initial load duke parents: diff changeset	490	fReader = null;
7f561c08de6b Initial load duke parents: diff changeset	491	}
7f561c08de6b Initial load duke parents: diff changeset	492	}
7f561c08de6b Initial load duke parents: diff changeset	493
7f561c08de6b Initial load duke parents: diff changeset	494	/**
7f561c08de6b Initial load duke parents: diff changeset	495	* Returns true if the specified character is a valid XML character
7f561c08de6b Initial load duke parents: diff changeset	496	* as per the rules of XML 1.0.
7f561c08de6b Initial load duke parents: diff changeset	497	*
7f561c08de6b Initial load duke parents: diff changeset	498	* @param ch The character to check.
7f561c08de6b Initial load duke parents: diff changeset	499	*/
7f561c08de6b Initial load duke parents: diff changeset	500	protected boolean isValid(int ch) {
7f561c08de6b Initial load duke parents: diff changeset	501	return XMLChar.isValid(ch);
7f561c08de6b Initial load duke parents: diff changeset	502	}
7f561c08de6b Initial load duke parents: diff changeset	503
7f561c08de6b Initial load duke parents: diff changeset	504	/**
7f561c08de6b Initial load duke parents: diff changeset	505	* Sets the buffer size property for the reader which decides the chunk sizes that are parsed
7f561c08de6b Initial load duke parents: diff changeset	506	* by the reader at a time and passed to the handler
7f561c08de6b Initial load duke parents: diff changeset	507	*
7f561c08de6b Initial load duke parents: diff changeset	508	* @param bufferSize The size of the buffer desired
7f561c08de6b Initial load duke parents: diff changeset	509	*/
7f561c08de6b Initial load duke parents: diff changeset	510	protected void setBufferSize(int bufferSize) {
7f561c08de6b Initial load duke parents: diff changeset	511	if (fTempString.ch.length != ++bufferSize) {
7f561c08de6b Initial load duke parents: diff changeset	512	fTempString.ch = new char[bufferSize];
7f561c08de6b Initial load duke parents: diff changeset	513	}
7f561c08de6b Initial load duke parents: diff changeset	514	}
7f561c08de6b Initial load duke parents: diff changeset	515
7f561c08de6b Initial load duke parents: diff changeset	516	}

author	dfuchs
	Tue, 30 Jun 2015 12:04:27 +0200
changeset 31497	4a6b2e733c0d
parent 25868	686eef1e7a79
child 44797	8b3b3b911b8a
permissions	-rw-r--r--