2
|
1 |
/*
|
|
2 |
* Copyright 1998-2003 Sun Microsystems, Inc. All Rights Reserved.
|
|
3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
|
4 |
*
|
|
5 |
* This code is free software; you can redistribute it and/or modify it
|
|
6 |
* under the terms of the GNU General Public License version 2 only, as
|
|
7 |
* published by the Free Software Foundation. Sun designates this
|
|
8 |
* particular file as subject to the "Classpath" exception as provided
|
|
9 |
* by Sun in the LICENSE file that accompanied this code.
|
|
10 |
*
|
|
11 |
* This code is distributed in the hope that it will be useful, but WITHOUT
|
|
12 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
13 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
14 |
* version 2 for more details (a copy is included in the LICENSE file that
|
|
15 |
* accompanied this code).
|
|
16 |
*
|
|
17 |
* You should have received a copy of the GNU General Public License version
|
|
18 |
* 2 along with this work; if not, write to the Free Software Foundation,
|
|
19 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
20 |
*
|
|
21 |
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
|
|
22 |
* CA 95054 USA or visit www.sun.com if you need additional information or
|
|
23 |
* have any questions.
|
|
24 |
*/
|
|
25 |
|
|
26 |
package javax.swing.text.html.parser;
|
|
27 |
|
|
28 |
import javax.swing.text.SimpleAttributeSet;
|
|
29 |
import javax.swing.text.html.HTMLEditorKit;
|
|
30 |
import javax.swing.text.html.HTML;
|
|
31 |
import javax.swing.text.ChangedCharSetException;
|
|
32 |
|
|
33 |
import java.util.*;
|
|
34 |
import java.io.*;
|
|
35 |
import java.net.*;
|
|
36 |
|
|
37 |
/**
|
|
38 |
* A Parser for HTML Documents (actually, you can specify a DTD, but
|
|
39 |
* you should really only use this class with the html dtd in swing).
|
|
40 |
* Reads an InputStream of HTML and
|
|
41 |
* invokes the appropriate methods in the ParserCallback class. This
|
|
42 |
* is the default parser used by HTMLEditorKit to parse HTML url's.
|
|
43 |
* <p>This will message the callback for all valid tags, as well as
|
|
44 |
* tags that are implied but not explicitly specified. For example, the
|
|
45 |
* html string (&lt;p&gt;blah) only has a p tag defined. The callback
|
|
46 |
* will see the following methods:
|
|
47 |
* <ol><li><i>handleStartTag(html, ...)</i></li>
|
|
48 |
* <li><i>handleStartTag(head, ...)</i></li>
|
|
49 |
* <li><i>handleEndTag(head)</i></li>
|
|
50 |
* <li><i>handleStartTag(body, ...)</i></li>
|
|
51 |
* <li>handleStartTag(p, ...)</li>
|
|
52 |
* <li>handleText(...)</li>
|
|
53 |
* <li><i>handleEndTag(p)</i></li>
|
|
54 |
* <li><i>handleEndTag(body)</i></li>
|
|
55 |
* <li><i>handleEndTag(html)</i></li>
|
|
56 |
* </ol>
|
|
57 |
* The items in <i>italic</i> are implied, that is, although they were not
|
|
58 |
* explicitly specified, to be correct html they should have been present
|
|
59 |
* (head isn't necessary, but it is still generated). For tags that
|
|
60 |
* are implied, the AttributeSet argument will have a value of
|
|
61 |
* <code>Boolean.TRUE</code> for the key
|
|
62 |
* <code>HTMLEditorKit.ParserCallback.IMPLIED</code>.
|
|
63 |
* <p>HTML.Attributes defines a type safe enumeration of html attributes.
|
|
64 |
* If an attribute key of a tag is defined in HTML.Attribute, the
|
|
65 |
* HTML.Attribute will be used as the key, otherwise a String will be used.
|
|
66 |
* For example &lt;p foo=bar class=neat&gt; has two attributes. foo is
|
|
67 |
* not defined in HTML.Attribute, whereas class is, therefore the
|
|
68 |
* AttributeSet will have two values in it, HTML.Attribute.CLASS with
|
|
69 |
* a String value of 'neat' and the String key 'foo' with a String value of
|
|
70 |
* 'bar'.
|
|
71 |
* <p>The position argument will indicate the start of the tag, comment
|
|
72 |
* or text. Similar to arrays, the first character in the stream has a
|
|
73 |
* position of 0. For tags that are
|
|
74 |
* implied the position will indicate
|
|
75 |
* the location of the next encountered tag. In the first example,
|
|
76 |
* the implied start body and html tags will have the same position as the
|
|
77 |
* p tag, and the implied end p, html and body tags will all have the same
|
|
78 |
* position.
|
|
79 |
* <p>As html skips whitespace the position for text will be the position
|
|
80 |
* of the first valid character, eg in the string '\n\n\nblah'
|
|
81 |
* the text 'blah' will have a position of 3, the newlines are skipped.
|
|
82 |
* <p>
|
|
83 |
* For attributes that do not have a value, eg in the html
|
|
84 |
* string <code>&lt;foo blah&gt;</code> the attribute <code>blah</code>
|
|
85 |
* does not have a value, there are two possible values that will be
|
|
86 |
* placed in the AttributeSet's value:
|
|
87 |
* <ul>
|
|
88 |
* <li>If the DTD does not contain a definition for the element, or the
|
|
89 |
* definition does not have an explicit value then the value in the
|
|
90 |
* AttributeSet will be <code>HTML.NULL_ATTRIBUTE_VALUE</code>.
|
|
91 |
* <li>If the DTD contains an explicit value, as in:
|
|
92 |
* <code>&lt;!ATTLIST OPTION selected (selected) #IMPLIED&gt;</code>
|
|
93 |
* this value from the dtd (in this case selected) will be used.
|
|
94 |
* </ul>
|
|
95 |
* <p>
|
|
96 |
* Once the stream has been parsed, the callback is notified of the most
|
|
97 |
* likely end of line string. The end of line string will be one of
|
|
98 |
* \n, \r or \r\n, whichever is encountered the most in parsing the
|
|
99 |
* stream.
|
|
100 |
*
|
|
101 |
* @author Sunita Mani
|
|
102 |
*/
|
|
103 |
public class DocumentParser extends javax.swing.text.html.parser.Parser {
|
|
104 |
|
|
105 |
private int inbody;
|
|
106 |
private int intitle;
|
|
107 |
private int inhead;
|
|
108 |
private int instyle;
|
|
109 |
private int inscript;
|
|
110 |
private boolean seentitle;
|
|
111 |
private HTMLEditorKit.ParserCallback callback = null;
|
|
112 |
private boolean ignoreCharSet = false;
|
|
113 |
private static final boolean debugFlag = false;
|
|
114 |
|
|
115 |
public DocumentParser(DTD dtd) {
|
|
116 |
super(dtd);
|
|
117 |
}
|
|
118 |
|
|
119 |
public void parse(Reader in, HTMLEditorKit.ParserCallback callback, boolean ignoreCharSet) throws IOException {
|
|
120 |
this.ignoreCharSet = ignoreCharSet;
|
|
121 |
this.callback = callback;
|
|
122 |
parse(in);
|
|
123 |
// end of line
|
|
124 |
callback.handleEndOfLineString(getEndOfLineString());
|
|
125 |
}
|
|
126 |
|
|
127 |
/**
|
|
128 |
* Handle Start Tag.
|
|
129 |
*/
|
|
130 |
protected void handleStartTag(TagElement tag) {
|
|
131 |
|
|
132 |
Element elem = tag.getElement();
|
|
133 |
if (elem == dtd.body) {
|
|
134 |
inbody++;
|
|
135 |
} else if (elem == dtd.html) {
|
|
136 |
} else if (elem == dtd.head) {
|
|
137 |
inhead++;
|
|
138 |
} else if (elem == dtd.title) {
|
|
139 |
intitle++;
|
|
140 |
} else if (elem == dtd.style) {
|
|
141 |
instyle++;
|
|
142 |
} else if (elem == dtd.script) {
|
|
143 |
inscript++;
|
|
144 |
}
|
|
145 |
if (debugFlag) {
|
|
146 |
if (tag.fictional()) {
|
|
147 |
debug("Start Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
|
|
148 |
} else {
|
|
149 |
debug("Start Tag: " + tag.getHTMLTag() + " attributes: " +
|
|
150 |
getAttributes() + " pos: " + getCurrentPos());
|
|
151 |
}
|
|
152 |
}
|
|
153 |
if (tag.fictional()) {
|
|
154 |
SimpleAttributeSet attrs = new SimpleAttributeSet();
|
|
155 |
attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
|
|
156 |
Boolean.TRUE);
|
|
157 |
callback.handleStartTag(tag.getHTMLTag(), attrs,
|
|
158 |
getBlockStartPosition());
|
|
159 |
} else {
|
|
160 |
callback.handleStartTag(tag.getHTMLTag(), getAttributes(),
|
|
161 |
getBlockStartPosition());
|
|
162 |
flushAttributes();
|
|
163 |
}
|
|
164 |
}
|
|
165 |
|
|
166 |
|
|
167 |
protected void handleComment(char text[]) {
|
|
168 |
if (debugFlag) {
|
|
169 |
debug("comment: ->" + new String(text) + "<-"
|
|
170 |
+ " pos: " + getCurrentPos());
|
|
171 |
}
|
|
172 |
callback.handleComment(text, getBlockStartPosition());
|
|
173 |
}
|
|
174 |
|
|
175 |
/**
|
|
176 |
* Handle Empty Tag.
|
|
177 |
*/
|
|
178 |
protected void handleEmptyTag(TagElement tag) throws ChangedCharSetException {
|
|
179 |
|
|
180 |
Element elem = tag.getElement();
|
|
181 |
if (elem == dtd.meta && !ignoreCharSet) {
|
|
182 |
SimpleAttributeSet atts = getAttributes();
|
|
183 |
if (atts != null) {
|
|
184 |
String content = (String)atts.getAttribute(HTML.Attribute.CONTENT);
|
|
185 |
if (content != null) {
|
|
186 |
if ("content-type".equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
|
|
187 |
if (!content.equalsIgnoreCase("text/html") &&
|
|
188 |
!content.equalsIgnoreCase("text/plain")) {
|
|
189 |
throw new ChangedCharSetException(content, false);
|
|
190 |
}
|
|
191 |
} else if ("charset" .equalsIgnoreCase((String)atts.getAttribute(HTML.Attribute.HTTPEQUIV))) {
|
|
192 |
throw new ChangedCharSetException(content, true);
|
|
193 |
}
|
|
194 |
}
|
|
195 |
}
|
|
196 |
}
|
|
197 |
if (inbody != 0 || elem == dtd.meta || elem == dtd.base || elem == dtd.isindex || elem == dtd.style || elem == dtd.link) {
|
|
198 |
if (debugFlag) {
|
|
199 |
if (tag.fictional()) {
|
|
200 |
debug("Empty Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
|
|
201 |
} else {
|
|
202 |
debug("Empty Tag: " + tag.getHTMLTag() + " attributes: "
|
|
203 |
+ getAttributes() + " pos: " + getCurrentPos());
|
|
204 |
}
|
|
205 |
}
|
|
206 |
if (tag.fictional()) {
|
|
207 |
SimpleAttributeSet attrs = new SimpleAttributeSet();
|
|
208 |
attrs.addAttribute(HTMLEditorKit.ParserCallback.IMPLIED,
|
|
209 |
Boolean.TRUE);
|
|
210 |
callback.handleSimpleTag(tag.getHTMLTag(), attrs,
|
|
211 |
getBlockStartPosition());
|
|
212 |
} else {
|
|
213 |
callback.handleSimpleTag(tag.getHTMLTag(), getAttributes(),
|
|
214 |
getBlockStartPosition());
|
|
215 |
flushAttributes();
|
|
216 |
}
|
|
217 |
}
|
|
218 |
}
|
|
219 |
|
|
220 |
/**
|
|
221 |
* Handle End Tag.
|
|
222 |
*/
|
|
223 |
protected void handleEndTag(TagElement tag) {
|
|
224 |
Element elem = tag.getElement();
|
|
225 |
if (elem == dtd.body) {
|
|
226 |
inbody--;
|
|
227 |
} else if (elem == dtd.title) {
|
|
228 |
intitle--;
|
|
229 |
seentitle = true;
|
|
230 |
} else if (elem == dtd.head) {
|
|
231 |
inhead--;
|
|
232 |
} else if (elem == dtd.style) {
|
|
233 |
instyle--;
|
|
234 |
} else if (elem == dtd.script) {
|
|
235 |
inscript--;
|
|
236 |
}
|
|
237 |
if (debugFlag) {
|
|
238 |
debug("End Tag: " + tag.getHTMLTag() + " pos: " + getCurrentPos());
|
|
239 |
}
|
|
240 |
callback.handleEndTag(tag.getHTMLTag(), getBlockStartPosition());
|
|
241 |
|
|
242 |
}
|
|
243 |
|
|
244 |
/**
|
|
245 |
* Handle Text.
|
|
246 |
*/
|
|
247 |
protected void handleText(char data[]) {
|
|
248 |
if (data != null) {
|
|
249 |
if (inscript != 0) {
|
|
250 |
callback.handleComment(data, getBlockStartPosition());
|
|
251 |
return;
|
|
252 |
}
|
|
253 |
if (inbody != 0 || ((instyle != 0) ||
|
|
254 |
((intitle != 0) && !seentitle))) {
|
|
255 |
if (debugFlag) {
|
|
256 |
debug("text: ->" + new String(data) + "<-" + " pos: " + getCurrentPos());
|
|
257 |
}
|
|
258 |
callback.handleText(data, getBlockStartPosition());
|
|
259 |
}
|
|
260 |
}
|
|
261 |
}
|
|
262 |
|
|
263 |
/*
|
|
264 |
* Error handling.
|
|
265 |
*/
|
|
266 |
protected void handleError(int ln, String errorMsg) {
|
|
267 |
if (debugFlag) {
|
|
268 |
debug("Error: ->" + errorMsg + "<-" + " pos: " + getCurrentPos());
|
|
269 |
}
|
|
270 |
/* PENDING: need to improve the error string. */
|
|
271 |
callback.handleError(errorMsg, getCurrentPos());
|
|
272 |
}
|
|
273 |
|
|
274 |
|
|
275 |
/*
|
|
276 |
* debug messages
|
|
277 |
*/
|
|
278 |
private void debug(String msg) {
|
|
279 |
System.out.println(msg);
|
|
280 |
}
|
|
281 |
}
|