author | chegar |
Thu, 17 Oct 2019 20:54:25 +0100 | |
branch | datagramsocketimpl-branch |
changeset 58679 | 9c3209ff7550 |
parent 58678 | 9cf78a70fa4f |
parent 55651 | 45fc36beb0aa |
permissions | -rw-r--r-- |
55113 | 1 |
/* |
2 |
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved. |
|
3 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
4 |
* |
|
5 |
* This code is free software; you can redistribute it and/or modify it |
|
6 |
* under the terms of the GNU General Public License version 2 only, as |
|
7 |
* published by the Free Software Foundation. Oracle designates this |
|
8 |
* particular file as subject to the "Classpath" exception as provided |
|
9 |
* by Oracle in the LICENSE file that accompanied this code. |
|
10 |
* |
|
11 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
12 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
13 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
14 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
15 |
* accompanied this code). |
|
16 |
* |
|
17 |
* You should have received a copy of the GNU General Public License version |
|
18 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
19 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
20 |
* |
|
21 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
22 |
* or visit www.oracle.com if you need additional information or have any |
|
23 |
* questions. |
|
24 |
*/ |
|
25 |
||
26 |
package build.tools.fixuppandoc; |
|
27 |
||
28 |
import java.io.BufferedReader; |
|
29 |
import java.io.BufferedWriter; |
|
30 |
import java.io.IOException; |
|
31 |
import java.io.InputStreamReader; |
|
32 |
import java.io.OutputStreamWriter; |
|
33 |
import java.io.PrintStream; |
|
34 |
import java.io.PrintWriter; |
|
35 |
import java.io.Reader; |
|
36 |
import java.io.Writer; |
|
37 |
import java.nio.file.Files; |
|
38 |
import java.nio.file.Path; |
|
39 |
import java.util.ArrayList; |
|
40 |
import java.util.LinkedHashMap; |
|
41 |
import java.util.LinkedHashSet; |
|
42 |
import java.util.List; |
|
43 |
import java.util.Locale; |
|
44 |
import java.util.Map; |
|
45 |
import java.util.Objects; |
|
46 |
import java.util.Set; |
|
55297 | 47 |
import java.util.regex.Matcher; |
55113 | 48 |
import java.util.regex.Pattern; |
49 |
||
50 |
/** |
|
51 |
* Fixup HTML generated by pandoc. |
|
52 |
* |
|
53 |
* <h2>{@code <html>}</h2> |
|
54 |
* |
|
55 |
* Replace the existing element with {@code <html lang="en">}, removing references to XML. |
|
56 |
* |
|
57 |
* <h2>{@code <main>}</h2> |
|
58 |
* |
|
59 |
* {@code <main>} is inserted if palpable content is found that is not within a |
|
60 |
* section such as {@code header}, {@code footer}, {@code aside}. |
|
61 |
* |
|
62 |
* {@code </main>} is inserted if {@code <main>} was inserted and a section |
|
63 |
* is started that should not be included in the main section. |
|
64 |
* |
|
65 |
* <h2>Tables: row headings</h2> |
|
66 |
* |
|
55297 | 67 |
* For simple tables, as typically generated by _pandoc_, determine the column |
68 |
* whose contents are unique, and convert the cells in that column to be header |
|
69 |
* cells with {@code scope="row"}. In case of ambiguity, a column containing a |
|
70 |
* {@code <th>} whose contents begin with <em>name</em> is preferred. |
|
71 |
* When converting the cell, the {@code style} attribute will be updated to |
|
72 |
* specify {@code font-weight: normal}, and if there is not already an explicit |
|
73 |
* setting for {@code text-align}, then the style will be updated to include |
|
74 |
* {@code text-align:left;}. |
|
55113 | 75 |
* |
55297 | 76 |
* These rules do not apply if the table contains any cells that include |
77 |
* a setting for the {@code scope} attribute, or if the table contains |
|
78 |
* spanning cells or nested tables. |
|
55113 | 79 |
* |
80 |
* <h2>{@code <meta name="generator">}</h2> |
|
81 |
* |
|
82 |
* Update the content string, to indicate it has been processed by this program. |
|
83 |
* |
|
55651
45fc36beb0aa
8227416: Add a "Table Of Contents" to HTML version of man pages
jjg
parents:
55300
diff
changeset
|
84 |
* <h2>{@code <nav id="TOC">}</h2> |
45fc36beb0aa
8227416: Add a "Table Of Contents" to HTML version of man pages
jjg
parents:
55300
diff
changeset
|
85 |
* |
45fc36beb0aa
8227416: Add a "Table Of Contents" to HTML version of man pages
jjg
parents:
55300
diff
changeset
|
86 |
* Set attribute {@code title="Table Of Contents"} |
45fc36beb0aa
8227416: Add a "Table Of Contents" to HTML version of man pages
jjg
parents:
55300
diff
changeset
|
87 |
* |
55113 | 88 |
*/ |
89 |
public class Main { |
|
90 |
/** |
|
91 |
* Runs the program. |
|
92 |
* |
|
93 |
* <pre> |
|
94 |
* java build.tools.fixuppandoc.Main [-o output-file] [input-file] |
|
95 |
* </pre> |
|
96 |
* |
|
97 |
* If no input file is specified, the program will read from standard input. |
|
98 |
* If no output file is specified, the program will write to standard output. |
|
99 |
* Any error messages will be written to the standard error stream. |
|
100 |
* |
|
101 |
* @param args the command-line arguments |
|
102 |
*/ |
|
103 |
public static void main(String... args) { |
|
104 |
try { |
|
105 |
new Main().run(args); |
|
106 |
} catch (IOException | IllegalArgumentException e) { |
|
107 |
System.err.println(e); |
|
108 |
System.exit(1); |
|
109 |
} catch (Throwable t) { |
|
110 |
t.printStackTrace(System.err); |
|
111 |
System.exit(1); |
|
112 |
} |
|
113 |
} |
|
114 |
||
115 |
private void run(String... args) throws IOException { |
|
116 |
Path inFile = null; |
|
117 |
Path outFile = null; |
|
118 |
||
119 |
for (int i = 0; i < args.length; i++) { |
|
120 |
String arg = args[i]; |
|
121 |
if (arg.equals("-o") && i + 1 < args.length) { |
|
122 |
outFile = Path.of(args[++i]); |
|
123 |
} else if (arg.startsWith("-")) { |
|
124 |
throw new IllegalArgumentException(arg); |
|
125 |
} else if (inFile == null) { |
|
126 |
inFile = Path.of(arg); |
|
127 |
} else { |
|
128 |
throw new IllegalArgumentException(arg); |
|
129 |
} |
|
130 |
} |
|
131 |
||
132 |
new Fixup().run(inFile, outFile); |
|
133 |
} |
|
134 |
||
135 |
/** |
|
136 |
* A class to read HTML, copying input to output, modifying |
|
137 |
* fragments as needed. |
|
138 |
*/ |
|
139 |
class Fixup extends HtmlParser { |
|
140 |
/** The output stream. */ |
|
141 |
PrintWriter out; |
|
142 |
||
143 |
/** A stream for reporting errors. */ |
|
144 |
PrintStream err = System.err; |
|
145 |
||
146 |
/** |
|
147 |
* Flag to indicate when {@code <main>} is permitted around palpable content. |
|
148 |
* Set within {@code <body>}; disabled within elements in which {@code <main>} |
|
149 |
* is not permitted. |
|
150 |
*/ |
|
151 |
boolean allowMain = false; |
|
152 |
||
153 |
/** |
|
154 |
* Flag to indicate that {@code <main>} is required. |
|
155 |
* Set on {@code <body>}; reset when {@code <main>} is either found or generated. |
|
156 |
*/ |
|
157 |
boolean needMain = false; |
|
158 |
||
159 |
/** |
|
160 |
* Flag to indicate that {@code </main>} is required. |
|
161 |
* Set if {@code <main>} is generated. |
|
162 |
* Reset when a start or end element is found that requires that {@code </main>} |
|
163 |
* needs to be generated if necessary. |
|
164 |
*/ |
|
165 |
boolean needEndMain = false; |
|
166 |
||
167 |
/** |
|
168 |
* Handler for {@code <table>} elements. |
|
169 |
*/ |
|
170 |
Table table; |
|
171 |
||
172 |
/** |
|
173 |
* Run the program, copying an input file to an output file. |
|
174 |
* If the input file is {@code null}, input is read from the standard input. |
|
175 |
* If the output file is {@code null}, output is written to the standard output. |
|
176 |
* |
|
177 |
* @param inFile the input file |
|
178 |
* @param outFile the output file |
|
179 |
* @throws IOException if an IO error occurs |
|
180 |
*/ |
|
181 |
void run(Path inFile, Path outFile) throws IOException { |
|
182 |
try (Writer out = openWriter(outFile)) { |
|
183 |
this.out = new PrintWriter(out); |
|
184 |
if (inFile != null) { |
|
185 |
read(inFile); |
|
186 |
} else { |
|
187 |
read(new BufferedReader(new InputStreamReader(System.in))); |
|
188 |
} |
|
189 |
} |
|
190 |
} |
|
191 |
||
192 |
/** |
|
193 |
* Returns a writer for a file, or for the standard output if the file is {@code null}. |
|
194 |
* |
|
195 |
* @param file the file |
|
196 |
* @return the writer |
|
197 |
* @throws IOException if an IO error occurs |
|
198 |
*/ |
|
199 |
private Writer openWriter(Path file) throws IOException { |
|
200 |
if (file != null) { |
|
201 |
return Files.newBufferedWriter(file); |
|
202 |
} else { |
|
203 |
return new BufferedWriter(new OutputStreamWriter(System.out) { |
|
204 |
@Override |
|
205 |
public void close() throws IOException { |
|
206 |
flush(); |
|
207 |
} |
|
208 |
}); |
|
209 |
} |
|
210 |
} |
|
211 |
||
212 |
@Override |
|
213 |
protected void error(Path file, int lineNumber, String message) { |
|
214 |
err.print(file == null ? "<stdin>" : file); |
|
215 |
if (lineNumber > 0) { |
|
216 |
err.print(":"); |
|
217 |
err.print(lineNumber); |
|
218 |
} |
|
219 |
err.print(": "); |
|
220 |
err.println(message); |
|
221 |
} |
|
222 |
||
223 |
@Override |
|
224 |
protected void error(Path file, int lineNumber, Throwable t) { |
|
225 |
error(file, lineNumber, t.toString()); |
|
226 |
t.printStackTrace(err); |
|
227 |
} |
|
228 |
||
229 |
/** |
|
230 |
* The buffer in which input is stored until an appropriate action can be determined. |
|
231 |
* Using the buffer ensures that the output exactly matches the input, except where |
|
232 |
* it is intentionally modified. |
|
233 |
*/ |
|
234 |
private StringBuilder buffer = new StringBuilder(); |
|
235 |
||
236 |
@Override |
|
237 |
public int nextChar() throws IOException { |
|
238 |
if (ch > 0) { |
|
239 |
buffer.append((char) ch); |
|
240 |
} |
|
241 |
return super.nextChar(); |
|
242 |
} |
|
243 |
||
244 |
@Override |
|
245 |
protected void doctype(String s) { |
|
246 |
flushBuffer(); |
|
247 |
} |
|
248 |
||
249 |
@Override |
|
250 |
protected void startElement(String name, Map<String,String> attrs, boolean selfClosing) { |
|
251 |
switch (name) { |
|
252 |
case "html": |
|
253 |
// replace the existing <html> fragment |
|
254 |
out.write("<html lang=\"en\">"); |
|
255 |
buffer.setLength(0); |
|
256 |
break; |
|
257 |
||
258 |
case "meta": |
|
259 |
// update the meta-data for the generator |
|
260 |
if (Objects.equals(attrs.get("name"), "generator")) { |
|
261 |
out.write(buffer.toString() |
|
262 |
.replaceAll("(content=\"[^\"]*)(\")", "$1,fixuphtml$2")); |
|
263 |
buffer.setLength(0); |
|
264 |
} |
|
265 |
break; |
|
266 |
||
267 |
case "article": |
|
268 |
case "aside": |
|
269 |
case "footer": |
|
270 |
case "header": |
|
271 |
case "nav": |
|
272 |
// starting one of these elements will terminate <main> if one is being |
|
273 |
// inserted |
|
274 |
if (needEndMain) { |
|
275 |
out.write("</main>"); |
|
276 |
needEndMain = false; |
|
277 |
} |
|
278 |
// <main> is not permitted within these elements |
|
279 |
allowMain = false; |
|
55651
45fc36beb0aa
8227416: Add a "Table Of Contents" to HTML version of man pages
jjg
parents:
55300
diff
changeset
|
280 |
if (name.equals("nav") && Objects.equals(attrs.get("id"), "TOC")) { |
45fc36beb0aa
8227416: Add a "Table Of Contents" to HTML version of man pages
jjg
parents:
55300
diff
changeset
|
281 |
out.write(buffer.toString() |
45fc36beb0aa
8227416: Add a "Table Of Contents" to HTML version of man pages
jjg
parents:
55300
diff
changeset
|
282 |
.replaceAll(">$", " title=\"Table Of Contents\">")); |
45fc36beb0aa
8227416: Add a "Table Of Contents" to HTML version of man pages
jjg
parents:
55300
diff
changeset
|
283 |
buffer.setLength(0); |
45fc36beb0aa
8227416: Add a "Table Of Contents" to HTML version of man pages
jjg
parents:
55300
diff
changeset
|
284 |
} |
55113 | 285 |
break; |
286 |
||
287 |
case "body": |
|
288 |
// within <body>, <main> is both permitted and required |
|
289 |
allowMain = true; |
|
290 |
needMain = true; |
|
291 |
break; |
|
292 |
||
293 |
case "main": |
|
294 |
// an explicit <main> found in the input; no need to add one |
|
295 |
needMain = false; |
|
296 |
break; |
|
297 |
||
298 |
case "table": |
|
299 |
// The entire content of a <table> is buffered, until it can be |
|
300 |
// determined in which column of the table contains the cells |
|
301 |
// that can be used to identify the row. |
|
302 |
if (table == null) { |
|
303 |
table = new Table(); |
|
304 |
} else { |
|
305 |
// tables containing nested tables are not updated |
|
306 |
table.simple = false; |
|
307 |
} |
|
308 |
table.nestDepth++; |
|
309 |
break; |
|
310 |
||
311 |
case "thead": |
|
312 |
case "tbody": |
|
313 |
if (table != null) { |
|
314 |
table.endCell(); |
|
315 |
} |
|
316 |
break; |
|
317 |
||
318 |
case "tr": |
|
319 |
if (table != null) { |
|
320 |
table.endCell(); |
|
321 |
table.nextCellColumnIndex = 0; |
|
322 |
} |
|
323 |
break; |
|
324 |
||
325 |
case "td": |
|
326 |
case "th": |
|
327 |
if (table != null) { |
|
328 |
if (attrs.containsKey("rowspan") |
|
329 |
|| attrs.containsKey("colspan") |
|
330 |
|| attrs.containsKey("scope")) { |
|
331 |
// tables containing spanning cells and tables that already |
|
332 |
// contain scope attributes are not updated |
|
333 |
table.simple = false; |
|
334 |
} |
|
335 |
table.startCell(name); |
|
336 |
} |
|
337 |
break; |
|
338 |
} |
|
339 |
||
340 |
// by default, the content is deemed to be palpable content, and so |
|
341 |
// insert <main> if it is permitted and one is still required, |
|
342 |
// while also ensuring that it does not appear before <body> |
|
343 |
if (allowMain && needMain && !name.equals("body")) { |
|
344 |
out.write("<main>"); |
|
345 |
needMain = false; |
|
346 |
needEndMain = true; |
|
347 |
} |
|
348 |
||
349 |
flushBuffer(); |
|
350 |
} |
|
351 |
||
352 |
@Override |
|
353 |
protected void endElement(String name) { |
|
354 |
switch (name) { |
|
355 |
case "article": |
|
356 |
case "aside": |
|
357 |
case "footer": |
|
358 |
case "header": |
|
359 |
case "nav": |
|
360 |
// The code does not handle nested elements of these kinds, but could. |
|
361 |
// So, assuming they are not nested, ending these elements implies |
|
362 |
// that <main> is once again permitted. |
|
363 |
allowMain = true; |
|
364 |
break; |
|
365 |
||
366 |
case "body": |
|
367 |
// The document is nearly done; insert <main> if needed |
|
368 |
if (needEndMain) { |
|
369 |
out.write("</main>"); |
|
370 |
needEndMain = false; |
|
371 |
} |
|
372 |
break; |
|
373 |
||
374 |
case "table": |
|
375 |
// if the table is finished, analyze it and write it out |
|
376 |
if (table != null) { |
|
377 |
if (--table.nestDepth == 0) { |
|
378 |
table.add(buffer.toString()); |
|
379 |
table.write(out); |
|
380 |
table = null; |
|
381 |
buffer.setLength(0); |
|
382 |
} |
|
383 |
} |
|
384 |
break; |
|
385 |
||
386 |
case "thead": |
|
387 |
case "tbody": |
|
388 |
case "tr": |
|
389 |
case "td": |
|
390 |
case "th": |
|
391 |
// ending any of these elements implicity or explicitly ends the |
|
392 |
// current cell |
|
393 |
table.endCell(); |
|
394 |
break; |
|
395 |
||
396 |
} |
|
397 |
flushBuffer(); |
|
398 |
} |
|
399 |
||
400 |
@Override |
|
401 |
protected void content(String content) { |
|
402 |
if (table != null) { |
|
403 |
table.content(content); |
|
404 |
} else if (allowMain && needMain && !content.isBlank()) { |
|
405 |
// insert <main> if required and if we have palpable content |
|
406 |
out.write("<main>"); |
|
407 |
needMain = false; |
|
408 |
needEndMain = true; |
|
409 |
} |
|
410 |
flushBuffer(); |
|
411 |
} |
|
412 |
||
413 |
@Override |
|
414 |
protected void comment(String comment) { |
|
415 |
flushBuffer(); |
|
416 |
} |
|
417 |
||
418 |
/** |
|
419 |
* Flushes the buffer, either by adding it into a table, if one is |
|
420 |
* in progress, or by writing it out. |
|
421 |
*/ |
|
422 |
private void flushBuffer() { |
|
423 |
String s = buffer.toString(); |
|
424 |
if (table != null) { |
|
425 |
table.add(s); |
|
426 |
} else { |
|
427 |
out.write(s); |
|
428 |
} |
|
429 |
buffer.setLength(0); |
|
430 |
||
431 |
} |
|
432 |
} |
|
433 |
||
434 |
/**
 * Storage for the content of a {@code <table>} element until we can determine
 * whether we should add {@code scope="row"} to the cells in a given column,
 * and if so, which column.
 *
 * The column with the highest number of unique entries is selected;
 * in case of ambiguity, a column whose heading begins "name" is chosen.
 *
 * Only "simple" tables are supported. Tables with any of the following
 * features are not considered "simple" and will not be modified:
 * <ul>
 * <li>Tables containing nested tables</li>
 * <li>Tables containing cells that use "rowspan" and "colspan" attributes</li>
 * <li>Tables containing cells that already use "scope" attributes</li>
 * </ul>
 */
class Table {
    /**
     * A fragment of HTML in this table.
     */
    class Entry {
        /** The fragment. */
        final String html;
        /** The column for a {@code <td>} fragment, or -1. */
        final int column;

        Entry(String html, int column) {
            this.html = html;
            this.column = column;
        }
    }

    /** Whether or not this is a "simple" table. */
    boolean simple = true;

    /** The nesting depth of the current table, within enclosing tables. */
    int nestDepth;

    /** A list of the HTML fragments that make up this table. */
    List<Entry> entries;

    /** The plain text contents of each column, used to determine the primary column. */
    List<Set<String>> columnContents;

    /** The column index of the next cell to be found. */
    int nextCellColumnIndex;

    /** A flag to mark the start of a {@code <td>} cell. */
    boolean startTDCell;

    /** The column index of the current cell, or -1 if not in a cell. */
    int currCellColumnIndex = -1;

    /** The plain text contents of the current column. */
    Set<String> currColumnContents;

    /** The plain text content of the current cell. */
    StringBuilder currCellContent;

    /** The kind ({@code th} or {@code td}) of the current cell. */
    String currCellKind;

    /**
     * The index of the column, if any, containing a heading beginning "name",
     * or -1 if no such heading has been found.
     * This column is given preferential treatment when deciding the primary column.
     */
    int nameColumn = -1;

    Table() {
        entries = new ArrayList<>();
        columnContents = new ArrayList<>();
    }

    /**
     * Starts a new cell in the next column of the current row,
     * ending any cell that is still open.
     *
     * @param name the kind of cell, {@code th} or {@code td}
     */
    void startCell(String name) {
        endCell();
        startTDCell = name.equals("td");
        currCellColumnIndex = nextCellColumnIndex++;
        currColumnContents = getColumn(currCellColumnIndex);
        currCellContent = new StringBuilder();
        currCellKind = name;
    }

    /**
     * Ends the current cell, if any, recording its trimmed text in the set
     * of contents for its column, and noting the column if the cell is a
     * {@code th} whose text begins "name" (ignoring case).
     */
    void endCell() {
        if (currCellContent != null) {
            String c = currCellContent.toString().trim();
            if (Objects.equals(currCellKind, "th")
                    && c.toLowerCase(Locale.US).startsWith("name")) {
                nameColumn = currCellColumnIndex;
            }
            currColumnContents.add(c);
            currCellContent = null;
            currCellColumnIndex = -1;
            currColumnContents = null;
        }
    }

    /**
     * Adds plain text to the content of the current cell, if any.
     *
     * @param content the text
     */
    void content(String content) {
        if (currCellContent != null) {
            currCellContent.append(content);
        }
    }

    /**
     * Adds a fragment of HTML to this table. The fragment is tagged with the
     * column index if it is the first fragment after the start of a
     * {@code <td>} cell (that is, the start tag itself), and with -1 otherwise.
     *
     * @param html the fragment
     */
    void add(String html) {
        int index = startTDCell ? currCellColumnIndex : -1;
        entries.add(new Entry(html, index));
        startTDCell = false;
    }

    /**
     * Writes out the table. If the table is "simple", the {@code <td>} cells
     * in the primary column are rewritten as {@code <th scope="row">} cells;
     * all other fragments are written unchanged.
     *
     * @param out the stream to which to write the table
     */
    void write(PrintWriter out) {
        // find the column with the most distinct values; on a tie, the column
        // with the "name" heading replaces the earlier choice
        int max = -1;
        int maxIndex = -1;
        int index = 0;
        for (Set<String> c : columnContents) {
            if (c.size() > max || c.size() == max && index == nameColumn) {
                max = c.size();
                maxIndex = index;
            }
            index++;
        }
        boolean updateEndTd = false;
        Pattern styleAttr = Pattern.compile("(?<before>.*style=\")(?<style>[^\"]*)(?<after>\".*)");
        for (Entry e : entries) {
            // Only fragments that start a <td> cell have a column index >= 0.
            // Without that check, a table with no cells (maxIndex == -1) would
            // match every non-cell fragment, which is tagged with column -1.
            if (simple && e.column >= 0 && e.column == maxIndex) {
                // the fragment is "<td" attrs ">": keep the attributes
                String attrs = e.html.substring(3, e.html.length() - 1);
                out.write("<th");
                Matcher m = styleAttr.matcher(attrs);
                if (m.matches()) {
                    out.write(m.group("before"));
                    out.write("font-weight: normal; ");
                    String style = m.group("style");
                    if (!style.contains("text-align")) {
                        out.write("text-align: left; ");
                    }
                    out.write(style);
                    out.write(m.group("after"));
                } else {
                    out.write(" style=\"font-weight: normal; text-align:left;\" ");
                    out.write(attrs);
                }
                out.write(" scope=\"row\"");
                out.write(">");
                updateEndTd = true;
            } else if (updateEndTd && e.html.equalsIgnoreCase("</td>")) {
                out.write("</th>");
                updateEndTd = false;
            } else {
                out.write(e.html);
                if (updateEndTd && e.html.regionMatches(true, 0, "<td", 0, 3)) {
                    // a new cell has been started without explicitly closing the
                    // cell that was being updated
                    updateEndTd = false;
                }
            }
        }
    }

    /**
     * Returns the set of contents for a column, creating sets for this
     * and any preceding columns as needed.
     */
    private Set<String> getColumn(int index) {
        while (columnContents.size() <= index) {
            columnContents.add(new LinkedHashSet<>());
        }

        return columnContents.get(index);
    }
}
|
598 |
||
599 |
/**
 * A basic HTML parser.
 * Override the protected methods as needed to get notified of significant items
 * in any file that is read.
 */
abstract class HtmlParser {

    /** The file being read, or {@code null} if reading from a stream. */
    private Path file;
    /** The source of characters. */
    private Reader in;
    /** The current character, or -1 at the end of the input. */
    protected int ch;
    /** The number of the line containing the current character. */
    private int lineNumber;
    /** Whether the parser is within a {@code <script>} element. */
    private boolean inScript;
    /** Whether an XML declaration has been found at the start of the input. */
    private boolean xml;

    /**
     * Read a file.
     * Any IO error is reported by calling {@code error}.
     * @param file the file
     */
    void read(Path file) {
        try (Reader r = Files.newBufferedReader(file)) {
            this.file = file;
            read(r);
        } catch (IOException e) {
            error(file, -1, e);
        }
    }

    HtmlParser() { }

    /**
     * Read a stream.
     * Character content is reported at the end of each line, before each
     * item of markup, and once more at the end of the input.
     * Any exception is reported by calling {@code error}.
     * @param r the stream
     */
    void read(Reader r) {
        try {
            this.in = r;
            StringBuilder content = new StringBuilder();

            startFile(file);
            try {
                lineNumber = 1;
                xml = false;
                nextChar();

                while (ch != -1) {
                    if (ch == '<') {
                        content(content.toString());
                        content.setLength(0);
                        html();
                    } else {
                        content.append((char) ch);
                        if (ch == '\n') {
                            content(content.toString());
                            content.setLength(0);
                        }
                        nextChar();
                    }
                }

                // Report whatever followed the last newline or tag; without
                // this, an unterminated last line is never delivered and any
                // subclass that buffers the input never gets to flush it.
                content(content.toString());
                content.setLength(0);
            } finally {
                endFile();
            }
        } catch (IOException e) {
            error(file, lineNumber, e);
        } catch (Throwable t) {
            error(file, lineNumber, t);
            t.printStackTrace(System.err);
        }
    }

    protected int getLineNumber() {
        return lineNumber;
    }

    /**
     * Called when a file has been opened, before parsing begins.
     * This is always the first notification when reading a file.
     * This implementation does nothing.
     *
     * @param file the file
     */
    protected void startFile(Path file) { }

    /**
     * Called when the parser has finished reading a file.
     * This is always the last notification when reading a file,
     * unless any errors occur while closing the file.
     * This implementation does nothing.
     */
    protected void endFile() { }

    /**
     * Called when a doctype declaration is found, at the beginning of the file.
     * This implementation does nothing.
     * @param s the doctype declaration
     */
    protected void doctype(String s) { }

    /**
     * Called when the opening tag of an HTML element is encountered.
     * This implementation does nothing.
     * @param name the name of the tag
     * @param attrs the attributes
     * @param selfClosing whether or not this is a self-closing tag
     */
    protected void startElement(String name, Map<String,String> attrs, boolean selfClosing) { }

    /**
     * Called when the closing tag of an HTML element is encountered.
     * This implementation does nothing.
     * @param name the name of the tag
     */
    protected void endElement(String name) { }

    /**
     * Called for sequences of character content.
     * The content may be an empty string.
     * This implementation does nothing.
     * @param content the character content
     */
    protected void content(String content) { }

    /**
     * Called for sequences of comment.
     * This implementation does nothing.
     * @param comment the comment
     */
    protected void comment(String comment) { }

    /**
     * Called when an error has been encountered.
     * @param file the file being read
     * @param lineNumber the line number of line containing the error
     * @param message a description of the error
     */
    protected abstract void error(Path file, int lineNumber, String message);

    /**
     * Called when an exception has been encountered.
     * @param file the file being read
     * @param lineNumber the line number of the line being read when the exception was found
     * @param t the exception
     */
    protected abstract void error(Path file, int lineNumber, Throwable t);

    /**
     * Reads the next character into {@code ch}, counting lines as it goes.
     * @return the character, or -1 at the end of the input
     * @throws IOException if there is a problem reading the file
     */
    protected int nextChar() throws IOException {
        ch = in.read();
        if (ch == '\n')
            lineNumber++;
        return ch;
    }

    /**
     * Read the start or end of an HTML tag, or an HTML comment
     * {@literal <identifier attrs> } or {@literal </identifier> }
     * Also handles CDATA sections, the doctype declaration and the XML
     * declaration. Anything else is reported as "bad html", unless the
     * parser is within a {@code <script>} element.
     * @throws java.io.IOException if there is a problem reading the file
     */
    protected void html() throws IOException {
        nextChar();
        if (isIdentifierStart((char) ch)) {
            String name = readIdentifier().toLowerCase(Locale.US);
            Map<String,String> attrs = htmlAttrs();
            if (attrs != null) {
                boolean selfClosing = false;
                if (ch == '/') {
                    nextChar();
                    selfClosing = true;
                }
                if (ch == '>') {
                    nextChar();
                    startElement(name, attrs, selfClosing);
                    if (name.equals("script")) {
                        inScript = true;
                    }
                    return;
                }
            }
        } else if (ch == '/') {
            nextChar();
            if (isIdentifierStart((char) ch)) {
                String name = readIdentifier().toLowerCase(Locale.US);
                skipWhitespace();
                if (ch == '>') {
                    nextChar();
                    endElement(name);
                    if (name.equals("script")) {
                        inScript = false;
                    }
                    return;
                }
            }
        } else if (ch == '!') {
            nextChar();
            if (ch == '-') {
                nextChar();
                if (ch == '-') {
                    nextChar();
                    StringBuilder comment = new StringBuilder();
                    while (ch != -1) {
                        int dash = 0;
                        while (ch == '-') {
                            dash++;
                            // cast is required: append(int) would add the
                            // decimal digits of the code, not the character
                            comment.append((char) ch);
                            nextChar();
                        }
                        // Strictly speaking, a comment should not contain "--"
                        // so dash > 2 is an error, dash == 2 implies ch == '>'
                        // See http://www.w3.org/TR/html-markup/syntax.html#syntax-comments
                        // for more details.
                        if (dash >= 2 && ch == '>') {
                            // drop the "--" that is part of the terminator
                            comment.setLength(comment.length() - 2);
                            comment(comment.toString());
                            nextChar();
                            return;
                        }

                        if (ch == -1) {
                            // unterminated comment; reported as bad html below
                            break;
                        }
                        comment.append((char) ch);
                        nextChar();
                    }
                }
            } else if (ch == '[') {
                nextChar();
                if (skipExpected("CDATA[")) {
                    // skip to the end of the "]]>" terminator, stopping at the
                    // end of the input if the section is not terminated
                    int brackets = 0;
                    while (ch != -1) {
                        if (ch == ']') {
                            brackets++;
                        } else if (ch == '>' && brackets >= 2) {
                            nextChar();
                            return;
                        } else {
                            brackets = 0;
                        }
                        nextChar();
                    }
                }
            } else {
                StringBuilder sb = new StringBuilder();
                while (ch != -1 && ch != '>') {
                    sb.append((char) ch);
                    nextChar();
                }
                Pattern p = Pattern.compile("(?is)doctype\\s+html\\s?.*");
                String s = sb.toString();
                if (p.matcher(s).matches()) {
                    // the closing '>' is not consumed here; the caller
                    // will treat it as character content
                    doctype(s);
                    return;
                }
            }
        } else if (ch == '?') {
            nextChar();
            if (ch == 'x') {
                nextChar();
                if (ch == 'm') {
                    nextChar();
                    if (ch == 'l') {
                        // the 'l' is left as the current character, and is
                        // read by htmlAttrs as the start of an attribute name;
                        // the attributes themselves are not used
                        htmlAttrs();
                        if (ch == '?') {
                            nextChar();
                            if (ch == '>') {
                                nextChar();
                                xml = true;
                                return;
                            }
                        }
                    }
                }

            }
        }

        if (!inScript) {
            error(file, lineNumber, "bad html");
        }
    }

    /**
     * Skips the given characters if they appear next in the input, starting
     * with the current character. If a character does not match, reading
     * stops at that character.
     *
     * @param expected the characters to skip
     * @return whether all the characters were found
     * @throws IOException if there is a problem reading the file
     */
    private boolean skipExpected(String expected) throws IOException {
        for (int i = 0; i < expected.length(); i++) {
            if (ch != expected.charAt(i)) {
                return false;
            }
            nextChar();
        }
        return true;
    }

    /**
     * Read a series of HTML attributes, terminated by {@literal > }.
     * Each attribute is of the form {@literal identifier[=value] }.
     * "value" may be unquoted, single-quoted, or double-quoted.
     * Attribute names are converted to lower case; an attribute with no
     * value is recorded with a {@code null} value.
     */
    private Map<String,String> htmlAttrs() throws IOException {
        Map<String, String> map = new LinkedHashMap<>();
        skipWhitespace();

        while (isIdentifierStart((char) ch)) {
            String name = readAttributeName().toLowerCase(Locale.US);
            skipWhitespace();
            String value = null;
            if (ch == '=') {
                nextChar();
                skipWhitespace();
                if (ch == '\'' || ch == '"') {
                    char quote = (char) ch;
                    nextChar();
                    StringBuilder sb = new StringBuilder();
                    while (ch != -1 && ch != quote) {
                        sb.append((char) ch);
                        nextChar();
                    }
                    value = sb.toString() // hack to replace common entities
                            .replace("&lt;", "<")
                            .replace("&gt;", ">")
                            .replace("&amp;", "&");
                    nextChar();
                } else {
                    StringBuilder sb = new StringBuilder();
                    while (ch != -1 && !isUnquotedAttrValueTerminator((char) ch)) {
                        sb.append((char) ch);
                        nextChar();
                    }
                    value = sb.toString();
                }
                skipWhitespace();
            }
            map.put(name, value);
        }

        return map;
    }

    private boolean isIdentifierStart(char ch) {
        return Character.isUnicodeIdentifierStart(ch);
    }

    /**
     * Reads an identifier, beginning with the current character.
     */
    private String readIdentifier() throws IOException {
        StringBuilder sb = new StringBuilder();
        sb.append((char) ch);
        nextChar();
        while (ch != -1 && Character.isUnicodeIdentifierPart(ch)) {
            sb.append((char) ch);
            nextChar();
        }
        return sb.toString();
    }

    /**
     * Reads an attribute name, beginning with the current character.
     * In addition to identifier characters, a name may contain {@code -},
     * and may contain {@code :} if an XML declaration has been found or
     * if the name begins "xml".
     */
    private String readAttributeName() throws IOException {
        StringBuilder sb = new StringBuilder();
        sb.append((char) ch);
        nextChar();
        while ((ch != -1 && Character.isUnicodeIdentifierPart(ch))
                || ch == '-'
                || ((xml || sb.toString().startsWith("xml")) && ch == ':')) {
            sb.append((char) ch);
            nextChar();
        }
        return sb.toString();
    }

    private boolean isWhitespace(char ch) {
        return Character.isWhitespace(ch);
    }

    private void skipWhitespace() throws IOException {
        while (isWhitespace((char) ch)) {
            nextChar();
        }
    }

    /**
     * Returns whether a character ends an unquoted attribute value.
     */
    private boolean isUnquotedAttrValueTerminator(char ch) {
        switch (ch) {
            case '\f': case '\n': case '\r': case '\t':
            case ' ':
            case '"': case '\'': case '`':
            case '=': case '<': case '>':
                return true;
            default:
                return false;
        }
    }
}
|
983 |
||
984 |
} |