make/jdk/src/classes/build/tools/fixuppandoc/Main.java
branchdatagramsocketimpl-branch
changeset 58678 9cf78a70fa4f
child 58679 9c3209ff7550
equal deleted inserted replaced
58677:13588c901957 58678:9cf78a70fa4f
       
     1 /*
       
     2  * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
       
     3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     4  *
       
     5  * This code is free software; you can redistribute it and/or modify it
       
     6  * under the terms of the GNU General Public License version 2 only, as
       
     7  * published by the Free Software Foundation.  Oracle designates this
       
     8  * particular file as subject to the "Classpath" exception as provided
       
     9  * by Oracle in the LICENSE file that accompanied this code.
       
    10  *
       
    11  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    14  * version 2 for more details (a copy is included in the LICENSE file that
       
    15  * accompanied this code).
       
    16  *
       
    17  * You should have received a copy of the GNU General Public License version
       
    18  * 2 along with this work; if not, write to the Free Software Foundation,
       
    19  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    20  *
       
    21  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    22  * or visit www.oracle.com if you need additional information or have any
       
    23  * questions.
       
    24  */
       
    25 
       
    26 package build.tools.fixuppandoc;
       
    27 
       
    28 import java.io.BufferedReader;
       
    29 import java.io.BufferedWriter;
       
    30 import java.io.IOException;
       
    31 import java.io.InputStreamReader;
       
    32 import java.io.OutputStreamWriter;
       
    33 import java.io.PrintStream;
       
    34 import java.io.PrintWriter;
       
    35 import java.io.Reader;
       
    36 import java.io.Writer;
       
    37 import java.nio.file.Files;
       
    38 import java.nio.file.Path;
       
    39 import java.util.ArrayList;
       
    40 import java.util.LinkedHashMap;
       
    41 import java.util.LinkedHashSet;
       
    42 import java.util.List;
       
    43 import java.util.Locale;
       
    44 import java.util.Map;
       
    45 import java.util.Objects;
       
    46 import java.util.Set;
       
    47 import java.util.regex.Matcher;
       
    48 import java.util.regex.Pattern;
       
    49 
       
    50 /**
       
    51  * Fixup HTML generated by pandoc.
       
    52  *
       
    53  * <h2>{@code <html>}</h2>
       
    54  *
       
    55  * Replace the existing element with {@code <html lang="en">}, removing references to XML.
       
    56  *
       
    57  * <h2>{@code <main>}</h2>
       
    58  *
       
     59  * {@code <main>} is inserted if palpable content is found that is not within a
       
    60  * section such as {@code header},  {@code footer},  {@code aside}.
       
    61  *
       
    62  * {@code </main>} is inserted if {@code <main>} was inserted and a section
       
    63  * is started that should not be included in the main section.
       
    64  *
       
    65  * <h2>Tables: row headings</h2>
       
    66  *
       
    67  * For simple tables, as typically generated by _pandoc_, determine the column
       
    68  * whose contents are unique, and convert the cells in that column to be header
       
    69  * cells with {@code scope="row"}. In case of ambiguity, a column containing a
       
    70  * {@code <th>} whose contents begin with <em>name</em> is preferred.
       
    71  * When converting the cell, the {@code style} attribute will be updated to
       
    72  * specify {@code font-weight: normal}, and if there is not already an explicit
       
    73  * setting for {@code text-align}, then the style will be updated to include
       
    74  * {@code text-align:left;}.
       
    75  *
       
    76  * These rules do not apply if the table contains any cells that include
       
    77  * a setting for the {@code scope} attribute, or if the table contains
       
    78  * spanning cells or nested tables.
       
    79  *
       
    80  * <h2>{@code <meta name="generator">}</h2>
       
    81  *
       
    82  * Update the content string, to indicate it has been processed by this program.
       
    83  *
       
    84  * <h2>{@code <nav id="TOC">}</h2>
       
    85  *
       
    86  * Set attribute {@code title="Table Of Contents"}
       
    87  *
       
    88  */
       
    89 public class Main {
       
    90     /**
       
    91      * Runs the program.
       
    92      *
       
    93      * <pre>
       
    94      *     java build.tools.fixuppandoc.Main [-o output-file] [input-file]
       
    95      * </pre>
       
    96      *
       
    97      * If no input file is specified, the program will read from standard input.
       
    98      * If no output file is specified, the program will write to standard output.
       
    99      * Any error messages will be written to the standard error stream.
       
   100      *
       
   101      * @param args the command-line arguments
       
   102      */
       
   103     public static void main(String... args) {
       
   104         try {
       
   105             new Main().run(args);
       
   106         } catch (IOException | IllegalArgumentException e) {
       
   107             System.err.println(e);
       
   108             System.exit(1);
       
   109         } catch (Throwable t) {
       
   110             t.printStackTrace(System.err);
       
   111             System.exit(1);
       
   112         }
       
   113     }
       
   114 
       
   115     private void run(String... args) throws IOException {
       
   116         Path inFile = null;
       
   117         Path outFile = null;
       
   118 
       
   119         for (int i = 0; i < args.length; i++) {
       
   120             String arg = args[i];
       
   121             if (arg.equals("-o") && i + 1 < args.length) {
       
   122                 outFile = Path.of(args[++i]);
       
   123             } else if (arg.startsWith("-")) {
       
   124                 throw new IllegalArgumentException(arg);
       
   125             } else if (inFile == null) {
       
   126                 inFile = Path.of(arg);
       
   127             } else {
       
   128                 throw new IllegalArgumentException(arg);
       
   129             }
       
   130         }
       
   131 
       
   132         new Fixup().run(inFile, outFile);
       
   133     }
       
   134 
       
   135     /**
       
   136      * A class to read HTML, copying input to output, modifying
       
   137      * fragments as needed.
       
   138      */
       
   139     class Fixup extends HtmlParser {
       
   140         /** The output stream. */
       
   141         PrintWriter out;
       
   142 
       
   143         /** A stream for reporting errors. */
       
   144         PrintStream err = System.err;
       
   145 
       
   146         /**
       
   147          * Flag to indicate when {@code <main>} is permitted around palpable content.
       
   148          * Set within {@code <body>}; disabled within elements in which {@code <main>}
       
   149          * is not permitted.
       
   150          */
       
   151         boolean allowMain = false;
       
   152 
       
   153         /**
       
   154          * Flag to indicate that {@code <main>} is required.
       
   155          * Set on {@code <body>}; reset when {@code <main>} is either found or generated.
       
   156          */
       
   157         boolean needMain = false;
       
   158 
       
   159         /**
       
   160          * Flag to indicate that {@code </main>} is required.
       
   161          * Set if {@code <main>} is generated.
       
   162          * Reset when a start or end element is found that requires that {@code </main>}
       
   163          * needs to be generated if necessary.
       
   164          */
       
   165         boolean needEndMain = false;
       
   166 
       
   167         /**
       
   168          * Handler for {@code <table>} elements.
       
   169          */
       
   170         Table table;
       
   171 
       
   172         /**
       
   173          * Run the program, copying an input file to an output file.
       
   174          * If the input file is {@code null}, input is read from the standard input.
       
   175          * If the output file is {@code null}, output is written to the standard output.
       
   176          *
       
   177          * @param inFile the input file
       
   178          * @param outFile the output file
       
   179          * @throws IOException if an IO error occurs
       
   180          */
       
   181         void run(Path inFile, Path outFile) throws IOException {
       
   182             try (Writer out = openWriter(outFile)) {
       
   183                 this.out = new PrintWriter(out);
       
   184                 if (inFile != null) {
       
   185                     read(inFile);
       
   186                 } else {
       
   187                     read(new BufferedReader(new InputStreamReader(System.in)));
       
   188                 }
       
   189             }
       
   190         }
       
   191 
       
   192         /**
       
   193          * Returns a writer for a file, or for the standard output if the file is {@code null}.
       
   194          *
       
   195          * @param file the file
       
   196          * @return the writer
       
   197          * @throws IOException if an IO error occurs
       
   198          */
       
   199         private Writer openWriter(Path file) throws IOException {
       
   200             if (file != null) {
       
   201                 return Files.newBufferedWriter(file);
       
   202             } else {
       
   203                 return new BufferedWriter(new OutputStreamWriter(System.out) {
       
   204                     @Override
       
   205                     public void close() throws IOException {
       
   206                         flush();
       
   207                     }
       
   208                 });
       
   209             }
       
   210         }
       
   211 
       
   212         @Override
       
   213         protected void error(Path file, int lineNumber, String message) {
       
   214             err.print(file == null ? "<stdin>" : file);
       
   215             if (lineNumber > 0) {
       
   216                 err.print(":");
       
   217                 err.print(lineNumber);
       
   218             }
       
   219             err.print(": ");
       
   220             err.println(message);
       
   221         }
       
   222 
       
   223         @Override
       
   224         protected void error(Path file, int lineNumber, Throwable t) {
       
   225             error(file, lineNumber, t.toString());
       
   226             t.printStackTrace(err);
       
   227         }
       
   228 
       
   229         /**
       
   230          * The buffer in which input is stored until an appropriate action can be determined.
       
   231          * Using the buffer ensures that the output exactly matches the input, except where
       
   232          * it is intentionally modified.
       
   233          */
       
   234         private StringBuilder buffer = new StringBuilder();
       
   235 
       
   236         @Override
       
   237         public int nextChar() throws IOException {
       
   238             if (ch > 0) {
       
   239                 buffer.append((char) ch);
       
   240             }
       
   241             return super.nextChar();
       
   242         }
       
   243 
       
   244         @Override
       
   245         protected void doctype(String s) {
       
   246             flushBuffer();
       
   247         }
       
   248 
       
   249         @Override
       
   250         protected void startElement(String name, Map<String,String> attrs, boolean selfClosing) {
       
   251             switch (name) {
       
   252                 case "html":
       
   253                     // replace the existing <html> fragment
       
   254                     out.write("<html lang=\"en\">");
       
   255                     buffer.setLength(0);
       
   256                     break;
       
   257 
       
   258                 case "meta":
       
   259                     // update the meta-data for the generator
       
   260                     if (Objects.equals(attrs.get("name"), "generator")) {
       
   261                         out.write(buffer.toString()
       
   262                                 .replaceAll("(content=\"[^\"]*)(\")", "$1,fixuphtml$2"));
       
   263                         buffer.setLength(0);
       
   264                     }
       
   265                     break;
       
   266 
       
   267                 case "article":
       
   268                 case "aside":
       
   269                 case "footer":
       
   270                 case "header":
       
   271                 case "nav":
       
   272                     // starting one of these elements will terminate <main> if one is being
       
   273                     // inserted
       
   274                     if (needEndMain) {
       
   275                         out.write("</main>");
       
   276                         needEndMain = false;
       
   277                     }
       
   278                     // <main> is not permitted within these elements
       
   279                     allowMain = false;
       
   280                     if (name.equals("nav") && Objects.equals(attrs.get("id"), "TOC")) {
       
   281                         out.write(buffer.toString()
       
   282                                 .replaceAll(">$", " title=\"Table Of Contents\">"));
       
   283                         buffer.setLength(0);
       
   284                     }
       
   285                     break;
       
   286 
       
   287                 case "body":
       
   288                     // within <body>, <main> is both permitted and required
       
   289                     allowMain = true;
       
   290                     needMain = true;
       
   291                     break;
       
   292 
       
   293                 case "main":
       
   294                     // an explicit <main> found in the input; no need to add one
       
   295                     needMain = false;
       
   296                     break;
       
   297 
       
   298                 case "table":
       
   299                     // The entire content of a <table> is buffered, until it can be
       
   300                     // determined in which column of the table contains the cells
       
   301                     // that can be used to identify the row.
       
   302                     if (table == null) {
       
   303                         table = new Table();
       
   304                     } else {
       
   305                         // tables containing nested tables are not updated
       
   306                         table.simple = false;
       
   307                     }
       
   308                     table.nestDepth++;
       
   309                     break;
       
   310 
       
   311                 case "thead":
       
   312                 case "tbody":
       
   313                     if (table != null) {
       
   314                         table.endCell();
       
   315                     }
       
   316                     break;
       
   317 
       
   318                 case "tr":
       
   319                     if (table != null) {
       
   320                         table.endCell();
       
   321                         table.nextCellColumnIndex = 0;
       
   322                     }
       
   323                     break;
       
   324 
       
   325                 case "td":
       
   326                 case "th":
       
   327                     if (table != null) {
       
   328                         if (attrs.containsKey("rowspan")
       
   329                                 || attrs.containsKey("colspan")
       
   330                                 || attrs.containsKey("scope")) {
       
   331                             // tables containing spanning cells and tables that already
       
   332                             // contain scope attributes are not updated
       
   333                             table.simple = false;
       
   334                         }
       
   335                         table.startCell(name);
       
   336                     }
       
   337                     break;
       
   338             }
       
   339 
       
   340             // by default, the content is deemed to be palpable content, and so
       
   341             // insert <main> if it is permitted and one is still required,
       
   342             // while also ensuring that it does not appear before <body>
       
   343             if (allowMain && needMain && !name.equals("body")) {
       
   344                 out.write("<main>");
       
   345                 needMain = false;
       
   346                 needEndMain = true;
       
   347             }
       
   348 
       
   349             flushBuffer();
       
   350         }
       
   351 
       
   352         @Override
       
   353         protected void endElement(String name) {
       
   354             switch (name) {
       
   355                 case "article":
       
   356                 case "aside":
       
   357                 case "footer":
       
   358                 case "header":
       
   359                 case "nav":
       
   360                     // The code does not handle nested elements of these kinds, but could.
       
   361                     // So, assuming they are not nested, ending these elements implies
       
   362                     // that <main> is once again permitted.
       
   363                     allowMain = true;
       
   364                     break;
       
   365 
       
   366                 case "body":
       
   367                     // The document is nearly done; insert <main> if needed
       
   368                     if (needEndMain) {
       
   369                         out.write("</main>");
       
   370                         needEndMain = false;
       
   371                     }
       
   372                     break;
       
   373 
       
   374                 case "table":
       
   375                     // if the table is finished, analyze it and write it out
       
   376                     if (table != null) {
       
   377                         if (--table.nestDepth == 0) {
       
   378                             table.add(buffer.toString());
       
   379                             table.write(out);
       
   380                             table = null;
       
   381                             buffer.setLength(0);
       
   382                         }
       
   383                     }
       
   384                     break;
       
   385 
       
   386                 case "thead":
       
   387                 case "tbody":
       
   388                 case "tr":
       
   389                 case "td":
       
   390                 case "th":
       
   391                     // ending any of these elements implicity or explicitly ends the
       
   392                     // current cell
       
   393                     table.endCell();
       
   394                     break;
       
   395 
       
   396             }
       
   397             flushBuffer();
       
   398         }
       
   399 
       
   400         @Override
       
   401         protected void content(String content) {
       
   402             if (table != null) {
       
   403                 table.content(content);
       
   404             } else if (allowMain && needMain && !content.isBlank()) {
       
   405                 // insert <main> if required and if we have palpable content
       
   406                 out.write("<main>");
       
   407                 needMain = false;
       
   408                 needEndMain = true;
       
   409             }
       
   410             flushBuffer();
       
   411         }
       
   412 
       
   413         @Override
       
   414         protected void comment(String comment) {
       
   415             flushBuffer();
       
   416         }
       
   417 
       
   418         /**
       
   419          * Flushes the buffer, either by adding it into a table, if one is
       
   420          * in progress, or by writing it out.
       
   421          */
       
   422         private void flushBuffer() {
       
   423             String s = buffer.toString();
       
   424             if (table != null) {
       
   425                 table.add(s);
       
   426             } else {
       
   427                 out.write(s);
       
   428             }
       
   429             buffer.setLength(0);
       
   430 
       
   431         }
       
   432     }
       
   433 
       
   434     /**
       
   435      * Storage for the content of a {@code <table>} element} until we can determine
       
   436      * whether we should add {@code scope="row"} to the cells in a given column,
       
   437      * and if so, which column.
       
   438      *
       
   439      * The column with the highest number of unique entries is selected;
       
   440      * in case of ambiguity, a column whose heading begins "name" is chosen.
       
   441      *
       
   442      * Only "simple" tables are supported. Tables with any of the following
       
   443      * features are not considered "simple" and will not be modified:
       
   444      * <ul>
       
   445      *     <li>Tables containing nested tables</li>
       
   446      *     <li>Tables containing cells that use "rowspan" and "colspan" attributes</li>
       
   447      *     <li>Tables containing cells that already use "scope" attributes</li>
       
   448      * </ul>
       
   449      */
       
   450     class Table {
       
   451         /**
       
   452          * A fragment of HTML in this table.
       
   453          */
       
   454         class Entry {
       
   455             /** The fragment. */
       
   456             final String html;
       
   457             /** The column for a {@code <td>} fragment, or -1. */
       
   458             final int column;
       
   459 
       
   460             Entry(String html, int column) {
       
   461                 this.html = html;
       
   462                 this.column = column;
       
   463             }
       
   464         }
       
   465 
       
   466         /** Whether or not this is a "simple" table. */
       
   467         boolean simple = true;
       
   468 
       
   469         /** The nesting depth of the current table, within enclosing tables. */
       
   470         int nestDepth;
       
   471 
       
   472         /** A list of the HTML fragments that make up this table. */
       
   473         List<Entry> entries;
       
   474 
       
   475         /** The plain text contents of each column, used to determine the primary column. */
       
   476         List<Set<String>> columnContents;
       
   477 
       
   478         /** The column index of the next cell to be found. */
       
   479         int nextCellColumnIndex;
       
   480 
       
   481         /** A flag to mark the start of a {@code <td>} cell. */
       
   482         boolean startTDCell;
       
   483 
       
   484         /** The column index of the current cell, or -1 if not in a cell. */
       
   485         int currCellColumnIndex;
       
   486 
       
   487         /** The plain text contents of the current column. */
       
   488         Set<String> currColumnContents;
       
   489 
       
   490         /** The plain text content of the current cell. */
       
   491         StringBuilder currCellContent;
       
   492 
       
   493         /** The kind ({@code th} or {@code td}) of the current cell. */
       
   494         String currCellKind;
       
   495 
       
   496         /**
       
   497          * The index of the column, if any, containing a heading beginning "name".
       
   498          * This column is given preferential treatment when deciding the primary column.
       
   499          */
       
   500         int nameColumn;
       
   501 
       
   502         Table() {
       
   503             entries = new ArrayList<>();
       
   504             columnContents = new ArrayList<>();
       
   505         }
       
   506 
       
   507         void startCell(String name) {
       
   508             endCell();
       
   509             startTDCell = name.equals("td");
       
   510             currCellColumnIndex = nextCellColumnIndex++;
       
   511             currColumnContents = getColumn(currCellColumnIndex);
       
   512             currCellContent = new StringBuilder();
       
   513             currCellKind = name;
       
   514         }
       
   515 
       
   516         void endCell() {
       
   517             if (currCellContent != null) {
       
   518                 String c = currCellContent.toString().trim();
       
   519                 if (Objects.equals(currCellKind, "th")
       
   520                         && c.toLowerCase(Locale.US).startsWith("name")) {
       
   521                     nameColumn = currCellColumnIndex;
       
   522                 }
       
   523                 currColumnContents.add(c);
       
   524                 currCellContent = null;
       
   525                 currCellColumnIndex = -1;
       
   526                 currColumnContents = null;
       
   527             }
       
   528         }
       
   529 
       
   530         void content(String content) {
       
   531             if (currCellContent != null) {
       
   532                 currCellContent.append(content);
       
   533             }
       
   534         }
       
   535 
       
   536         void add(String html) {
       
   537             int index = startTDCell ? currCellColumnIndex : -1;
       
   538             entries.add(new Entry(html, index));
       
   539             startTDCell = false;
       
   540         }
       
   541 
       
   542         void write(PrintWriter out) {
       
   543             int max = -1;
       
   544             int maxIndex = -1;
       
   545             int index = 0;
       
   546             for (Set<String> c : columnContents) {
       
   547                 if (c.size() > max || c.size() == max && index == nameColumn) {
       
   548                     max = c.size();
       
   549                     maxIndex = index;
       
   550                 }
       
   551                 index++;
       
   552             }
       
   553             boolean updateEndTd = false;
       
   554             Pattern styleAttr = Pattern.compile("(?<before>.*style=\")(?<style>[^\"]*)(?<after>\".*)");
       
   555             for (Entry e : entries) {
       
   556                 if (simple && e.column == maxIndex) {
       
   557                     String attrs = e.html.substring(3, e.html.length() - 1);
       
   558                     out.write("<th");
       
   559                     Matcher m = styleAttr.matcher(attrs);
       
   560                     if (m.matches()) {
       
   561                         out.write(m.group("before"));
       
   562                         out.write("font-weight: normal; ");
       
   563                         String style = m.group("style");
       
   564                         if (!style.contains("text-align")) {
       
   565                             out.write("text-align: left; ");
       
   566                         }
       
   567                         out.write(style);
       
   568                         out.write(m.group("after"));
       
   569                     } else {
       
   570                         out.write(" style=\"font-weight: normal; text-align:left;\" ");
       
   571                         out.write(attrs);
       
   572                     }
       
   573                     out.write(" scope=\"row\"");
       
   574                     out.write(">");
       
   575                     updateEndTd = true;
       
   576                 } else if (updateEndTd && e.html.equalsIgnoreCase("</td>")) {
       
   577                     out.write("</th>");
       
   578                     updateEndTd = false;
       
   579                 } else {
       
   580                     out.write(e.html);
       
   581                     if (updateEndTd && e.html.regionMatches(true, 0, "<td", 0, 3)) {
       
   582                         // a new cell has been started without explicitly closing the
       
   583                         // cell that was being updated
       
   584                         updateEndTd = false;
       
   585                     }
       
   586                 }
       
   587             }
       
   588         }
       
   589 
       
   590         private Set<String> getColumn(int index) {
       
   591             while (columnContents.size() <= index) {
       
   592                 columnContents.add(new LinkedHashSet<>());
       
   593             }
       
   594 
       
   595             return columnContents.get(index);
       
   596         }
       
   597     }
       
   598 
       
   /**
    * A basic HTML parser.
    * Override the protected methods as needed to get notified of significant items
    * in any file that is read.
    *
    * <p>The parser reads one character at a time into {@link #ch} and calls the
    * notification methods as tags, comments, doctype declarations and character
    * content are recognized. Markup that cannot be parsed is reported through
    * {@link #error(Path, int, String)} as "bad html", except while the parser is
    * inside a {@code script} element.
    */
   abstract class HtmlParser {

       /**
        * Matches the text of a doctype declaration (without the enclosing
        * {@code <!} and {@code >}). Compiled once per parser rather than on
        * every declaration.
        */
       private final Pattern doctypePattern = Pattern.compile("(?is)doctype\\s+html\\s?.*");

       /** The file being read; only set by {@link #read(Path)}. */
       private Path file;
       /** The stream characters are read from. */
       private Reader in;
       /** The most recently read character, or -1 at end of input. */
       protected int ch;
       /** The 1-based number of the line currently being read. */
       private int lineNumber;
       /** Whether the parser is between {@code <script>} and {@code </script>}. */
       private boolean inScript;
       /** Whether an {@code <?xml ...?>} declaration has been seen in the current input. */
       private boolean xml;

       /**
        * Read a file.
        * Any {@code IOException} is reported through
        * {@link #error(Path, int, Throwable)} with a line number of -1.
        * @param file the file
        */
       void read(Path file) {
           try (Reader r = Files.newBufferedReader(file)) {
               this.file = file;
               read(r);
           } catch (IOException e) {
               error(file, -1, e);
           }
       }

       HtmlParser() { }

       /**
        * Read a stream.
        * Character content is delivered to {@link #content(String)} at every
        * newline, before every {@code <}, and once more at end of input if any
        * text is still pending. Exceptions are reported through
        * {@link #error(Path, int, Throwable)} rather than propagated.
        * @param r the stream
        */
       void read(Reader r) {
           try {
               this.in = r;
               StringBuilder content = new StringBuilder();

               startFile(file);
               try {
                   // Reset all per-input state so that a parser can be reused.
                   lineNumber = 1;
                   xml = false;
                   inScript = false;
                   nextChar();

                   while (ch != -1) {
                       if (ch == '<') {
                           content(content.toString());
                           content.setLength(0);
                           html();
                       } else {
                           content.append((char) ch);
                           if (ch == '\n') {
                               content(content.toString());
                               content.setLength(0);
                           }
                           nextChar();
                       }
                   }

                   // Do not lose trailing text that is not followed by a
                   // newline or a tag.
                   if (content.length() > 0) {
                       content(content.toString());
                   }
               } finally {
                   endFile();
               }
           } catch (IOException e) {
               error(file, lineNumber, e);
           } catch (Throwable t) {
               error(file, lineNumber, t);
               t.printStackTrace(System.err);
           }
       }

       /**
        * Returns the number of the line currently being read.
        * @return the line number, starting at 1
        */
       protected int getLineNumber() {
           return lineNumber;
       }

       /**
        * Called when a file has been opened, before parsing begins.
        * This is always the first notification when reading a file.
        * This implementation does nothing.
        *
        * @param file the file
        */
       protected void startFile(Path file) { }

       /**
        * Called when the parser has finished reading a file.
        * This is always the last notification when reading a file,
        * unless any errors occur while closing the file.
        * This implementation does nothing.
        */
       protected void endFile() { }

       /**
        * Called when a doctype declaration is found, at the beginning of the file.
        * This implementation does nothing.
        * @param s the doctype declaration
        */
       protected void doctype(String s) { }

       /**
        * Called when the opening tag of an HTML element is encountered.
        * This implementation does nothing.
        * @param name the name of the tag
        * @param attrs the attributes
        * @param selfClosing whether or not this is a self-closing tag
        */
       protected void startElement(String name, Map<String,String> attrs, boolean selfClosing) { }

       /**
        * Called when the closing tag of an HTML element is encountered.
        * This implementation does nothing.
        * @param name the name of the tag
        */
       protected void endElement(String name) { }

       /**
        * Called for sequences of character content.
        * This implementation does nothing.
        * @param content the character content
        */
       protected void content(String content) { }

       /**
        * Called for sequences of comment.
        * This implementation does nothing.
        * @param comment the comment
        */
       protected void comment(String comment) { }

       /**
        * Called when an error has been encountered.
        * @param file the file being read
        * @param lineNumber the line number of line containing the error
        * @param message a description of the error
        */
       protected abstract void error(Path file, int lineNumber, String message);

       /**
        * Called when an exception has been encountered.
        * @param file the file being read
        * @param lineNumber the line number of the line being read when the exception was found
        * @param t the exception
        */
       protected abstract void error(Path file, int lineNumber, Throwable t);

       /**
        * Reads the next character into {@link #ch}, counting lines as it goes.
        * @return the character read, or -1 at end of input
        * @throws IOException if there is a problem reading the input
        */
       protected int nextChar() throws IOException {
           ch = in.read();
           if (ch == '\n') {
               lineNumber++;
           }
           return ch;
       }

       /**
        * Read the start or end of an HTML tag, or an HTML comment
        * {@literal <identifier attrs> } or {@literal </identifier> }
        * Also handles CDATA sections, doctype declarations and the
        * {@code <?xml ...?>} declaration. Anything else is reported as
        * "bad html" unless the parser is inside a {@code script} element.
        * @throws java.io.IOException if there is a problem reading the file
        */
       protected void html() throws IOException {
           nextChar();

           boolean recognized;
           if (isIdentifierStart((char) ch)) {
               recognized = parseStartTag();
           } else if (ch == '/') {
               recognized = parseEndTag();
           } else if (ch == '!') {
               recognized = parseMarkupDeclaration();
           } else if (ch == '?') {
               recognized = parseXmlDeclaration();
           } else {
               recognized = false;
           }

           if (!recognized && !inScript) {
               error(file, lineNumber, "bad html");
           }
       }

       /** Parses a start tag; on entry {@code ch} is the first character of the name. */
       private boolean parseStartTag() throws IOException {
           String name = readIdentifier().toLowerCase(Locale.US);
           Map<String,String> attrs = htmlAttrs();
           boolean selfClosing = false;
           if (ch == '/') {
               nextChar();
               selfClosing = true;
           }
           if (ch != '>') {
               return false;
           }
           nextChar();
           startElement(name, attrs, selfClosing);
           if (name.equals("script")) {
               inScript = true;
           }
           return true;
       }

       /** Parses an end tag; on entry {@code ch} is the '/' following '<'. */
       private boolean parseEndTag() throws IOException {
           nextChar();
           if (!isIdentifierStart((char) ch)) {
               return false;
           }
           String name = readIdentifier().toLowerCase(Locale.US);
           skipWhitespace();
           if (ch != '>') {
               return false;
           }
           nextChar();
           endElement(name);
           if (name.equals("script")) {
               inScript = false;
           }
           return true;
       }

       /** Parses whatever follows "<!": a comment, a CDATA section or a doctype. */
       private boolean parseMarkupDeclaration() throws IOException {
           nextChar();
           if (ch == '-') {
               nextChar();
               if (ch != '-') {
                   return false;
               }
               nextChar();
               return parseCommentBody();
           } else if (ch == '[') {
               return parseCData();
           } else {
               return parseDoctype();
           }
       }

       /**
        * Reads a comment up to and including the closing "-->" and reports its
        * text; on entry {@code ch} is the first character after "<!--".
        * Returns false if the input ends before the comment is closed.
        */
       private boolean parseCommentBody() throws IOException {
           StringBuilder comment = new StringBuilder();
           while (ch != -1) {
               int dash = 0;
               while (ch == '-') {
                   dash++;
                   // Append as a char: ch is an int, and append(int) would
                   // add the decimal digits "45" instead of '-'.
                   comment.append('-');
                   nextChar();
               }
               // Strictly speaking, a comment should not contain "--"
               // so dash > 2 is an error, dash == 2 implies ch == '>'
               // See http://www.w3.org/TR/html-markup/syntax.html#syntax-comments
               // for more details.
               if (dash >= 2 && ch == '>') {
                   // Drop the two dashes that belong to the closing "-->".
                   comment.setLength(comment.length() - 2);
                   comment(comment.toString());
                   nextChar();
                   return true;
               }
               if (ch == -1) {
                   break;
               }
               comment.append((char) ch);
               nextChar();
           }
           return false;
       }

       /**
        * Skips a CDATA section; on entry {@code ch} is the '[' following "<!".
        * Returns false if the section does not start with "[CDATA[" or if the
        * input ends before the closing "]]>" is found.
        */
       private boolean parseCData() throws IOException {
           for (char expected : "CDATA[".toCharArray()) {
               nextChar();
               if (ch != expected) {
                   return false;
               }
           }
           // Count consecutive ']' so that runs such as "]]]>" still terminate
           // the section, and stop at end of input instead of looping forever.
           int brackets = 0;
           nextChar();
           while (ch != -1) {
               if (ch == ']') {
                   brackets++;
               } else if (ch == '>' && brackets >= 2) {
                   nextChar();
                   return true;
               } else {
                   brackets = 0;
               }
               nextChar();
           }
           return false;
       }

       /**
        * Reads a declaration up to, but not including, the next '>' and reports
        * it if it is a doctype; on entry {@code ch} is the first character
        * after "<!".
        */
       private boolean parseDoctype() throws IOException {
           StringBuilder sb = new StringBuilder();
           while (ch != -1 && ch != '>') {
               sb.append((char) ch);
               nextChar();
           }
           String s = sb.toString();
           if (doctypePattern.matcher(s).matches()) {
               // NOTE(review): the terminating '>' is intentionally left unread,
               // as before, so it is subsequently delivered as character content;
               // subclasses may rely on this -- confirm before changing.
               doctype(s);
               return true;
           }
           return false;
       }

       /** Parses "<?xml ...?>"; on entry {@code ch} is the '?' following '<'. */
       private boolean parseXmlDeclaration() throws IOException {
           for (char expected : "xml".toCharArray()) {
               nextChar();
               if (ch != expected) {
                   return false;
               }
           }
           // ch is still 'l' here, so the remainder of the target name is read
           // as a value-less attribute. The attributes are not used; they are
           // parsed only to advance to the closing "?>".
           htmlAttrs();
           if (ch != '?') {
               return false;
           }
           nextChar();
           if (ch != '>') {
               return false;
           }
           nextChar();
           xml = true;
           return true;
       }

       /**
        * Read a series of HTML attributes, terminated by {@literal > }.
        * Each attribute is of the form {@literal identifier[=value] }.
        * "value" may be unquoted, single-quoted, or double-quoted.
        * Attribute names are converted to lower case; an attribute with no
        * value is mapped to {@code null}.
        */
       private Map<String,String> htmlAttrs() throws IOException {
           Map<String, String> map = new LinkedHashMap<>();
           skipWhitespace();

           while (isIdentifierStart((char) ch)) {
               String name = readAttributeName().toLowerCase(Locale.US);
               skipWhitespace();
               String value = null;
               if (ch == '=') {
                   nextChar();
                   skipWhitespace();
                   if (ch == '\'' || ch == '"') {
                       char quote = (char) ch;
                       nextChar();
                       StringBuilder sb = new StringBuilder();
                       while (ch != -1 && ch != quote) {
                           sb.append((char) ch);
                           nextChar();
                       }
                       value = sb.toString() // hack to replace common entities
                               .replace("&lt;", "<")
                               .replace("&gt;", ">")
                               .replace("&amp;", "&");
                       nextChar();
                   } else {
                       StringBuilder sb = new StringBuilder();
                       while (ch != -1 && !isUnquotedAttrValueTerminator((char) ch)) {
                           sb.append((char) ch);
                           nextChar();
                       }
                       value = sb.toString();
                   }
                   skipWhitespace();
               }
               map.put(name, value);
           }

           return map;
       }

       private boolean isIdentifierStart(char ch) {
           return Character.isUnicodeIdentifierStart(ch);
       }

       /** Reads an element name; on entry {@code ch} is its first character. */
       private String readIdentifier() throws IOException {
           StringBuilder sb = new StringBuilder();
           sb.append((char) ch);
           nextChar();
           while (ch != -1 && Character.isUnicodeIdentifierPart(ch)) {
               sb.append((char) ch);
               nextChar();
           }
           return sb.toString();
       }

       /**
        * Reads an attribute name; on entry {@code ch} is its first character.
        * In addition to identifier characters, '-' is accepted, and ':' is
        * accepted when in XML mode or when the name so far starts with "xml".
        */
       private String readAttributeName() throws IOException {
           StringBuilder sb = new StringBuilder();
           sb.append((char) ch);
           nextChar();
           while ((ch != -1 && Character.isUnicodeIdentifierPart(ch))
                   || ch == '-'
                   || (ch == ':' && (xml || sb.toString().startsWith("xml")))) {
               sb.append((char) ch);
               nextChar();
           }
           return sb.toString();
       }

       private boolean isWhitespace(char ch) {
           return Character.isWhitespace(ch);
       }

       /** Advances past any whitespace, leaving {@code ch} on the first other character. */
       private void skipWhitespace() throws IOException {
           while (isWhitespace((char) ch)) {
               nextChar();
           }
       }

       /** Returns whether the character ends an unquoted attribute value. */
       private boolean isUnquotedAttrValueTerminator(char ch) {
           switch (ch) {
               case '\f': case '\n': case '\r': case '\t':
               case ' ':
               case '"': case '\'': case '`':
               case '=': case '<': case '>':
                   return true;
               default:
                   return false;
           }
       }
   }
       
   983 
       
   984 }