make/jdk/src/classes/build/tools/fixuppandoc/Main.java
author chegar
Thu, 17 Oct 2019 20:54:25 +0100
branchdatagramsocketimpl-branch
changeset 58679 9c3209ff7550
parent 58678 9cf78a70fa4f
parent 55651 45fc36beb0aa
permissions -rw-r--r--
datagramsocketimpl-branch: merge with default

/*
 * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package build.tools.fixuppandoc;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Fixup HTML generated by pandoc.
 *
 * <h2>{@code <html>}</h2>
 *
 * Replace the existing element with {@code <html lang="en">}, removing references to XML.
 *
 * <h2>{@code <main>}</h2>
 *
 * {@code <main>} is inserted if palpable content is found that is not with a
 * section such as {@code header},  {@code footer},  {@code aside}.
 *
 * {@code </main>} is inserted if {@code <main>} was inserted and a section
 * is started that should not be included in the main section.
 *
 * <h2>Tables: row headings</h2>
 *
 * For simple tables, as typically generated by _pandoc_, determine the column
 * whose contents are unique, and convert the cells in that column to be header
 * cells with {@code scope="row"}. In case of ambiguity, a column containing a
 * {@code <th>} whose contents begin with <em>name</em> is preferred.
 * When converting the cell, the {@code style} attribute will be updated to
 * specify {@code font-weight: normal}, and if there is not already an explicit
 * setting for {@code text-align}, then the style will be updated to include
 * {@code text-align:left;}.
 *
 * These rules do not apply if the table contains any cells that include
 * a setting for the {@code scope} attribute, or if the table contains
 * spanning cells or nested tables.
 *
 * <h2>{@code <meta name="generator">}</h2>
 *
 * Update the content string, to indicate it has been processed by this program.
 *
 * <h2>{@code <nav id="TOC">}</h2>
 *
 * Set attribute {@code title="Table Of Contents"}
 *
 */
public class Main {
    /**
     * Runs the program.
     *
     * <pre>
     *     java build.tools.fixuppandoc.Main [-o output-file] [input-file]
     * </pre>
     *
     * If no input file is specified, the program will read from standard input.
     * If no output file is specified, the program will write to standard output.
     * Any error messages will be written to the standard error stream.
     *
     * @param args the command-line arguments
     */
    public static void main(String... args) {
        try {
            new Main().run(args);
        } catch (IOException | IllegalArgumentException e) {
            System.err.println(e);
            System.exit(1);
        } catch (Throwable t) {
            t.printStackTrace(System.err);
            System.exit(1);
        }
    }

    private void run(String... args) throws IOException {
        Path inFile = null;
        Path outFile = null;

        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if (arg.equals("-o") && i + 1 < args.length) {
                outFile = Path.of(args[++i]);
            } else if (arg.startsWith("-")) {
                throw new IllegalArgumentException(arg);
            } else if (inFile == null) {
                inFile = Path.of(arg);
            } else {
                throw new IllegalArgumentException(arg);
            }
        }

        new Fixup().run(inFile, outFile);
    }

    /**
     * A class to read HTML, copying input to output, modifying
     * fragments as needed.
     */
    class Fixup extends HtmlParser {
        /** The output stream. */
        PrintWriter out;

        /** A stream for reporting errors. */
        PrintStream err = System.err;

        /**
         * Flag to indicate when {@code <main>} is permitted around palpable content.
         * Set within {@code <body>}; disabled within elements in which {@code <main>}
         * is not permitted.
         */
        boolean allowMain = false;

        /**
         * Flag to indicate that {@code <main>} is required.
         * Set on {@code <body>}; reset when {@code <main>} is either found or generated.
         */
        boolean needMain = false;

        /**
         * Flag to indicate that {@code </main>} is required.
         * Set if {@code <main>} is generated.
         * Reset when a start or end element is found that requires that {@code </main>}
         * needs to be generated if necessary.
         */
        boolean needEndMain = false;

        /**
         * Handler for {@code <table>} elements.
         */
        Table table;

        /**
         * Run the program, copying an input file to an output file.
         * If the input file is {@code null}, input is read from the standard input.
         * If the output file is {@code null}, output is written to the standard output.
         *
         * @param inFile the input file
         * @param outFile the output file
         * @throws IOException if an IO error occurs
         */
        void run(Path inFile, Path outFile) throws IOException {
            try (Writer out = openWriter(outFile)) {
                this.out = new PrintWriter(out);
                if (inFile != null) {
                    read(inFile);
                } else {
                    read(new BufferedReader(new InputStreamReader(System.in)));
                }
            }
        }

        /**
         * Returns a writer for a file, or for the standard output if the file is {@code null}.
         *
         * @param file the file
         * @return the writer
         * @throws IOException if an IO error occurs
         */
        private Writer openWriter(Path file) throws IOException {
            if (file != null) {
                return Files.newBufferedWriter(file);
            } else {
                return new BufferedWriter(new OutputStreamWriter(System.out) {
                    @Override
                    public void close() throws IOException {
                        flush();
                    }
                });
            }
        }

        @Override
        protected void error(Path file, int lineNumber, String message) {
            err.print(file == null ? "<stdin>" : file);
            if (lineNumber > 0) {
                err.print(":");
                err.print(lineNumber);
            }
            err.print(": ");
            err.println(message);
        }

        @Override
        protected void error(Path file, int lineNumber, Throwable t) {
            error(file, lineNumber, t.toString());
            t.printStackTrace(err);
        }

        /**
         * The buffer in which input is stored until an appropriate action can be determined.
         * Using the buffer ensures that the output exactly matches the input, except where
         * it is intentionally modified.
         */
        private StringBuilder buffer = new StringBuilder();

        @Override
        public int nextChar() throws IOException {
            if (ch > 0) {
                buffer.append((char) ch);
            }
            return super.nextChar();
        }

        @Override
        protected void doctype(String s) {
            flushBuffer();
        }

        @Override
        protected void startElement(String name, Map<String,String> attrs, boolean selfClosing) {
            switch (name) {
                case "html":
                    // replace the existing <html> fragment
                    out.write("<html lang=\"en\">");
                    buffer.setLength(0);
                    break;

                case "meta":
                    // update the meta-data for the generator
                    if (Objects.equals(attrs.get("name"), "generator")) {
                        out.write(buffer.toString()
                                .replaceAll("(content=\"[^\"]*)(\")", "$1,fixuphtml$2"));
                        buffer.setLength(0);
                    }
                    break;

                case "article":
                case "aside":
                case "footer":
                case "header":
                case "nav":
                    // starting one of these elements will terminate <main> if one is being
                    // inserted
                    if (needEndMain) {
                        out.write("</main>");
                        needEndMain = false;
                    }
                    // <main> is not permitted within these elements
                    allowMain = false;
                    if (name.equals("nav") && Objects.equals(attrs.get("id"), "TOC")) {
                        out.write(buffer.toString()
                                .replaceAll(">$", " title=\"Table Of Contents\">"));
                        buffer.setLength(0);
                    }
                    break;

                case "body":
                    // within <body>, <main> is both permitted and required
                    allowMain = true;
                    needMain = true;
                    break;

                case "main":
                    // an explicit <main> found in the input; no need to add one
                    needMain = false;
                    break;

                case "table":
                    // The entire content of a <table> is buffered, until it can be
                    // determined in which column of the table contains the cells
                    // that can be used to identify the row.
                    if (table == null) {
                        table = new Table();
                    } else {
                        // tables containing nested tables are not updated
                        table.simple = false;
                    }
                    table.nestDepth++;
                    break;

                case "thead":
                case "tbody":
                    if (table != null) {
                        table.endCell();
                    }
                    break;

                case "tr":
                    if (table != null) {
                        table.endCell();
                        table.nextCellColumnIndex = 0;
                    }
                    break;

                case "td":
                case "th":
                    if (table != null) {
                        if (attrs.containsKey("rowspan")
                                || attrs.containsKey("colspan")
                                || attrs.containsKey("scope")) {
                            // tables containing spanning cells and tables that already
                            // contain scope attributes are not updated
                            table.simple = false;
                        }
                        table.startCell(name);
                    }
                    break;
            }

            // by default, the content is deemed to be palpable content, and so
            // insert <main> if it is permitted and one is still required,
            // while also ensuring that it does not appear before <body>
            if (allowMain && needMain && !name.equals("body")) {
                out.write("<main>");
                needMain = false;
                needEndMain = true;
            }

            flushBuffer();
        }

        @Override
        protected void endElement(String name) {
            switch (name) {
                case "article":
                case "aside":
                case "footer":
                case "header":
                case "nav":
                    // The code does not handle nested elements of these kinds, but could.
                    // So, assuming they are not nested, ending these elements implies
                    // that <main> is once again permitted.
                    allowMain = true;
                    break;

                case "body":
                    // The document is nearly done; insert <main> if needed
                    if (needEndMain) {
                        out.write("</main>");
                        needEndMain = false;
                    }
                    break;

                case "table":
                    // if the table is finished, analyze it and write it out
                    if (table != null) {
                        if (--table.nestDepth == 0) {
                            table.add(buffer.toString());
                            table.write(out);
                            table = null;
                            buffer.setLength(0);
                        }
                    }
                    break;

                case "thead":
                case "tbody":
                case "tr":
                case "td":
                case "th":
                    // ending any of these elements implicity or explicitly ends the
                    // current cell
                    table.endCell();
                    break;

            }
            flushBuffer();
        }

        @Override
        protected void content(String content) {
            if (table != null) {
                table.content(content);
            } else if (allowMain && needMain && !content.isBlank()) {
                // insert <main> if required and if we have palpable content
                out.write("<main>");
                needMain = false;
                needEndMain = true;
            }
            flushBuffer();
        }

        @Override
        protected void comment(String comment) {
            flushBuffer();
        }

        /**
         * Flushes the buffer, either by adding it into a table, if one is
         * in progress, or by writing it out.
         */
        private void flushBuffer() {
            String s = buffer.toString();
            if (table != null) {
                table.add(s);
            } else {
                out.write(s);
            }
            buffer.setLength(0);

        }
    }

    /**
     * Storage for the content of a {@code <table>} element} until we can determine
     * whether we should add {@code scope="row"} to the cells in a given column,
     * and if so, which column.
     *
     * The column with the highest number of unique entries is selected;
     * in case of ambiguity, a column whose heading begins "name" is chosen.
     *
     * Only "simple" tables are supported. Tables with any of the following
     * features are not considered "simple" and will not be modified:
     * <ul>
     *     <li>Tables containing nested tables</li>
     *     <li>Tables containing cells that use "rowspan" and "colspan" attributes</li>
     *     <li>Tables containing cells that already use "scope" attributes</li>
     * </ul>
     */
    class Table {
        /**
         * A fragment of HTML in this table.
         */
        class Entry {
            /** The fragment. */
            final String html;
            /** The column for a {@code <td>} fragment, or -1. */
            final int column;

            Entry(String html, int column) {
                this.html = html;
                this.column = column;
            }
        }

        /** Whether or not this is a "simple" table. */
        boolean simple = true;

        /** The nesting depth of the current table, within enclosing tables. */
        int nestDepth;

        /** A list of the HTML fragments that make up this table. */
        List<Entry> entries;

        /** The plain text contents of each column, used to determine the primary column. */
        List<Set<String>> columnContents;

        /** The column index of the next cell to be found. */
        int nextCellColumnIndex;

        /** A flag to mark the start of a {@code <td>} cell. */
        boolean startTDCell;

        /** The column index of the current cell, or -1 if not in a cell. */
        int currCellColumnIndex;

        /** The plain text contents of the current column. */
        Set<String> currColumnContents;

        /** The plain text content of the current cell. */
        StringBuilder currCellContent;

        /** The kind ({@code th} or {@code td}) of the current cell. */
        String currCellKind;

        /**
         * The index of the column, if any, containing a heading beginning "name".
         * This column is given preferential treatment when deciding the primary column.
         */
        int nameColumn;

        Table() {
            entries = new ArrayList<>();
            columnContents = new ArrayList<>();
        }

        void startCell(String name) {
            endCell();
            startTDCell = name.equals("td");
            currCellColumnIndex = nextCellColumnIndex++;
            currColumnContents = getColumn(currCellColumnIndex);
            currCellContent = new StringBuilder();
            currCellKind = name;
        }

        void endCell() {
            if (currCellContent != null) {
                String c = currCellContent.toString().trim();
                if (Objects.equals(currCellKind, "th")
                        && c.toLowerCase(Locale.US).startsWith("name")) {
                    nameColumn = currCellColumnIndex;
                }
                currColumnContents.add(c);
                currCellContent = null;
                currCellColumnIndex = -1;
                currColumnContents = null;
            }
        }

        void content(String content) {
            if (currCellContent != null) {
                currCellContent.append(content);
            }
        }

        void add(String html) {
            int index = startTDCell ? currCellColumnIndex : -1;
            entries.add(new Entry(html, index));
            startTDCell = false;
        }

        void write(PrintWriter out) {
            int max = -1;
            int maxIndex = -1;
            int index = 0;
            for (Set<String> c : columnContents) {
                if (c.size() > max || c.size() == max && index == nameColumn) {
                    max = c.size();
                    maxIndex = index;
                }
                index++;
            }
            boolean updateEndTd = false;
            Pattern styleAttr = Pattern.compile("(?<before>.*style=\")(?<style>[^\"]*)(?<after>\".*)");
            for (Entry e : entries) {
                if (simple && e.column == maxIndex) {
                    String attrs = e.html.substring(3, e.html.length() - 1);
                    out.write("<th");
                    Matcher m = styleAttr.matcher(attrs);
                    if (m.matches()) {
                        out.write(m.group("before"));
                        out.write("font-weight: normal; ");
                        String style = m.group("style");
                        if (!style.contains("text-align")) {
                            out.write("text-align: left; ");
                        }
                        out.write(style);
                        out.write(m.group("after"));
                    } else {
                        out.write(" style=\"font-weight: normal; text-align:left;\" ");
                        out.write(attrs);
                    }
                    out.write(" scope=\"row\"");
                    out.write(">");
                    updateEndTd = true;
                } else if (updateEndTd && e.html.equalsIgnoreCase("</td>")) {
                    out.write("</th>");
                    updateEndTd = false;
                } else {
                    out.write(e.html);
                    if (updateEndTd && e.html.regionMatches(true, 0, "<td", 0, 3)) {
                        // a new cell has been started without explicitly closing the
                        // cell that was being updated
                        updateEndTd = false;
                    }
                }
            }
        }

        private Set<String> getColumn(int index) {
            while (columnContents.size() <= index) {
                columnContents.add(new LinkedHashSet<>());
            }

            return columnContents.get(index);
        }
    }

    /**
     * A basic HTML parser.
     * Override the protected methods as needed to get notified of significant items
     * in any file that is read.
     */
    abstract class HtmlParser {

        private Path file;
        private Reader in;
        protected int ch;
        private int lineNumber;
        private boolean inScript;
        private boolean xml;

        /**
         * Read a file.
         * @param file the file
         */
        void read(Path file) {
            try (Reader r = Files.newBufferedReader(file)) {
                this.file = file;
                read(r);
            } catch (IOException e) {
                error(file, -1, e);
            }
        }

        HtmlParser() { }

        /**
         * Read a stream.
         * @param r the stream
         */
        void read(Reader r) {
            try {
                this.in = r;
                StringBuilder content = new StringBuilder();

                startFile(file);
                try {
                    lineNumber = 1;
                    xml = false;
                    nextChar();

                    while (ch != -1) {
                        if (ch == '<') {
                            content(content.toString());
                            content.setLength(0);
                            html();
                        } else {
                            content.append((char) ch);
                            if (ch == '\n') {
                                content(content.toString());
                                content.setLength(0);
                            }
                            nextChar();
                        }
                    }
                } finally {
                    endFile();
                }
            } catch (IOException e) {
                error(file, lineNumber, e);
            } catch (Throwable t) {
                error(file, lineNumber, t);
                t.printStackTrace(System.err);
            }
        }

        protected int getLineNumber() {
            return lineNumber;
        }

        /**
         * Called when a file has been opened, before parsing begins.
         * This is always the first notification when reading a file.
         * This implementation does nothing.
         *
         * @param file the file
         */
        protected void startFile(Path file) { }

        /**
         * Called when the parser has finished reading a file.
         * This is always the last notification when reading a file,
         * unless any errors occur while closing the file.
         * This implementation does nothing.
         */
        protected void endFile() { }

        /**
         * Called when a doctype declaration is found, at the beginning of the file.
         * This implementation does nothing.
         * @param s the doctype declaration
         */
        protected void doctype(String s) { }

        /**
         * Called when the opening tag of an HTML element is encountered.
         * This implementation does nothing.
         * @param name the name of the tag
         * @param attrs the attribute
         * @param selfClosing whether or not this is a self-closing tag
         */
        protected void startElement(String name, Map<String,String> attrs, boolean selfClosing) { }

        /**
         * Called when the closing tag of an HTML tag is encountered.
         * This implementation does nothing.
         * @param name the name of the tag
         */
        protected void endElement(String name) { }

        /**
         * Called for sequences of character content.
         * @param content the character content
         */
        protected void content(String content) { }

        /**
         * Called for sequences of comment.
         * @param comment the comment
         */
        protected void comment(String comment) { }

        /**
         * Called when an error has been encountered.
         * @param file the file being read
         * @param lineNumber the line number of line containing the error
         * @param message a description of the error
         */
        protected abstract void error(Path file, int lineNumber, String message);

        /**
         * Called when an exception has been encountered.
         * @param file the file being read
         * @param lineNumber the line number of the line being read when the exception was found
         * @param t the exception
         */
        protected abstract void error(Path file, int lineNumber, Throwable t);

        protected int nextChar() throws IOException {
            ch = in.read();
            if (ch == '\n')
                lineNumber++;
            return ch;
        }

        /**
         * Read the start or end of an HTML tag, or an HTML comment
         * {@literal <identifier attrs> } or {@literal </identifier> }
         * @throws java.io.IOException if there is a problem reading the file
         */
        protected void html() throws IOException {
            nextChar();
            if (isIdentifierStart((char) ch)) {
                String name = readIdentifier().toLowerCase(Locale.US);
                Map<String,String> attrs = htmlAttrs();
                if (attrs != null) {
                    boolean selfClosing = false;
                    if (ch == '/') {
                        nextChar();
                        selfClosing = true;
                    }
                    if (ch == '>') {
                        nextChar();
                        startElement(name, attrs, selfClosing);
                        if (name.equals("script")) {
                            inScript = true;
                        }
                        return;
                    }
                }
            } else if (ch == '/') {
                nextChar();
                if (isIdentifierStart((char) ch)) {
                    String name = readIdentifier().toLowerCase(Locale.US);
                    skipWhitespace();
                    if (ch == '>') {
                        nextChar();
                        endElement(name);
                        if (name.equals("script")) {
                            inScript = false;
                        }
                        return;
                    }
                }
            } else if (ch == '!') {
                nextChar();
                if (ch == '-') {
                    nextChar();
                    if (ch == '-') {
                        nextChar();
                        StringBuilder comment = new StringBuilder();
                        while (ch != -1) {
                            int dash = 0;
                            while (ch == '-') {
                                dash++;
                                comment.append(ch);
                                nextChar();
                            }
                            // Strictly speaking, a comment should not contain "--"
                            // so dash > 2 is an error, dash == 2 implies ch == '>'
                            // See http://www.w3.org/TR/html-markup/syntax.html#syntax-comments
                            // for more details.
                            if (dash >= 2 && ch == '>') {
                                comment.setLength(comment.length() - 2);
                                comment(comment.toString());
                                nextChar();
                                return;
                            }

                            comment.append(ch);
                            nextChar();
                        }
                    }
                } else if (ch == '[') {
                    nextChar();
                    if (ch == 'C') {
                        nextChar();
                        if (ch == 'D') {
                            nextChar();
                            if (ch == 'A') {
                                nextChar();
                                if (ch == 'T') {
                                    nextChar();
                                    if (ch == 'A') {
                                        nextChar();
                                        if (ch == '[') {
                                            while (true) {
                                                nextChar();
                                                if (ch == ']') {
                                                    nextChar();
                                                    if (ch == ']') {
                                                        nextChar();
                                                        if (ch == '>') {
                                                            nextChar();
                                                            return;
                                                        }
                                                    }
                                                }
                                            }

                                        }
                                    }
                                }
                            }
                        }
                    }
                } else {
                    StringBuilder sb = new StringBuilder();
                    while (ch != -1 && ch != '>') {
                        sb.append((char) ch);
                        nextChar();
                    }
                    Pattern p = Pattern.compile("(?is)doctype\\s+html\\s?.*");
                    String s = sb.toString();
                    if (p.matcher(s).matches()) {
                        doctype(s);
                        return;
                    }
                }
            } else if (ch == '?') {
                nextChar();
                if (ch == 'x') {
                    nextChar();
                    if (ch == 'm') {
                        nextChar();
                        if (ch == 'l') {
                            Map<String,String> attrs = htmlAttrs();
                            if (ch == '?') {
                                nextChar();
                                if (ch == '>') {
                                    nextChar();
                                    xml = true;
                                    return;
                                }
                            }
                        }
                    }

                }
            }

            if (!inScript) {
                error(file, lineNumber, "bad html");
            }
        }

        /**
         * Read a series of HTML attributes, terminated by {@literal > }.
         * Each attribute is of the form {@literal identifier[=value] }.
         * "value" may be unquoted, single-quoted, or double-quoted.
         */
        private Map<String,String> htmlAttrs() throws IOException {
            Map<String, String> map = new LinkedHashMap<>();
            skipWhitespace();

            while (isIdentifierStart((char) ch)) {
                String name = readAttributeName().toLowerCase(Locale.US);
                skipWhitespace();
                String value = null;
                if (ch == '=') {
                    nextChar();
                    skipWhitespace();
                    if (ch == '\'' || ch == '"') {
                        char quote = (char) ch;
                        nextChar();
                        StringBuilder sb = new StringBuilder();
                        while (ch != -1 && ch != quote) {
                            sb.append((char) ch);
                            nextChar();
                        }
                        value = sb.toString() // hack to replace common entities
                                .replace("&lt;", "<")
                                .replace("&gt;", ">")
                                .replace("&amp;", "&");
                        nextChar();
                    } else {
                        StringBuilder sb = new StringBuilder();
                        while (ch != -1 && !isUnquotedAttrValueTerminator((char) ch)) {
                            sb.append((char) ch);
                            nextChar();
                        }
                        value = sb.toString();
                    }
                    skipWhitespace();
                }
                map.put(name, value);
            }

            return map;
        }

        private boolean isIdentifierStart(char ch) {
            return Character.isUnicodeIdentifierStart(ch);
        }

        private String readIdentifier() throws IOException {
            StringBuilder sb = new StringBuilder();
            sb.append((char) ch);
            nextChar();
            while (ch != -1 && Character.isUnicodeIdentifierPart(ch)) {
                sb.append((char) ch);
                nextChar();
            }
            return sb.toString();
        }

        private String readAttributeName() throws IOException {
            StringBuilder sb = new StringBuilder();
            sb.append((char) ch);
            nextChar();
            while (ch != -1 && Character.isUnicodeIdentifierPart(ch)
                    || ch == '-'
                    || (xml || sb.toString().startsWith("xml")) && ch == ':') {
                sb.append((char) ch);
                nextChar();
            }
            return sb.toString();
        }

        private boolean isWhitespace(char ch) {
            return Character.isWhitespace(ch);
        }

        private void skipWhitespace() throws IOException {
            while (isWhitespace((char) ch)) {
                nextChar();
            }
        }

        private boolean isUnquotedAttrValueTerminator(char ch) {
            switch (ch) {
                case '\f': case '\n': case '\r': case '\t':
                case ' ':
                case '"': case '\'': case '`':
                case '=': case '<': case '>':
                    return true;
                default:
                    return false;
            }
        }
    }

}