 * <main>} is inserted if palpable content is found that is not within a
 * section such as {@code header}, {@code footer}, {@code aside}.
 *
 * {@code </main>} is inserted if {@code <main>} was inserted and a section
 * is started that should not be included in the main section.
 *
 * <h2>Tables: row headings</h2>
 *
 * {@code scope="row"} is added to the {@code <td>} elements in the first
 * column whose cell contents are all different and therefore which can be
 * used to identify the row. In case of ambiguity, a column containing
 * a {@code <th>} whose contents begin <em>name</em> is preferred.
 *
 *
 * <h2>{@code <meta name="generator">}</h2>
 *
/*
 * Update the content string, to indicate it has been processed by this program.
 */
public class Main {
    /**
     * Matches the text found between {@code <!} and {@code >} in an HTML doctype
     * declaration. Compiled once, rather than for every declaration that is parsed.
     */
    private static final Pattern DOCTYPE_PATTERN =
            Pattern.compile("(?is)doctype\\s+html\\s?.*");

    /**
     * Runs the program.
     *
     * <pre>
     *     java build.tools.fixuppandoc.Main [-o output-file] [input-file]
     * </pre>
     *
     * If no input file is specified, the program will read from standard input.
     * If no output file is specified, the program will write to standard output.
     * Any error messages will be written to the standard error stream.
     *
     * @param args the command-line arguments
     */
    public static void main(String... args) {
        try {
            new Main().run(args);
        } catch (IOException | IllegalArgumentException e) {
            System.err.println(e);
            System.exit(1);
        } catch (Throwable t) {
            t.printStackTrace(System.err);
            System.exit(1);
        }
    }

    /**
     * Decodes the command-line arguments and runs the fix-up.
     *
     * @param args the command-line arguments
     * @throws IOException if an IO error occurs
     * @throws IllegalArgumentException for an unknown option, for {@code -o} without
     *      a value, or for more than one input file
     */
    private void run(String... args) throws IOException {
        Path inFile = null;
        Path outFile = null;

        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if (arg.equals("-o") && i + 1 < args.length) {
                outFile = Path.of(args[++i]);
            } else if (arg.startsWith("-")) {
                // includes a trailing "-o" that has no file name after it
                throw new IllegalArgumentException(arg);
            } else if (inFile == null) {
                inFile = Path.of(arg);
            } else {
                throw new IllegalArgumentException(arg);
            }
        }

        new Fixup().run(inFile, outFile);
    }

    /**
     * A class to read HTML, copying input to output, modifying
     * fragments as needed.
     */
    class Fixup extends HtmlParser {
        /** The output stream. */
        PrintWriter out;

        /** A stream for reporting errors. */
        PrintStream err = System.err;

        /**
         * Flag to indicate when {@code <main>} is permitted around palpable content.
         * Set within {@code <body>}; disabled within elements in which {@code <main>}
         * is not permitted.
         */
        boolean allowMain = false;

        /**
         * Flag to indicate that {@code <main>} is required.
         * Set on {@code <body>}; reset when {@code <main>} is either found or generated.
         */
        boolean needMain = false;

        /**
         * Flag to indicate that {@code </main>} is required.
         * Set if {@code <main>} is generated.
         * Reset when a start or end element is found that requires that {@code </main>}
         * needs to be generated if necessary.
         */
        boolean needEndMain = false;

        /**
         * Handler for {@code <table>} elements.
         */
        Table table;

        /**
         * Run the program, copying an input file to an output file.
         * If the input file is {@code null}, input is read from the standard input.
         * If the output file is {@code null}, output is written to the standard output.
         *
         * @param inFile the input file
         * @param outFile the output file
         * @throws IOException if an IO error occurs
         */
        void run(Path inFile, Path outFile) throws IOException {
            try (Writer out = openWriter(outFile)) {
                this.out = new PrintWriter(out);
                if (inFile != null) {
                    read(inFile);
                } else {
                    // NOTE(review): standard input/output use the platform default charset,
                    // whereas the file-based paths use UTF-8 -- confirm this is intended.
                    read(new BufferedReader(new InputStreamReader(System.in)));
                }
                // PrintWriter never throws; ask it explicitly whether a write failed
                if (this.out.checkError()) {
                    throw new IOException("error writing "
                            + (outFile == null ? "standard output" : outFile.toString()));
                }
            }
        }

        /**
         * Returns a writer for a file, or for the standard output if the file is {@code null}.
         *
         * @param file the file
         * @return the writer
         * @throws IOException if an IO error occurs
         */
        private Writer openWriter(Path file) throws IOException {
            if (file != null) {
                return Files.newBufferedWriter(file);
            } else {
                return new BufferedWriter(new OutputStreamWriter(System.out) {
                    @Override
                    public void close() throws IOException {
                        // do not close System.out; just push out what has been written
                        flush();
                    }
                });
            }
        }

        @Override
        protected void error(Path file, int lineNumber, String message) {
            err.print(file == null ? "<stdin>" : file);
            if (lineNumber > 0) {
                err.print(":");
                err.print(lineNumber);
            }
            err.print(": ");
            err.println(message);
        }

        @Override
        protected void error(Path file, int lineNumber, Throwable t) {
            error(file, lineNumber, t.toString());
            t.printStackTrace(err);
        }

        /**
         * The buffer in which input is stored until an appropriate action can be determined.
         * Using the buffer ensures that the output exactly matches the input, except where
         * it is intentionally modified.
         */
        private StringBuilder buffer = new StringBuilder();

        /**
         * Records the current character in the buffer before advancing to the next one.
         * As a result, the buffer always lags one character behind the parser.
         */
        @Override
        public int nextChar() throws IOException {
            if (ch > 0) {
                buffer.append((char) ch);
            }
            return super.nextChar();
        }

        @Override
        protected void doctype(String s) {
            flushBuffer();
        }

        @Override
        protected void startElement(String name, Map<String,String> attrs, boolean selfClosing) {
            switch (name) {
                case "html":
                    // replace the existing <html> fragment
                    out.write("<html lang=\"en\">");
                    buffer.setLength(0);
                    break;

                case "meta":
                    // update the meta-data for the generator
                    if (Objects.equals(attrs.get("name"), "generator")) {
                        out.write(buffer.toString()
                                .replaceAll("(content=\"[^\"]*)(\")", "$1,fixuphtml$2"));
                        buffer.setLength(0);
                    }
                    break;

                case "article":
                case "aside":
                case "footer":
                case "header":
                case "nav":
                    // starting one of these elements will terminate <main> if one is being
                    // inserted
                    if (needEndMain) {
                        out.write("</main>");
                        needEndMain = false;
                    }
                    // <main> is not permitted within these elements
                    allowMain = false;
                    break;

                case "body":
                    // within <body>, <main> is both permitted and required
                    allowMain = true;
                    needMain = true;
                    break;

                case "main":
                    // an explicit <main> found in the input; no need to add one
                    needMain = false;
                    break;

                case "table":
                    // The entire content of a <table> is buffered, until it can be
                    // determined which column of the table contains the cells
                    // that can be used to identify the row.
                    if (table == null) {
                        table = new Table();
                    } else {
                        // tables containing nested tables are not updated
                        table.simple = false;
                    }
                    table.nestDepth++;
                    break;

                case "thead":
                case "tbody":
                    if (table != null) {
                        table.endCell();
                    }
                    break;

                case "tr":
                    if (table != null) {
                        table.endCell();
                        table.nextCellColumnIndex = 0;
                    }
                    break;

                case "td":
                case "th":
                    if (table != null) {
                        if (attrs.containsKey("rowspan")
                                || attrs.containsKey("colspan")
                                || attrs.containsKey("scope")) {
                            // tables containing spanning cells and tables that already
                            // contain scope attributes are not updated
                            table.simple = false;
                        }
                        table.startCell(name);
                    }
                    break;
            }

            // by default, the content is deemed to be palpable content, and so
            // insert <main> if it is permitted and one is still required,
            // while also ensuring that it does not appear before <body>
            if (allowMain && needMain && !name.equals("body")) {
                out.write("<main>");
                needMain = false;
                needEndMain = true;
            }

            flushBuffer();
        }

        @Override
        protected void endElement(String name) {
            switch (name) {
                case "article":
                case "aside":
                case "footer":
                case "header":
                case "nav":
                    // The code does not handle nested elements of these kinds, but could.
                    // So, assuming they are not nested, ending these elements implies
                    // that <main> is once again permitted.
                    allowMain = true;
                    break;

                case "body":
                    // The document is nearly done; insert </main> if needed
                    if (needEndMain) {
                        out.write("</main>");
                        needEndMain = false;
                    }
                    break;

                case "table":
                    // if the table is finished, analyze it and write it out
                    if (table != null) {
                        if (--table.nestDepth == 0) {
                            table.add(buffer.toString());
                            table.write(out);
                            table = null;
                            buffer.setLength(0);
                        }
                    }
                    break;

                case "thead":
                case "tbody":
                case "tr":
                case "td":
                case "th":
                    // ending any of these elements implicitly or explicitly ends the
                    // current cell; a stray end tag outside any table is just copied
                    if (table != null) {
                        table.endCell();
                    }
                    break;

            }
            flushBuffer();
        }

        @Override
        protected void content(String content) {
            if (table != null) {
                table.content(content);
            } else if (allowMain && needMain && !content.isBlank()) {
                // insert <main> if required and if we have palpable content
                out.write("<main>");
                needMain = false;
                needEndMain = true;
            }
            flushBuffer();
        }

        @Override
        protected void comment(String comment) {
            flushBuffer();
        }

        /**
         * Writes out any input that is still buffered when the end of the input is reached,
         * such as a newline following the final end tag. If a table was never closed,
         * its fragments are written out unmodified.
         */
        @Override
        protected void endFile() {
            flushBuffer();
            if (table != null) {
                table.simple = false;
                table.write(out);
                table = null;
            }
        }

        /**
         * Flushes the buffer, either by adding it into a table, if one is
         * in progress, or by writing it out.
         */
        private void flushBuffer() {
            String s = buffer.toString();
            if (table != null) {
                table.add(s);
            } else {
                out.write(s);
            }
            buffer.setLength(0);

        }
    }

    /**
     * Storage for the content of a {@code <table>} element until we can determine
     * whether we should add {@code scope="row"} to the cells in a given column,
     * and if so, which column.
     *
     * The column with the highest number of unique entries is selected;
     * in case of ambiguity, a column whose heading begins "name" is chosen.
     *
     * Only "simple" tables are supported. Tables with any of the following
     * features are not considered "simple" and will not be modified:
     * <ul>
     * <li>Tables containing nested tables</li>
     * <li>Tables containing cells that use "rowspan" and "colspan" attributes</li>
     * <li>Tables containing cells that already use "scope" attributes</li>
     * </ul>
     */
    class Table {
        /**
         * A fragment of HTML in this table.
         */
        class Entry {
            /** The fragment. */
            final String html;
            /** The column for a {@code <td>} fragment, or -1. */
            final int column;

            Entry(String html, int column) {
                this.html = html;
                this.column = column;
            }
        }

        /** Whether or not this is a "simple" table. */
        boolean simple = true;

        /** The nesting depth of the current table, within enclosing tables. */
        int nestDepth;

        /** A list of the HTML fragments that make up this table. */
        List<Entry> entries;

        /** The plain text contents of each column, used to determine the primary column. */
        List<Set<String>> columnContents;

        /** The column index of the next cell to be found. */
        int nextCellColumnIndex;

        /** A flag to mark the start of a {@code <td>} cell. */
        boolean startTDCell;

        /** The column index of the current cell, or -1 if not in a cell. */
        int currCellColumnIndex;

        /** The plain text contents of the current column. */
        Set<String> currColumnContents;

        /** The plain text content of the current cell. */
        StringBuilder currCellContent;

        /** The kind ({@code th} or {@code td}) of the current cell. */
        String currCellKind;

        /**
         * The index of the column, if any, containing a heading beginning "name",
         * or -1 if there is no such column.
         * This column is given preferential treatment when deciding the primary column.
         */
        int nameColumn = -1;

        Table() {
            entries = new ArrayList<>();
            columnContents = new ArrayList<>();
        }

        /**
         * Starts a new cell in the next column, ending any cell that is still open.
         *
         * @param name the kind of cell, {@code th} or {@code td}
         */
        void startCell(String name) {
            endCell();
            startTDCell = name.equals("td");
            currCellColumnIndex = nextCellColumnIndex++;
            currColumnContents = getColumn(currCellColumnIndex);
            currCellContent = new StringBuilder();
            currCellKind = name;
        }

        /**
         * Ends the current cell, if any, recording its trimmed text in its column.
         */
        void endCell() {
            if (currCellContent != null) {
                String c = currCellContent.toString().trim();
                if (Objects.equals(currCellKind, "th")
                        && c.toLowerCase(Locale.US).startsWith("name")) {
                    nameColumn = currCellColumnIndex;
                }
                currColumnContents.add(c);
                currCellContent = null;
                currCellColumnIndex = -1;
                currColumnContents = null;
            }
        }

        /**
         * Adds text to the current cell; text found outside a cell is ignored.
         *
         * @param content the text
         */
        void content(String content) {
            if (currCellContent != null) {
                currCellContent.append(content);
            }
        }

        /**
         * Records a fragment of the table. The first fragment recorded after the start
         * of a {@code <td>} cell is the start tag itself, and is tagged with its column.
         *
         * @param html the fragment
         */
        void add(String html) {
            int index = startTDCell ? currCellColumnIndex : -1;
            entries.add(new Entry(html, index));
            startTDCell = false;
        }

        /**
         * Writes out the table, adding {@code scope="row"} to the {@code <td>} start tags
         * in the selected column, if the table is "simple" and has at least one column.
         *
         * @param out the stream to which to write the table
         */
        void write(PrintWriter out) {
            int max = -1;
            int maxIndex = -1;
            int index = 0;
            for (Set<String> c : columnContents) {
                if (c.size() > max || c.size() == max && index == nameColumn) {
                    max = c.size();
                    maxIndex = index;
                }
                index++;
            }
            // without any columns, maxIndex is -1, which must not be confused with
            // the -1 that marks fragments that are not the start of a <td> cell
            boolean update = simple && maxIndex >= 0;
            for (Entry e : entries) {
                if (update && e.column == maxIndex && !e.html.isEmpty()) {
                    // insert the attribute before the closing ">" or "/>" of the tag
                    int end = e.html.length() - (e.html.endsWith("/>") ? 2 : 1);
                    out.write(e.html, 0, end);
                    out.write(" scope=\"row\"");
                    out.write(e.html, end, e.html.length() - end);
                } else {
                    out.write(e.html);
                }
            }
        }

        /**
         * Returns the set of cell contents for a column, creating any sets as needed.
         *
         * @param index the column index
         * @return the set
         */
        private Set<String> getColumn(int index) {
            while (columnContents.size() <= index) {
                columnContents.add(new LinkedHashSet<>());
            }

            return columnContents.get(index);
        }
    }

    /**
     * A basic HTML parser.
     * Override the protected methods as needed to get notified of significant items
     * in any file that is read.
     */
    abstract class HtmlParser {

        private Path file;
        private Reader in;
        protected int ch;
        private int lineNumber;
        private boolean inScript;
        private boolean xml;

        /**
         * Read a file.
         * @param file the file
         */
        void read(Path file) {
            try (Reader r = Files.newBufferedReader(file)) {
                this.file = file;
                read(r);
            } catch (IOException e) {
                error(file, -1, e);
            }
        }

        HtmlParser() { }

        /**
         * Read a stream.
         * @param r the stream
         */
        void read(Reader r) {
            try {
                this.in = r;
                StringBuilder content = new StringBuilder();

                startFile(file);
                try {
                    lineNumber = 1;
                    xml = false;
                    nextChar();

                    while (ch != -1) {
                        if (ch == '<') {
                            content(content.toString());
                            content.setLength(0);
                            html();
                        } else {
                            content.append((char) ch);
                            if (ch == '\n') {
                                content(content.toString());
                                content.setLength(0);
                            }
                            nextChar();
                        }
                    }
                    // report any text after the last newline or tag
                    if (content.length() > 0) {
                        content(content.toString());
                    }
                } finally {
                    endFile();
                }
            } catch (IOException e) {
                error(file, lineNumber, e);
            } catch (Throwable t) {
                error(file, lineNumber, t);
                t.printStackTrace(System.err);
            }
        }

        protected int getLineNumber() {
            return lineNumber;
        }

        /**
         * Called when a file has been opened, before parsing begins.
         * This is always the first notification when reading a file.
         * This implementation does nothing.
         *
         * @param file the file
         */
        protected void startFile(Path file) { }

        /**
         * Called when the parser has finished reading a file.
         * This is always the last notification when reading a file,
         * unless any errors occur while closing the file.
         * This implementation does nothing.
         */
        protected void endFile() { }

        /**
         * Called when a doctype declaration is found, at the beginning of the file.
         * This implementation does nothing.
         * @param s the doctype declaration
         */
        protected void doctype(String s) { }

        /**
         * Called when the opening tag of an HTML element is encountered.
         * This implementation does nothing.
         * @param name the name of the tag
         * @param attrs the attribute
         * @param selfClosing whether or not this is a self-closing tag
         */
        protected void startElement(String name, Map<String,String> attrs, boolean selfClosing) { }

        /**
         * Called when the closing tag of an HTML tag is encountered.
         * This implementation does nothing.
         * @param name the name of the tag
         */
        protected void endElement(String name) { }

        /**
         * Called for sequences of character content.
         * @param content the character content
         */
        protected void content(String content) { }

        /**
         * Called for sequences of comment.
         * @param comment the comment
         */
        protected void comment(String comment) { }

        /**
         * Called when an error has been encountered.
         * @param file the file being read
         * @param lineNumber the line number of line containing the error
         * @param message a description of the error
         */
        protected abstract void error(Path file, int lineNumber, String message);

        /**
         * Called when an exception has been encountered.
         * @param file the file being read
         * @param lineNumber the line number of the line being read when the exception was found
         * @param t the exception
         */
        protected abstract void error(Path file, int lineNumber, Throwable t);

        /**
         * Reads the next character into {@code ch}, counting lines as they are read.
         *
         * @return the character, or -1 at the end of the input
         * @throws IOException if there is a problem reading the input
         */
        protected int nextChar() throws IOException {
            ch = in.read();
            if (ch == '\n')
                lineNumber++;
            return ch;
        }

        /**
         * Read the start or end of an HTML tag, or an HTML comment
         * {@literal <identifier attrs> } or {@literal </identifier> }
         * @throws java.io.IOException if there is a problem reading the file
         */
        protected void html() throws IOException {
            nextChar();
            if (isIdentifierStart((char) ch)) {
                String name = readIdentifier().toLowerCase(Locale.US);
                Map<String,String> attrs = htmlAttrs();
                boolean selfClosing = false;
                if (ch == '/') {
                    nextChar();
                    selfClosing = true;
                }
                if (ch == '>') {
                    nextChar();
                    startElement(name, attrs, selfClosing);
                    if (name.equals("script")) {
                        inScript = true;
                    }
                    return;
                }
            } else if (ch == '/') {
                nextChar();
                if (isIdentifierStart((char) ch)) {
                    String name = readIdentifier().toLowerCase(Locale.US);
                    skipWhitespace();
                    if (ch == '>') {
                        nextChar();
                        endElement(name);
                        if (name.equals("script")) {
                            inScript = false;
                        }
                        return;
                    }
                }
            } else if (ch == '!') {
                nextChar();
                if (ch == '-') {
                    nextChar();
                    if (ch == '-') {
                        nextChar();
                        StringBuilder comment = new StringBuilder();
                        while (ch != -1) {
                            int dash = 0;
                            while (ch == '-') {
                                dash++;
                                // the cast matters: append(int) would add decimal digits
                                comment.append((char) ch);
                                nextChar();
                            }
                            // Strictly speaking, a comment should not contain "--"
                            // so dash > 2 is an error, dash == 2 implies ch == '>'
                            // See http://www.w3.org/TR/html-markup/syntax.html#syntax-comments
                            // for more details.
                            if (dash >= 2 && ch == '>') {
                                comment.setLength(comment.length() - 2);
                                comment(comment.toString());
                                nextChar();
                                return;
                            }

                            if (ch != -1) {
                                comment.append((char) ch);
                                nextChar();
                            }
                        }
                    }
                } else if (ch == '[') {
                    if (skipLiteral("CDATA[")) {
                        // skip to the closing "]]>", giving up at the end of the input
                        int closing = 0;
                        nextChar();
                        while (ch != -1) {
                            if (ch == ']') {
                                closing++;
                            } else if (ch == '>' && closing >= 2) {
                                nextChar();
                                return;
                            } else {
                                closing = 0;
                            }
                            nextChar();
                        }
                    }
                } else {
                    StringBuilder sb = new StringBuilder();
                    while (ch != -1 && ch != '>') {
                        sb.append((char) ch);
                        nextChar();
                    }
                    String s = sb.toString();
                    if (DOCTYPE_PATTERN.matcher(s).matches()) {
                        doctype(s);
                        return;
                    }
                }
            } else if (ch == '?') {
                if (skipLiteral("xml")) {
                    // ch is still 'l' here, so the attributes are read as if the first
                    // were named "l..."; the result is discarded, so this is harmless
                    htmlAttrs();
                    if (ch == '?') {
                        nextChar();
                        if (ch == '>') {
                            nextChar();
                            xml = true;
                            return;
                        }
                    }
                }
            }

            if (!inScript) {
                error(file, lineNumber, "bad html");
            }
        }

        /**
         * Reads characters for as long as they match the given text.
         * On return, {@code ch} is the last character that was read: either the last
         * character of the text, or the first character that did not match.
         *
         * @param text the expected characters
         * @return {@code true} if all the characters were found
         * @throws IOException if there is a problem reading the input
         */
        private boolean skipLiteral(String text) throws IOException {
            for (int i = 0; i < text.length(); i++) {
                nextChar();
                if (ch != text.charAt(i)) {
                    return false;
                }
            }
            return true;
        }

        /**
         * Read a series of HTML attributes, terminated by {@literal > }.
         * Each attribute is of the form {@literal identifier[=value] }.
         * "value" may be unquoted, single-quoted, or double-quoted.
         */
        private Map<String,String> htmlAttrs() throws IOException {
            Map<String,String> map = new LinkedHashMap<>();
            skipWhitespace();

            while (isIdentifierStart((char) ch)) {
                String name = readAttributeName().toLowerCase(Locale.US);
                skipWhitespace();
                String value = null;
                if (ch == '=') {
                    nextChar();
                    skipWhitespace();
                    if (ch == '\'' || ch == '"') {
                        char quote = (char) ch;
                        nextChar();
                        StringBuilder sb = new StringBuilder();
                        while (ch != -1 && ch != quote) {
                            sb.append((char) ch);
                            nextChar();
                        }
                        value = sb.toString() // hack to replace common entities
                                .replace("&lt;", "<")
                                .replace("&gt;", ">")
                                .replace("&amp;", "&");
                        nextChar();
                    } else {
                        StringBuilder sb = new StringBuilder();
                        while (ch != -1 && !isUnquotedAttrValueTerminator((char) ch)) {
                            sb.append((char) ch);
                            nextChar();
                        }
                        value = sb.toString();
                    }
                    skipWhitespace();
                }
                map.put(name, value);
            }

            return map;
        }

        private boolean isIdentifierStart(char ch) {
            return Character.isUnicodeIdentifierStart(ch);
        }

        private String readIdentifier() throws IOException {
            StringBuilder sb = new StringBuilder();
            sb.append((char) ch);
            nextChar();
            while (ch != -1 && Character.isUnicodeIdentifierPart(ch)) {
                sb.append((char) ch);
                nextChar();
            }
            return sb.toString();
        }

        private String readAttributeName() throws IOException {
            StringBuilder sb = new StringBuilder();
            sb.append((char) ch);
            nextChar();
            // ':' is only accepted in XML documents or in names beginning "xml"
            while (ch != -1 && Character.isUnicodeIdentifierPart(ch)
                    || ch == '-'
                    || (xml || sb.toString().startsWith("xml")) && ch == ':') {
                sb.append((char) ch);
                nextChar();
            }
            return sb.toString();
        }

        private boolean isWhitespace(char ch) {
            return Character.isWhitespace(ch);
        }

        private void skipWhitespace() throws IOException {
            while (isWhitespace((char) ch)) {
                nextChar();
            }
        }

        private boolean isUnquotedAttrValueTerminator(char ch) {
            switch (ch) {
                case '\f': case '\n': case '\r': case '\t':
                case ' ':
                case '"': case '\'': case '`':
                case '=': case '<': case '>':
                    return true;
                default:
                    return false;
            }
        }
    }

}