} element} until we can determine
+ * whether we should add {@code scope="row"} to the cells in a given column,
+ * and if so, which column.
+ *
+ * The column with the highest number of unique entries is selected;
+ * in case of ambiguity, a column whose heading begins "name" is chosen.
+ *
+ * Only "simple" tables are supported. Tables with any of the following
+ * features are not considered "simple" and will not be modified:
+ *
+ * - Tables containing nested tables
+ * - Tables containing cells that use "rowspan" and "colspan" attributes
+ * - Tables containing cells that already use "scope" attributes
+ *
+ */
class Table {
    // The state of a single HTML table, collected fragment by fragment while
    // the table is being read. The fields are package-private and mutable on
    // purpose: they are read and updated by the code that drives the parse.

    /**
     * A fragment of HTML in this table.
     */
    class Entry {
        /** The fragment. */
        final String html;
        /** The column for a {@code <td>} fragment, or -1. */
        final int column;

        Entry(String html, int column) {
            this.html = html;
            this.column = column;
        }
    }

    /** Whether or not this is a "simple" table. */
    boolean simple = true;

    /** The nesting depth of the current table, within enclosing tables. */
    int nestDepth;

    /** A list of the HTML fragments that make up this table. */
    List<Entry> entries;

    /** The plain text contents of each column, used to determine the primary column. */
    List<Set<String>> columnContents;

    /** The column index of the next cell to be found. */
    int nextCellColumnIndex;

    /** A flag to mark the start of a {@code <td>} cell. */
    boolean startTDCell;

    /** The column index of the current cell, or -1 if not in a cell. */
    int currCellColumnIndex = -1;

    /** The plain text contents of the current column. */
    Set<String> currColumnContents;

    /** The plain text content of the current cell, or null if not in a cell. */
    StringBuilder currCellContent;

    /** The kind ({@code th} or {@code td}) of the current cell. */
    String currCellKind;

    /**
     * The index of the column, if any, containing a heading beginning "name",
     * or -1 if no such heading has been seen.
     * This column is given preferential treatment when deciding the primary column.
     */
    int nameColumn = -1;

    Table() {
        entries = new ArrayList<>();
        columnContents = new ArrayList<>();
    }

    /**
     * Starts a new cell, implicitly ending any cell that is still open.
     * The cell is assigned the next column index.
     *
     * @param name the kind of cell: {@code "th"} or {@code "td"}
     */
    void startCell(String name) {
        endCell();
        startTDCell = name.equals("td");
        currCellColumnIndex = nextCellColumnIndex++;
        currColumnContents = getColumn(currCellColumnIndex);
        currCellContent = new StringBuilder();
        currCellKind = name;
    }

    /**
     * Ends the current cell, if any, recording its trimmed text in the set of
     * contents for its column. A {@code th} cell whose text begins with "name"
     * (ignoring case) marks its column as the "name" column.
     * Does nothing if no cell is open.
     */
    void endCell() {
        if (currCellContent == null) {
            return;
        }
        String text = currCellContent.toString().trim();
        if (Objects.equals(currCellKind, "th")
                && text.toLowerCase(Locale.US).startsWith("name")) {
            nameColumn = currCellColumnIndex;
        }
        currColumnContents.add(text);
        currCellContent = null;
        currCellColumnIndex = -1;
        currColumnContents = null;
    }

    /**
     * Appends plain text to the content of the current cell.
     * Text found outside of any cell is ignored.
     *
     * @param content the text
     */
    void content(String content) {
        if (currCellContent != null) {
            currCellContent.append(content);
        }
    }

    /**
     * Records a fragment of HTML for this table. The first fragment recorded
     * after {@code startCell("td")} is tagged with the column index of that
     * cell; every other fragment is tagged with -1.
     *
     * @param html the fragment
     */
    void add(String html) {
        int index = startTDCell ? currCellColumnIndex : -1;
        entries.add(new Entry(html, index));
        startTDCell = false;
    }

    /**
     * Writes out the fragments of this table. If the table is "simple" and a
     * primary column was found, the fragments tagged with that column have
     * {@code scope="row"} inserted before their final character; all other
     * fragments are written unchanged.
     *
     * @param out the writer to which to write the fragments
     */
    void write(PrintWriter out) {
        int primary = simple ? primaryColumn() : -1;
        for (Entry e : entries) {
            // The explicit "primary >= 0" check matters: fragments that are not
            // the start of a <td> cell carry column -1, and must never be
            // mistaken for the primary column of a table that has no cells.
            if (primary >= 0 && e.column == primary) {
                // Drop the final character of the fragment (the closing '>' of
                // the start tag) and re-close the tag after the new attribute.
                out.write(e.html.substring(0, e.html.length() - 1));
                out.write(" scope=\"row\">");
            } else {
                out.write(e.html);
            }
        }
    }

    /**
     * Returns the index of the column with the largest number of distinct
     * entries, or -1 if no cells were found. The first such column wins,
     * unless the "name" column ties with it, in which case the "name" column
     * is chosen.
     */
    private int primaryColumn() {
        int max = -1;
        int maxIndex = -1;
        int index = 0;
        for (Set<String> column : columnContents) {
            int size = column.size();
            if (size > max || (size == max && index == nameColumn)) {
                max = size;
                maxIndex = index;
            }
            index++;
        }
        return maxIndex;
    }

    /**
     * Returns the set of contents for a column, creating empty sets for this
     * and any preceding columns that have not been seen before.
     */
    private Set<String> getColumn(int index) {
        while (columnContents.size() <= index) {
            columnContents.add(new LinkedHashSet<>());
        }
        return columnContents.get(index);
    }
}
+
/**
 * A basic HTML parser.
 * Override the protected methods as needed to get notified of significant items
 * in any file that is read.
 */
abstract class HtmlParser {

    /** The file being read, or null if reading directly from a stream. */
    private Path file;
    private Reader in;
    /** The most recently read character, or -1 at the end of the input. */
    protected int ch;
    private int lineNumber;
    /** Whether the parser is inside a script element, where "bad html" is not reported. */
    private boolean inScript;
    /** Whether an XML declaration has been seen in the current input. */
    private boolean xml;

    /**
     * Recognizes the text of a doctype declaration.
     * Compiled once per parser rather than once per declaration. It is an
     * instance field rather than a static one so that this class still
     * compiles if it is declared as an inner class.
     */
    private final Pattern doctypePattern = Pattern.compile("(?is)doctype\\s+html\\s?.*");

    /**
     * Read a file.
     * @param file the file
     */
    void read(Path file) {
        try (Reader r = Files.newBufferedReader(file)) {
            this.file = file;
            read(r);
        } catch (IOException e) {
            error(file, -1, e);
        }
    }

    HtmlParser() { }

    /**
     * Read a stream.
     * Character content is reported line by line, and before each item of markup.
     * @param r the stream
     */
    void read(Reader r) {
        try {
            this.in = r;
            StringBuilder content = new StringBuilder();

            startFile(file);
            try {
                lineNumber = 1;
                xml = false;
                // Do not let an unclosed script element in a previous input
                // suppress error reports for this one.
                inScript = false;
                nextChar();

                while (ch != -1) {
                    if (ch == '<') {
                        content(content.toString());
                        content.setLength(0);
                        html();
                    } else {
                        content.append((char) ch);
                        if (ch == '\n') {
                            content(content.toString());
                            content.setLength(0);
                        }
                        nextChar();
                    }
                }

                // Deliver any content after the last newline or markup, which
                // would otherwise be silently dropped.
                if (content.length() > 0) {
                    content(content.toString());
                }
            } finally {
                endFile();
            }
        } catch (IOException e) {
            error(file, lineNumber, e);
        } catch (Throwable t) {
            error(file, lineNumber, t);
            t.printStackTrace(System.err);
        }
    }

    /**
     * Returns the number of the line currently being read, counting from 1.
     * @return the line number
     */
    protected int getLineNumber() {
        return lineNumber;
    }

    /**
     * Called when a file has been opened, before parsing begins.
     * This is always the first notification when reading a file.
     * This implementation does nothing.
     *
     * @param file the file, or null if reading directly from a stream
     */
    protected void startFile(Path file) { }

    /**
     * Called when the parser has finished reading a file.
     * This is always the last notification when reading a file,
     * unless any errors occur while closing the file.
     * This implementation does nothing.
     */
    protected void endFile() { }

    /**
     * Called when a doctype declaration is found, at the beginning of the file.
     * This implementation does nothing.
     * @param s the doctype declaration
     */
    protected void doctype(String s) { }

    /**
     * Called when the opening tag of an HTML element is encountered.
     * This implementation does nothing.
     * @param name the name of the tag, in lower case
     * @param attrs the attributes, keyed by lower-case name; the value is null
     *              for an attribute given without a value
     * @param selfClosing whether or not this is a self-closing tag
     */
    protected void startElement(String name, Map<String, String> attrs, boolean selfClosing) { }

    /**
     * Called when the closing tag of an HTML tag is encountered.
     * This implementation does nothing.
     * @param name the name of the tag, in lower case
     */
    protected void endElement(String name) { }

    /**
     * Called for sequences of character content.
     * This implementation does nothing.
     * @param content the character content; may be empty
     */
    protected void content(String content) { }

    /**
     * Called for sequences of comment.
     * This implementation does nothing.
     * @param comment the text of the comment, without the enclosing delimiters
     */
    protected void comment(String comment) { }

    /**
     * Called when an error has been encountered.
     * @param file the file being read
     * @param lineNumber the line number of line containing the error
     * @param message a description of the error
     */
    protected abstract void error(Path file, int lineNumber, String message);

    /**
     * Called when an exception has been encountered.
     * @param file the file being read
     * @param lineNumber the line number of the line being read when the exception was found
     * @param t the exception
     */
    protected abstract void error(Path file, int lineNumber, Throwable t);

    /**
     * Reads the next character into {@link #ch}, updating the line number.
     * @return the character, or -1 at the end of the input
     * @throws IOException if there is a problem reading the input
     */
    protected int nextChar() throws IOException {
        ch = in.read();
        if (ch == '\n') {
            lineNumber++;
        }
        return ch;
    }

    /**
     * Read the start or end of an HTML tag, an HTML comment, a CDATA section,
     * a doctype declaration or an XML declaration. On entry, the current
     * character is the opening {@literal <}.
     * Anything else is reported as "bad html", unless within a script element.
     * @throws java.io.IOException if there is a problem reading the file
     */
    protected void html() throws IOException {
        nextChar();
        boolean ok;
        if (isIdentifierStart((char) ch)) {
            ok = readStartTag();
        } else if (ch == '/') {
            nextChar();
            ok = readEndTag();
        } else if (ch == '!') {
            nextChar();
            ok = readDeclaration();
        } else if (ch == '?') {
            nextChar();
            ok = readXmlDeclaration();
        } else {
            ok = false;
        }

        if (!ok && !inScript) {
            error(file, lineNumber, "bad html");
        }
    }

    /** Reads the name and attributes of a start tag; returns false if malformed. */
    private boolean readStartTag() throws IOException {
        String name = readIdentifier().toLowerCase(Locale.US);
        Map<String, String> attrs = htmlAttrs();
        boolean selfClosing = false;
        if (ch == '/') {
            nextChar();
            selfClosing = true;
        }
        if (ch != '>') {
            return false;
        }
        nextChar();
        startElement(name, attrs, selfClosing);
        if (name.equals("script")) {
            inScript = true;
        }
        return true;
    }

    /** Reads an end tag, following the initial "/"; returns false if malformed. */
    private boolean readEndTag() throws IOException {
        if (!isIdentifierStart((char) ch)) {
            return false;
        }
        String name = readIdentifier().toLowerCase(Locale.US);
        skipWhitespace();
        if (ch != '>') {
            return false;
        }
        nextChar();
        endElement(name);
        if (name.equals("script")) {
            inScript = false;
        }
        return true;
    }

    /** Reads a comment, CDATA section or doctype, following the initial "!". */
    private boolean readDeclaration() throws IOException {
        if (ch == '-') {
            nextChar();
            if (ch != '-') {
                return false;
            }
            nextChar();
            return readComment();
        } else if (ch == '[') {
            nextChar();
            return expect("CDATA[") && skipCData();
        } else {
            return readDoctype();
        }
    }

    /**
     * Reads the body of a comment, up to and including the closing delimiter.
     * Returns false if the end of the input is found first.
     */
    private boolean readComment() throws IOException {
        StringBuilder text = new StringBuilder();
        while (ch != -1) {
            int dash = 0;
            while (ch == '-') {
                dash++;
                // The cast is essential: append(int) would append the decimal
                // value of the character instead of the character itself.
                text.append((char) ch);
                nextChar();
            }
            // Strictly speaking, a comment should not contain "--"
            // so dash > 2 is an error, dash == 2 implies ch == '>'
            // See http://www.w3.org/TR/html-markup/syntax.html#syntax-comments
            // for more details.
            if (dash >= 2 && ch == '>') {
                // Remove the two dashes that belong to the closing delimiter.
                text.setLength(text.length() - 2);
                comment(text.toString());
                nextChar();
                return true;
            }
            if (ch == -1) {
                break;
            }
            text.append((char) ch);
            nextChar();
        }
        return false;
    }

    /**
     * Skips the body of a CDATA section, up to and including the closing
     * "]]" + ">". Returns false if the end of the input is found first.
     */
    private boolean skipCData() throws IOException {
        // Count consecutive ']' characters, so that a terminator preceded by
        // additional ']' characters is still recognized.
        int brackets = 0;
        while (ch != -1) {
            if (ch == ']') {
                brackets++;
            } else if (ch == '>' && brackets >= 2) {
                nextChar();
                return true;
            } else {
                brackets = 0;
            }
            nextChar();
        }
        return false;
    }

    /**
     * Reads the text of a declaration up to, but not including, the closing
     * {@literal >}, and reports it if it is a doctype declaration.
     */
    private boolean readDoctype() throws IOException {
        StringBuilder sb = new StringBuilder();
        while (ch != -1 && ch != '>') {
            sb.append((char) ch);
            nextChar();
        }
        String s = sb.toString();
        if (doctypePattern.matcher(s).matches()) {
            doctype(s);
            // NOTE(review): the closing '>' is deliberately left unconsumed, as
            // in the original code, so it is subsequently reported as character
            // content. Subclasses may rely on this; confirm before changing.
            return true;
        }
        return false;
    }

    /** Reads an XML declaration, following the initial "?". */
    private boolean readXmlDeclaration() throws IOException {
        if (ch != 'x') {
            return false;
        }
        nextChar();
        if (ch != 'm') {
            return false;
        }
        nextChar();
        if (ch != 'l') {
            return false;
        }
        // The 'l' is left as the current character, so it is read as the start
        // of the first attribute name. This preserves the original behavior,
        // including the acceptance of targets such as "xml-stylesheet".
        htmlAttrs();
        if (ch != '?') {
            return false;
        }
        nextChar();
        if (ch != '>') {
            return false;
        }
        nextChar();
        xml = true;
        return true;
    }

    /**
     * Consumes the given characters if they appear next in the input.
     * Stops at, without consuming, the first character that does not match.
     */
    private boolean expect(String s) throws IOException {
        for (int i = 0; i < s.length(); i++) {
            if (ch != s.charAt(i)) {
                return false;
            }
            nextChar();
        }
        return true;
    }

    /**
     * Read a series of HTML attributes, terminated by {@literal > }.
     * Each attribute is of the form {@literal identifier[=value] }.
     * "value" may be unquoted, single-quoted, or double-quoted.
     * The result is never null; an attribute with no value maps to null.
     */
    private Map<String, String> htmlAttrs() throws IOException {
        Map<String, String> map = new LinkedHashMap<>();
        skipWhitespace();

        while (isIdentifierStart((char) ch)) {
            String name = readAttributeName().toLowerCase(Locale.US);
            skipWhitespace();
            String value = null;
            if (ch == '=') {
                nextChar();
                skipWhitespace();
                if (ch == '\'' || ch == '"') {
                    char quote = (char) ch;
                    nextChar();
                    StringBuilder sb = new StringBuilder();
                    while (ch != -1 && ch != quote) {
                        sb.append((char) ch);
                        nextChar();
                    }
                    // Hack to replace common entities; "&amp;" must be last so
                    // that "&amp;lt;" decodes to "&lt;" and not to "<".
                    value = sb.toString()
                            .replace("&lt;", "<")
                            .replace("&gt;", ">")
                            .replace("&amp;", "&");
                    nextChar(); // skip the closing quote
                } else {
                    StringBuilder sb = new StringBuilder();
                    while (ch != -1 && !isUnquotedAttrValueTerminator((char) ch)) {
                        sb.append((char) ch);
                        nextChar();
                    }
                    value = sb.toString();
                }
                skipWhitespace();
            }
            map.put(name, value);
        }

        return map;
    }

    private boolean isIdentifierStart(char ch) {
        return Character.isUnicodeIdentifierStart(ch);
    }

    /** Reads an identifier, starting with the current character. */
    private String readIdentifier() throws IOException {
        StringBuilder sb = new StringBuilder();
        sb.append((char) ch);
        nextChar();
        while (ch != -1 && Character.isUnicodeIdentifierPart(ch)) {
            sb.append((char) ch);
            nextChar();
        }
        return sb.toString();
    }

    /**
     * Reads an attribute name, starting with the current character.
     * In addition to identifier characters, a name may contain '-', and may
     * contain ':' in XML input or when the name begins with "xml".
     */
    private String readAttributeName() throws IOException {
        StringBuilder sb = new StringBuilder();
        sb.append((char) ch);
        nextChar();
        while ((ch != -1 && Character.isUnicodeIdentifierPart(ch))
                || ch == '-'
                || ((xml || sb.toString().startsWith("xml")) && ch == ':')) {
            sb.append((char) ch);
            nextChar();
        }
        return sb.toString();
    }

    private boolean isWhitespace(char ch) {
        return Character.isWhitespace(ch);
    }

    private void skipWhitespace() throws IOException {
        while (isWhitespace((char) ch)) {
            nextChar();
        }
    }

    /** Returns whether a character ends an unquoted attribute value. */
    private boolean isUnquotedAttrValueTerminator(char ch) {
        switch (ch) {
            case '\f': case '\n': case '\r': case '\t':
            case ' ':
            case '"': case '\'': case '`':
            case '=': case '<': case '>':
                return true;
            default:
                return false;
        }
    }
}
+
+}
| |