#!/bin/bash
# XHTML namespace URI. This script passes it to the XPath streamlet as the
# "xmlns_h" option and to the SQL query as the value that root_xmlns must equal.
# Declared read-only because nothing in this script is meant to reassign it.
readonly XMLNS_H="http://www.w3.org/1999/xhtml"
# If we exported xmlns_h="…" as an environment variable, we could omit: --option xmlns_h "$XMLNS_H"
# because XML namespaces can be provided either as an option or as an environment variable.
# Options take precedence over environment variables.
# Print every path found under the current working directory (including "."
# itself) to stdout, each terminated by a NUL byte so that names containing
# spaces or newlines stay unambiguous for the next pipeline stage.
# Arguments: none
# Outputs:   NUL-delimited list of paths on stdout
findFiles() {
  # The start point "." is given explicitly: POSIX find requires a path
  # operand, and defaulting to the current directory is a GNU extension.
  find . -print0
}
# Run relpipe-in-filesystem with the "xpath" streamlet, requesting these
# attributes for each input file (stdin is inherited by the command):
#   name       - requested via --file
#   valid_xml  - XPath '.' evaluated in boolean mode
#   root_xmlns - XPath 'namespace-uri()'
#   title      - XPath '/h:html/h:head/h:title'
#   hN_count   - XPath 'count(//h:hN)' for N = 1, 2, 3
# Globals:   XMLNS_H (read) - passed as the "xmlns_h" streamlet option;
#            presumably this binds the "h" prefix used in the XPath expressions.
# Arguments: none
# Returns:   exit status of relpipe-in-filesystem
fetchAttributes() {
  # Assemble the complete command line in an array so every word stays intact.
  local -a cmd=(
    relpipe-in-filesystem
    --parallel 8
    --file name
    --streamlet xpath
    --option xmlns_h "$XMLNS_H"
    --option attribute '.' --option mode boolean --as 'valid_xml'
    --option attribute 'namespace-uri()' --as 'root_xmlns'
    --option attribute '/h:html/h:head/h:title' --as 'title'
  )

  # One counting attribute per heading level: h1_count, h2_count, h3_count.
  local level
  for level in 1 2 3; do
    cmd+=(--option attribute "count(//h:h${level})" --as "h${level}_count")
  done

  "${cmd[@]}"
}
# Run relpipe-tr-sql with a query over the "filesystem" relation: keep rows
# whose root_xmlns equals the bound parameter, order them by the sum of the
# three heading counts (descending) and keep the first five. The --relation
# argument is "pages" (presumably the name of the resulting relation).
# The three *_count attributes are type-cast to integer.
# Globals:   XMLNS_H (read) - bound to the "?" placeholder via --parameter
# Arguments: none
# Returns:   exit status of relpipe-tr-sql
filterAndOrder() {
  # Build the SQL text one line per argument; printf appends a newline after
  # each, so the final one is stripped to leave the query without a trailing
  # line break.
  local query
  printf -v query '%s\n' \
    'SELECT' \
    'name,' \
    'title,' \
    'h1_count,' \
    'h2_count,' \
    'h3_count' \
    'FROM filesystem WHERE root_xmlns = ?' \
    'ORDER BY h1_count + h2_count + h3_count DESC' \
    'LIMIT 5'
  query=${query%$'\n'}

  # Every heading counter gets the same integer type cast.
  local -a casts=()
  local column
  for column in h1_count h2_count h3_count; do
    casts+=(--type-cast "$column" integer)
  done

  relpipe-tr-sql \
    --relation 'pages' \
    "$query" \
    "${casts[@]}" \
    --parameter "$XMLNS_H"
}
# Main pipeline: list paths, extract the XPath attributes, keep the five
# selected rows and hand them to relpipe-out-gui with the given window title.
findFiles \
  | fetchAttributes \
  | filterAndOrder \
  | relpipe-out-gui -title "Pages and titles"