relpipe-data/examples/xhtml-filesystem-xpath.sh
author František Kučera <franta-hg@frantovo.cz>
Mon, 03 Feb 2020 22:10:07 +0100
branch v_0
changeset 294 abbc9bcfbcc4
permissions -rwxr-xr-x
Release v0.15 – streamlets, parallel processing

#!/bin/bash

# XHTML namespace URI. It is passed to the xpath streamlet as the xmlns_h option
# (the XPath expressions below use the "h:" prefix) and to the SQL query as its parameter.
XMLNS_H='http://www.w3.org/1999/xhtml'

# If we set the environment variable xmlns_h="…", we can omit: --option xmlns_h "$XMLNS_H"
# because XML namespaces can be provided either as an option or as an environment variable.
# Options take precedence.

# Lists the current directory and everything below it.
# Outputs: every path found, each terminated by a NUL byte, on standard output.
# Returns: the exit status of find.
findFiles() {
	# The start path "." is given explicitly: GNU find assumes it when omitted,
	# but POSIX and BSD/macOS find require a path operand and fail without one.
	find . -print0
}

# Runs relpipe-in-filesystem (8 parallel workers) on whatever arrives on standard input
# and asks its xpath streamlet for these attributes per file:
#   valid_xml   '.' evaluated in boolean mode
#   root_xmlns  namespace-uri()
#   title       /h:html/h:head/h:title
#   hN_count    count(//h:hN) for N = 1, 2, 3
# Globals: XMLNS_H (read) – namespace URI passed as the xmlns_h option.
# Returns: the exit status of relpipe-in-filesystem.
fetchAttributes() {
	# Streamlet selection and the namespace option come first.
	local -a xpath=(--streamlet xpath --option xmlns_h "$XMLNS_H")

	# Each attribute is an XPath expression followed by its output name (--as).
	xpath+=(--option attribute '.' --option mode boolean --as 'valid_xml')
	xpath+=(--option attribute 'namespace-uri()' --as 'root_xmlns')
	xpath+=(--option attribute '/h:html/h:head/h:title' --as 'title')

	# One heading counter per level: h1_count, h2_count, h3_count.
	local level
	for level in 1 2 3; do
		xpath+=(--option attribute "count(//h:h${level})" --as "h${level}_count")
	done

	relpipe-in-filesystem --parallel 8 --file name "${xpath[@]}"
}

# Runs relpipe-tr-sql on standard input to build the relation "pages":
# rows of "filesystem" whose root_xmlns equals the query parameter, ordered by the
# sum of the three heading counts (descending) and limited to 5 rows.
# Globals: XMLNS_H (read) – bound to the "?" placeholder via --parameter.
# Returns: the exit status of relpipe-tr-sql.
filterAndOrder() {
	# SQL text kept exactly as written, including its line breaks and indentation.
	local query="SELECT
				name,
				title,
				h1_count,
				h2_count,
				h3_count
			FROM filesystem WHERE root_xmlns = ?
			ORDER BY h1_count + h2_count + h3_count DESC
			LIMIT 5"

	# The three heading counters are cast to integer.
	local -a casts=()
	local column
	for column in h1_count h2_count h3_count; do
		casts+=(--type-cast "$column" integer)
	done

	relpipe-tr-sql --relation 'pages' "$query" "${casts[@]}" --parameter "$XMLNS_H"
}

# Whole example as one pipeline: list paths, extract the XPath attributes,
# keep the five XHTML pages with the most headings, and pass them to relpipe-out-gui.
findFiles \
	| fetchAttributes \
	| filterAndOrder \
	| relpipe-out-gui -title "Pages and titles"