relpipe-data/examples/xhtml-filesystem-xpath.sh
branchv_0
changeset 294 abbc9bcfbcc4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/relpipe-data/examples/xhtml-filesystem-xpath.sh	Mon Feb 03 22:10:07 2020 +0100
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+XMLNS_H="http://www.w3.org/1999/xhtml" # XHTML namespace URI: passed as the xmlns_h option for the h: XPath prefix and as the SQL parameter compared with root_xmlns
+
+# If we set xmlns_h="…", we can omit: --option xmlns_h "$XMLNS_H"
+# because XML namespaces can be provided either as an option or as an environment variable.
+# Options have precedence.
+
+findFiles() { # Prints every path under the current directory to stdout, each one terminated by a NUL byte.
+	find . -print0; # explicit "." start point: POSIX/BSD find require one, GNU find merely defaults to it (ShellCheck SC2185)
+}
+
+fetchAttributes() {
+	relpipe-in-filesystem \
+		--parallel 8 \
+		--file name \
+		--streamlet xpath \
+			--option xmlns_h "$XMLNS_H" \
+			--option attribute '.' --option mode boolean --as 'valid_xml'  \
+			--option attribute 'namespace-uri()'         --as 'root_xmlns' \
+			--option attribute '/h:html/h:head/h:title'  --as 'title' \
+			--option attribute 'count(//h:h1)'           --as 'h1_count' \
+			--option attribute 'count(//h:h2)'           --as 'h2_count' \
+			--option attribute 'count(//h:h3)'           --as 'h3_count'
+}
+
+filterAndOrder() {
+	relpipe-tr-sql \
+		--relation "pages" \
+			"SELECT
+				name,
+				title,
+				h1_count,
+				h2_count,
+				h3_count
+			FROM filesystem WHERE root_xmlns = ?
+			ORDER BY h1_count + h2_count + h3_count DESC
+			LIMIT 5" \
+			--type-cast 'h1_count' integer \
+			--type-cast 'h2_count' integer \
+			--type-cast 'h3_count' integer \
+			--parameter "$XMLNS_H";
+}
+
+findFiles | fetchAttributes | filterAndOrder | relpipe-out-gui -title "Pages and titles" # list paths -> extract XPath attributes -> keep the top 5 XHTML pages -> hand the result to relpipe-out-gui