#!/bin/bash
#
# Origin: relpipe-data/examples/xhtml-filesystem-xpath.sh, added by Mercurial
# changeset abbc9bcfbcc4 (parent b862d16a2e9f), Mon Feb 03 22:10:07 2020 +0100.
#
# Walks the current directory tree, evaluates a few XPath expressions on every
# entry found, keeps only the entries whose root element is in the XHTML
# namespace, and shows the five with the most h1 + h2 + h3 headings in a GUI
# window.
#
# Usage: run from the directory that should be scanned; takes no arguments.
# Requires: find, relpipe-in-filesystem, relpipe-tr-sql, relpipe-out-gui.
# NOTE(review): the relpipe-* option semantics described below are read off the
# option names used here -- confirm against the relpipe documentation.

# XML namespace bound to the "h" prefix in the XPath expressions; also used as
# the SQL parameter that selects XHTML documents.
XMLNS_H="http://www.w3.org/1999/xhtml"

# If we set xmlns_h="…", we can omit: --option xmlns_h "$XMLNS_H"
# because XML namespaces can be provided either as an option or as an environment variable.
# Options have precedence.

#######################################
# Lists every entry below the current directory.
# Arguments: none
# Outputs:   NUL-terminated paths (starting with ".") on stdout
#######################################
findFiles() {
	# The explicit "." keeps this working with find implementations that
	# require a starting point; GNU find prints the same paths either way.
	find . -print0
}

#######################################
# Turns the path list on stdin into a relation with XPath-derived attributes.
# Globals:   XMLNS_H (read)
# Arguments: none
# Inputs:    paths on stdin (fed by findFiles)
# Outputs:   relpipe-in-filesystem output on stdout with the attributes name,
#            valid_xml, root_xmlns, title, h1_count, h2_count and h3_count
#######################################
fetchAttributes() {
	relpipe-in-filesystem \
		--parallel 8 \
		--file name \
		--streamlet xpath \
			--option xmlns_h "$XMLNS_H" \
			--option attribute '.' --option mode boolean --as 'valid_xml' \
			--option attribute 'namespace-uri()' --as 'root_xmlns' \
			--option attribute '/h:html/h:head/h:title' --as 'title' \
			--option attribute 'count(//h:h1)' --as 'h1_count' \
			--option attribute 'count(//h:h2)' --as 'h2_count' \
			--option attribute 'count(//h:h3)' --as 'h3_count'
}

#######################################
# Builds the "pages" relation: XHTML documents only, ordered by their total
# number of h1 + h2 + h3 headings (descending), limited to five rows.
# Globals:   XMLNS_H (read; bound to the "?" placeholder in the query)
# Arguments: none
# Inputs:    relational data on stdin containing a "filesystem" relation
# Outputs:   relpipe-tr-sql output on stdout
#######################################
filterAndOrder() {
	relpipe-tr-sql \
		--relation "pages" \
			"SELECT
				name,
				title,
				h1_count,
				h2_count,
				h3_count
			FROM filesystem WHERE root_xmlns = ?
			ORDER BY h1_count + h2_count + h3_count DESC
			LIMIT 5" \
			--type-cast 'h1_count' integer \
			--type-cast 'h2_count' integer \
			--type-cast 'h3_count' integer \
			--parameter "$XMLNS_H"
}

findFiles | fetchAttributes | filterAndOrder | relpipe-out-gui -title "Pages and titles"