relpipe-data/examples/xhtml-filesystem-xpath.sh
author František Kučera <franta-hg@frantovo.cz>
Mon, 03 Feb 2020 22:10:07 +0100
branchv_0
changeset 294 abbc9bcfbcc4
permissions -rwxr-xr-x
Release v0.15 – streamlets, parallel processing
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
294
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     1
#!/bin/bash
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     2
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     3
XMLNS_H="http://www.w3.org/1999/xhtml" 
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     4
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     5
# If we set xmlns_h="…", we can omit: --option xmlns_h "$XMLNS_H"
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     6
# because XML namespaces can be provided either as an option or as an environmental variable.
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     7
# Options have precedence.
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     8
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     9
findFiles() {
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    10
	find -print0;
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    11
}
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    12
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    13
fetchAttributes() {
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    14
	relpipe-in-filesystem \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    15
		--parallel 8 \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    16
		--file name \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    17
		--streamlet xpath \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    18
			--option xmlns_h "$XMLNS_H" \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    19
			--option attribute '.' --option mode boolean --as 'valid_xml'  \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    20
			--option attribute 'namespace-uri()'         --as 'root_xmlns' \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    21
			--option attribute '/h:html/h:head/h:title'  --as 'title' \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    22
			--option attribute 'count(//h:h1)'           --as 'h1_count' \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    23
			--option attribute 'count(//h:h2)'           --as 'h2_count' \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    24
			--option attribute 'count(//h:h3)'           --as 'h3_count'
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    25
}
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    26
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    27
filterAndOrder() {
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    28
	relpipe-tr-sql \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    29
		--relation "pages" \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    30
			"SELECT
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    31
				name,
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    32
				title,
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    33
				h1_count,
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    34
				h2_count,
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    35
				h3_count
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    36
			FROM filesystem WHERE root_xmlns = ?
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    37
			ORDER BY h1_count + h2_count + h3_count DESC
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    38
			LIMIT 5" \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    39
			--type-cast 'h1_count' integer \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    40
			--type-cast 'h2_count' integer \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    41
			--type-cast 'h3_count' integer \
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    42
			--parameter "$XMLNS_H";
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    43
}
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    44
abbc9bcfbcc4 Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    45
findFiles | fetchAttributes | filterAndOrder | relpipe-out-gui -title "Pages and titles"