relpipe-data/examples/xhtml-filesystem-xpath.sh
branchv_0
changeset 294 abbc9bcfbcc4
equal deleted inserted replaced
293:b862d16a2e9f 294:abbc9bcfbcc4
       
     1 #!/bin/bash
       
     2 
       
     3 XMLNS_H="http://www.w3.org/1999/xhtml" 
       
     4 
       
     5 # If we set xmlns_h="…", we can omit: --option xmlns_h "$XMLNS_H"
       
     6 # because XML namespaces can be provided either as an option or as an environment variable.
       
     7 # Options have precedence.
       
     8 
       
     9 findFiles() {
       
    10 	find -print0;
       
    11 }
       
    12 
       
    13 fetchAttributes() {
       
    14 	relpipe-in-filesystem \
       
    15 		--parallel 8 \
       
    16 		--file name \
       
    17 		--streamlet xpath \
       
    18 			--option xmlns_h "$XMLNS_H" \
       
    19 			--option attribute '.' --option mode boolean --as 'valid_xml'  \
       
    20 			--option attribute 'namespace-uri()'         --as 'root_xmlns' \
       
    21 			--option attribute '/h:html/h:head/h:title'  --as 'title' \
       
    22 			--option attribute 'count(//h:h1)'           --as 'h1_count' \
       
    23 			--option attribute 'count(//h:h2)'           --as 'h2_count' \
       
    24 			--option attribute 'count(//h:h3)'           --as 'h3_count'
       
    25 }
       
    26 
       
    27 filterAndOrder() {
       
    28 	relpipe-tr-sql \
       
    29 		--relation "pages" \
       
    30 			"SELECT
       
    31 				name,
       
    32 				title,
       
    33 				h1_count,
       
    34 				h2_count,
       
    35 				h3_count
       
    36 			FROM filesystem WHERE root_xmlns = ?
       
    37 			ORDER BY h1_count + h2_count + h3_count DESC
       
    38 			LIMIT 5" \
       
    39 			--type-cast 'h1_count' integer \
       
    40 			--type-cast 'h2_count' integer \
       
    41 			--type-cast 'h3_count' integer \
       
    42 			--parameter "$XMLNS_H";
       
    43 }
       
    44 
       
    45 findFiles | fetchAttributes | filterAndOrder | relpipe-out-gui -title "Pages and titles"