#!/bin/bash
#
# html-tagsoup-1.sh -- example: turn a sloppy ("tag soup") HTML page into
# several relations with relpipe-in-htmltable and XPath expressions.
#
# Provenance: reconstructed from the Mercurial diff cc60c8dd7924 -> 5bc2bb8b7946
# (Mon Feb 21 00:43:11 2022 +0100), which added this file as
# relpipe-data/examples/html-tagsoup-1.sh.
#
# Usage: ./html-tagsoup-1.sh
#   Needs relpipe-in-htmltable on PATH, plus relpipe-out-tabular when the
#   output goes to a terminal.

# Sample page that stands in for a downloaded document.
#
# NOTE(review): the markup of this sample was lost before review -- only the
# visible text survived. The XPath queries in extract_relations expect <li>
# items, <a href="..."> links, an <a class="mp3" href="..."> link and a
# trailing comment node. Restore the original tags from the upstream changeset
# before relying on the output of this example.
HTML='

Our company is focused on:

Visit our front page and check the news!!!

download free MP3 now

'

#######################################
# Emit the HTML document to be processed.
# Globals:   HTML (read)
# Arguments: none
# Outputs:   the page followed by a single newline on stdout
#######################################
fetch_html() {
  # there might be a wget/curl call to download a fresh version of the web page
  # printf instead of echo: the text is printed verbatim even when it happens
  # to look like an echo option (e.g. "-n").
  printf '%s\n' "$HTML"
}

#######################################
# Convert the HTML page arriving on stdin into four relations.
#
#   field_of_business  one record per <li>: its 1-based position among all
#                      <li> elements in document order, its raw text and its
#                      whitespace-normalized text
#   hyperlink          one record per <a>: href, link text, and an 'xpath'
#                      attribute evaluated with '--mode xpath'
#                      (NOTE(review): presumably the XPath of the node rather
#                      than its text -- confirm in the relpipe-in-htmltable
#                      documentation)
#   download_token     one record per <a class="mp3">: the href starting at
#                      its 15th character
#   hidden_footer      one record per comment node that has no element after
#                      it in the document: its whitespace-normalized text
#
# Arguments: none
# Outputs:   whatever relpipe-in-htmltable writes to stdout
# Returns:   exit status of relpipe-in-htmltable
#######################################
extract_relations() {
  relpipe-in-htmltable \
    --relation 'field_of_business' \
      --records '//li' \
      --attribute 'priority'   integer 'count(preceding::li)+1' \
      --attribute 'name'       string  '.' \
      --attribute 'normalized' string  'normalize-space(.)' \
    --relation 'hyperlink' \
      --records '//a' \
      --attribute 'url'   string '@href' \
      --attribute 'name'  string '.' \
      --attribute 'xpath' string '.' --mode xpath \
    --relation 'download_token' \
      --records '//a[@class="mp3"]' \
      --attribute 'value' string 'substring(@href, 15)' \
    --relation 'hidden_footer' \
      --records '//comment()[count(following::*)=0]' \
      --attribute 'text' string 'normalize-space(.)'
}

#######################################
# Present the relational stream from stdin.
# When stdout is a terminal the data is rendered with relpipe-out-tabular;
# otherwise it is passed through unchanged so it can be piped or redirected.
# Arguments: none
# Returns:   exit status of the command that handled the stream
#######################################
format_result() {
  # A real if/else instead of 'A && B || C': with the short-circuit form a
  # failing relpipe-out-tabular would additionally start cat and dump the
  # rest of the raw stream onto the terminal.
  if [[ -t 1 ]]; then
    relpipe-out-tabular
  else
    cat
  fi
}

# Debugging aid, disabled by default (html2xml is not defined in this script):
# fetch_html | html2xml 1>&2

fetch_html | extract_relations | format_result