--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/relpipe-data/examples/html-tagsoup-1.sh Mon Feb 21 00:43:11 2022 +0100
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Sample input: deliberately malformed HTML with unclosed <li> items, mixed-case tags and comments, embedded in the script instead of being downloaded.
+HTML='
+
+<p>Our company is focused on:
+<ul>
+<li>video game arcades
+<li>laundry</li>
+<LI>cigarette machines and trucking
+<li>personal loans and politics
+</ul>
+
+<!-- TODO: add more GIFs -->
+
+<P>Visit our <a href="index.htm">front page</a> and check the <A href="news.php">news</a>!!!
+
+<!--
+Yes, this HTML tagsoup is total mess.
+But do you still remember the pure joy when you put your first website on the internet?
+-->
+
+<p>
+download <a href="mp3.cgi?token=61686f6a-e1ab-4b92-c357-474e552f4c69" class="mp3">free MP3</a> now
+</P>
+
+<!--
+Best viewed in Netscape Navigator and resolution 800×600
+(c) Fvkgrra Pnaqyrf ltd. 1984-2022 -->
+
+';
+
+fetch_html() {
+ # Writes the page in $HTML to stdout. A wget or curl call could fetch a fresh copy here. printf keeps option-like content literal.
+ printf '%s\n' "$HTML"
+}
+
+
+extract_relations() {
+ # Passes the HTML arriving on stdin to relpipe-in-htmltable and requests four relations.
+ # Each group names a relation, gives the XPath selecting its records, then lists its attributes.
+ local -a spec=(
+  --relation 'field_of_business' --records '//li'
+  --attribute 'priority' integer 'count(preceding::li)+1'
+  --attribute 'name' string '.'
+  --attribute 'normalized' string 'normalize-space(.)'
+  --relation 'hyperlink' --records '//a'
+  --attribute 'url' string '@href'
+  --attribute 'name' string '.'
+  --attribute 'xpath' string '.' --mode xpath
+  --relation 'download_token' --records '//a[@class="mp3"]'
+  --attribute 'value' string 'substring(@href, 15)' # skips the 14-character prefix mp3.cgi?token=
+  --relation 'hidden_footer' --records '//comment()[count(following::*)=0]'
+  --attribute 'text' string 'normalize-space(.)'
+ )
+ relpipe-in-htmltable "${spec[@]}"
+}
+# Show a table when stdout is a terminal, otherwise pass the input through untouched. Uses if/else so that a failing relpipe-out-tabular does not fall through to cat.
+format_result() { if [[ -t 1 ]]; then relpipe-out-tabular; else cat; fi; }
+
+
+# Whole example in one pipeline. Debug aid: run fetch_html | html2xml 1>&2 to send the html2xml view of the page to stderr.
+main() { fetch_html | extract_relations | format_result; }
+main "$@"