relpipe-data/examples/html-tagsoup-1.sh
branchv_0
changeset 329 5bc2bb8b7946
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/relpipe-data/examples/html-tagsoup-1.sh	Mon Feb 21 00:43:11 2022 +0100
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+HTML='
+
+<p>Our company is focused on:
+<ul>
+<li>video game arcades
+<li>laundry</li>
+<LI>cigarette machines and trucking
+<li>personal loans and politics
+</ul>
+
+<!-- TODO: add more GIFs -->
+
+<P>Visit our <a href="index.htm">front page</a> and check the <A href="news.php">news</a>!!!
+
+<!--
+Yes, this HTML tagsoup is total mess.
+But do you still remember the pure joy when you put your first website on the internet?
+-->
+
+<p>
+download <a href="mp3.cgi?token=61686f6a-e1ab-4b92-c357-474e552f4c69" class="mp3">free MP3</a> now
+</P>
+
+<!--
+Best viewed in Netscape Navigator and resolution 800×600
+(c) Fvkgrra Pnaqyrf ltd. 1984-2022 -->
+
+'; # Built-in sample page, deliberately sloppy markup (unclosed <li>/<p>, mixed-case tags, trailing comments); fetch_html prints it.
+
+fetch_html() {
+	# Print the page to process on stdout; a wget/curl download of a fresh copy could replace the built-in $HTML sample here.
+	printf '%s\n' "$HTML"   # printf instead of echo: echo would treat content such as "-n" or "-e" as an option and print nothing
+}
+
+
+extract_relations() {
+	relpipe-in-htmltable \
+		--relation 'field_of_business' \
+			--records '//li' \
+			--attribute 'priority'   integer  'count(preceding::li)+1' \
+			--attribute 'name'       string   '.' \
+			--attribute 'normalized' string   'normalize-space(.)' \
+		--relation 'hyperlink' \
+			--records '//a' \
+			--attribute 'url'        string   '@href' \
+			--attribute 'name'       string   '.' \
+			--attribute 'xpath'      string   '.' --mode xpath \
+		--relation 'download_token' \
+			--records '//a[@class="mp3"]' \
+			--attribute 'value'      string   'substring(@href, 15)' \
+		--relation 'hidden_footer' \
+			--records '//comment()[count(following::*)=0]' \
+			--attribute 'text'       string   'normalize-space(.)'
+}
+
+format_result() { if [[ -t 1 ]]; then relpipe-out-tabular; else cat; fi; } # table when stdout is a terminal, unchanged passthrough otherwise; if/else so a failed formatter never falls through to cat
+
+
+# fetch_html | html2xml 1>&2
+
+fetch_html | extract_relations | format_result