relpipe-data/examples/html-tagsoup-1.sh
author František Kučera <franta-hg@frantovo.cz>
Mon, 21 Feb 2022 00:43:11 +0100
branch v_0
changeset 329 5bc2bb8b7946
permissions -rwxr-xr-x
Release v0.18

#!/bin/bash

# Sample input document: a deliberately sloppy HTML "tag soup".
# It contains unclosed <li> and <p> elements, mixed-case tag names,
# several comments and a few hyperlinks.
# The value is single-quoted, so the shell performs no expansion inside it
# and the text (including the leading and trailing blank lines) is stored verbatim.
HTML='

<p>Our company is focused on:
<ul>
<li>video game arcades
<li>laundry</li>
<LI>cigarette machines and trucking
<li>personal loans and politics
</ul>

<!-- TODO: add more GIFs -->

<P>Visit our <a href="index.htm">front page</a> and check the <A href="news.php">news</a>!!!

<!--
Yes, this HTML tagsoup is total mess.
But do you still remember the pure joy when you put your first website on the internet?
-->

<p>
download <a href="mp3.cgi?token=61686f6a-e1ab-4b92-c357-474e552f4c69" class="mp3">free MP3</a> now
</P>

<!--
Best viewed in Netscape Navigator and resolution 800×600
(c) Fvkgrra Pnaqyrf ltd. 1984-2022 -->

';

# Writes the web page to STDOUT.
#
# Globals:   HTML (read) - the document to emit
# Outputs:   the content of HTML followed by a single newline
fetch_html() {
	# there might be a wget/curl call to download a fresh version of the web page

	# printf instead of echo: echo would swallow a value like "-n" or "-e"
	# as its own option and may interpret backslash sequences in the data.
	printf '%s\n' "$HTML"
}


# Runs relpipe-in-htmltable with the definitions of four relations.
# Each relation has a name, an XPath expression selecting its records
# and a list of attributes (name, type, XPath expression evaluated per record).
#
# Inputs:    STDIN is inherited by relpipe-in-htmltable (the HTML document)
# Outputs:   whatever relpipe-in-htmltable writes to STDOUT
# Returns:   the exit status of relpipe-in-htmltable
extract_relations() {
	# one record per list item; priority = position among all <li> elements
	local -a field_of_business=(
		--relation 'field_of_business'
		--records '//li'
		--attribute 'priority' integer 'count(preceding::li)+1'
		--attribute 'name' string '.'
		--attribute 'normalized' string 'normalize-space(.)'
	)

	# one record per anchor; the last attribute is evaluated with "--mode xpath"
	local -a hyperlink=(
		--relation 'hyperlink'
		--records '//a'
		--attribute 'url' string '@href'
		--attribute 'name' string '.'
		--attribute 'xpath' string '.' --mode xpath
	)

	# substring from the 15th character: skips the 14 characters of "mp3.cgi?token="
	# in the href of the sample document
	local -a download_token=(
		--relation 'download_token'
		--records '//a[@class="mp3"]'
		--attribute 'value' string 'substring(@href, 15)'
	)

	# comments that are followed by no element, i.e. placed at the very end
	local -a hidden_footer=(
		--relation 'hidden_footer'
		--records '//comment()[count(following::*)=0]'
		--attribute 'text' string 'normalize-space(.)'
	)

	relpipe-in-htmltable \
		"${field_of_business[@]}" \
		"${hyperlink[@]}" \
		"${download_token[@]}" \
		"${hidden_footer[@]}"
}

# Formats the relational data from STDIN for the current kind of output.
# If STDOUT is a terminal, the data go through relpipe-out-tabular;
# otherwise (pipe, file) they are passed through unchanged by cat.
#
# Inputs:    STDIN - data to be formatted or passed through
# Returns:   the exit status of the command that was run
format_result() {
	# Explicit if/else instead of "A && B || C": with the short-circuit form,
	# a failure of relpipe-out-tabular would additionally start cat
	# and hide the failure status.
	if [[ -t 1 ]]; then
		relpipe-out-tabular
	else
		cat
	fi
}


# Disabled debugging aid: when uncommented, it pipes the fetched page
# through html2xml and sends the result to STDERR.
# fetch_html | html2xml 1>&2

# Main pipeline: fetch the page, extract the relations, format the output.
fetch_html \
	| extract_relations \
	| format_result