#!/bin/bash
# Sample input document, embedded as a single-quoted literal so the shell
# performs no expansion inside it. The markup is intentionally sloppy
# (unclosed <li> and <p> elements, mixed-case tag names, HTML comments).
# The value begins and ends with a newline. fetch_html prints this variable.
HTML='
<p>Our company is focused on:
<ul>
<li>video game arcades
<li>laundry</li>
<LI>cigarette machines and trucking
<li>personal loans and politics
</ul>
<!-- TODO: add more GIFs -->
<P>Visit our <a href="index.htm">front page</a> and check the <A href="news.php">news</a>!!!
<!--
Yes, this HTML tagsoup is total mess.
But do you still remember the pure joy when you put your first website on the internet?
-->
<p>
download <a href="mp3.cgi?token=61686f6a-e1ab-4b92-c357-474e552f4c69" class="mp3">free MP3</a> now
</P>
<!--
Best viewed in Netscape Navigator and resolution 800×600
(c) Fvkgrra Pnaqyrf ltd. 1984-2022 -->
';
#######################################
# Write the HTML document to be processed to standard output.
# Globals:   HTML (read)
# Arguments: none
# Outputs:   the value of HTML followed by one newline
#######################################
fetch_html() {
  # there might be a wget/curl call to download a fresh version of the web page

  # printf rather than echo: echo would swallow a value such as "-n" or "-e"
  # as an option and may interpret backslashes depending on shell options,
  # whereas '%s' always emits the data verbatim.
  printf '%s\n' "$HTML"
}
#######################################
# Run relpipe-in-htmltable on standard input with a fixed set of
# relation definitions. Each relation is described by a --records XPath
# selecting its rows and one or more --attribute name/type/XPath triples.
# Arguments: none
# Inputs:    HTML on stdin (inherited by relpipe-in-htmltable)
# Outputs:   whatever relpipe-in-htmltable writes to stdout
# Returns:   exit status of relpipe-in-htmltable
#######################################
extract_relations() {
  local -a relation_spec=(
    # One record per <li>; priority is the number of <li> elements that
    # precede it in the document plus one.
    --relation 'field_of_business'
    --records '//li'
    --attribute 'priority' integer 'count(preceding::li)+1'
    --attribute 'name' string '.'
    --attribute 'normalized' string 'normalize-space(.)'

    # One record per <a>.
    # NOTE(review): "--mode xpath" presumably makes the 'xpath' attribute
    # hold the node's location path instead of its text; confirm against
    # the relpipe-in-xmltable documentation.
    --relation 'hyperlink'
    --records '//a'
    --attribute 'url' string '@href'
    --attribute 'name' string '.'
    --attribute 'xpath' string '.' --mode xpath

    # Links with class="mp3"; the value is the href from its 15th character
    # on, which skips the 14-character prefix "mp3.cgi?token=" used by the
    # sample document.
    --relation 'download_token'
    --records '//a[@class="mp3"]'
    --attribute 'value' string 'substring(@href, 15)'

    # Comment nodes that are not followed by any element.
    --relation 'hidden_footer'
    --records '//comment()[count(following::*)=0]'
    --attribute 'text' string 'normalize-space(.)'
  )

  relpipe-in-htmltable "${relation_spec[@]}"
}
#######################################
# Choose the output form for the stream arriving on stdin.
# When stdout is a terminal, pipe the data through relpipe-out-tabular;
# otherwise pass it through unchanged with cat so it can be redirected
# or piped onward.
# Arguments: none
# Returns:   exit status of whichever command was run
#######################################
format_result() {
  # A real if/else instead of "A && B || C": with the short-circuit form a
  # failing relpipe-out-tabular would additionally start cat on the terminal
  # and the formatter's failure status would be replaced by cat's.
  if [[ -t 1 ]]; then
    relpipe-out-tabular
  else
    cat
  fi
}
# Debugging aid (disabled): pipe the page through html2xml and send the
# result to stderr.
# fetch_html | html2xml 1>&2

# Main pipeline: obtain the page, extract the relations, format the output.
fetch_html |
  extract_relations |
  format_result