#!/bin/bash

# Sample "tag soup" page used as the input of the extraction demo below.
# It mixes upper- and lower-case tags, unclosed <li>/<p> elements and HTML
# comments. The literal is single-quoted, so the shell expands nothing in it.
HTML='

<p>Our company is focused on:
<ul>
<li>video game arcades
<li>laundry</li>
<LI>cigarette machines and trucking
<li>personal loans and politics
</ul>

<!-- TODO: add more GIFs -->

<P>Visit our <a href="index.htm">front page</a> and check the <A href="news.php">news</a>!!!

<!--
Yes, this HTML tagsoup is total mess.
But do you still remember the pure joy when you put your first website on the internet?
-->

<p>
download <a href="mp3.cgi?token=61686f6a-e1ab-4b92-c357-474e552f4c69" class="mp3">free MP3</a> now
</P>

<!--
Best viewed in Netscape Navigator and resolution 800×600
(c) Fvkgrra Pnaqyrf ltd. 1984-2022 -->

';

# Write the HTML document to be processed to standard output.
#
# Reads:   the global variable HTML.
# Outputs: the value of HTML followed by a single newline.
fetch_html() {
	# there might be a wget/curl call to download a fresh version of the web page

	# printf rather than echo: the page is arbitrary data, and echo would treat
	# a value such as "-n" or "-e" as an option (or expand backslash escapes when
	# the xpg_echo shell option is set) instead of printing it verbatim.
	printf '%s\n' "$HTML"
}


# Run relpipe-in-htmltable over the HTML document on standard input and
# declare four relations, each with an XPath selecting its records and one
# XPath expression per attribute (evaluated relative to the record node).
#
# The command inherits this function's stdin/stdout; its exit status is the
# function's exit status.
extract_relations() {
	local -a relation_specs=(
		# One record per list item; priority is the 1-based document position
		# (number of preceding <li> elements plus one). "name" is the raw text,
		# "normalized" the same text with whitespace collapsed.
		--relation 'field_of_business'
		--records '//li'
		--attribute 'priority' integer 'count(preceding::li)+1'
		--attribute 'name' string '.'
		--attribute 'normalized' string 'normalize-space(.)'

		# One record per anchor: link target and link text.
		# NOTE(review): "--mode xpath" presumably makes the last attribute hold
		# the XPath of the matched node instead of its text; confirm against
		# the relpipe documentation.
		--relation 'hyperlink'
		--records '//a'
		--attribute 'url' string '@href'
		--attribute 'name' string '.'
		--attribute 'xpath' string '.' --mode xpath

		# Anchors with class "mp3": keep the href from its 15th character on,
		# i.e. drop the 14-character "mp3.cgi?token=" prefix of the sample link.
		--relation 'download_token'
		--records '//a[@class="mp3"]'
		--attribute 'value' string 'substring(@href, 15)'

		# Comments that have no element following them in the document.
		--relation 'hidden_footer'
		--records '//comment()[count(following::*)=0]'
		--attribute 'text' string 'normalize-space(.)'
	)

	relpipe-in-htmltable "${relation_specs[@]}"
}

# Pass the data on standard input to standard output in a form suited to the
# consumer: through relpipe-out-tabular when stdout is a terminal, otherwise
# unchanged (via cat) so the stream can be piped or redirected.
#
# Returns the exit status of whichever command was run.
format_result() {
	# if/else instead of `A && B || C`: with the short-circuit form a failing
	# relpipe-out-tabular would also run cat and hide the failure status.
	if [[ -t 1 ]]; then
		relpipe-out-tabular
	else
		cat
	fi
}


# Debugging aid (disabled): send the page through html2xml to stderr,
# presumably to inspect how the tag soup gets parsed.
# fetch_html | html2xml 1>&2

# Main pipeline: obtain the page, extract the relations from it, and render
# them for a terminal or pass them through to the next consumer.
fetch_html | extract_relations | format_result