#!/bin/bash

# Sample web page used as the input of this script.
# The markup is deliberately sloppy (unclosed <li> and <p> elements, mixed-case
# tag names, comments) so it exercises a tolerant HTML parser; do not tidy it.
# The whole literal is single-quoted, so nothing in it is expanded by the shell.
HTML='

<p>Our company is focused on:
<ul>
<li>video game arcades
<li>laundry</li>
<LI>cigarette machines and trucking
<li>personal loans and politics
</ul>

<!-- TODO: add more GIFs -->

<P>Visit our <a href="index.htm">front page</a> and check the <A href="news.php">news</a>!!!

<!--
Yes, this HTML tagsoup is total mess.
But do you still remember the pure joy when you put your first website on the internet?
-->

<p>
download <a href="mp3.cgi?token=61686f6a-e1ab-4b92-c357-474e552f4c69" class="mp3">free MP3</a> now
</P>

<!--
Best viewed in Netscape Navigator and resolution 800×600
(c) Fvkgrra Pnaqyrf ltd. 1984-2022 -->

'

# Write the HTML document to standard output, followed by one newline.
#
# Reads the global variable HTML. A wget/curl call could be placed here
# instead to download a fresh version of the web page.
fetch_html() {
    # printf instead of echo: echo would swallow a document that happens to
    # look like one of its options (-n, -e, -E) and may interpret backslash
    # escapes depending on shell options (xpg_echo).
    printf '%s\n' "$HTML"
}

# Convert the HTML document on standard input into relational data by running
# relpipe-in-htmltable with four relation definitions.
#
# Each definition consists of a --relation name, a --records XPath expression
# and one or more --attribute <name> <type> <XPath> groups.
# NOTE(review): the option semantics described in the comments below are
# inferred from how the options are used here; confirm them against the
# relpipe-in-htmltable documentation.
extract_relations() {
    local -a relation_args=(
        # One record per list item; priority is the 1-based position of the
        # <li> among all <li> elements in document order.
        --relation 'field_of_business'
            --records '//li'
            --attribute 'priority'   integer 'count(preceding::li)+1'
            --attribute 'name'       string  '.'
            --attribute 'normalized' string  'normalize-space(.)'

        # One record per link. The third attribute switches to "xpath" mode,
        # presumably yielding the location of the node rather than its text.
        --relation 'hyperlink'
            --records '//a'
            --attribute 'url'   string '@href'
            --attribute 'name'  string '.'
            --attribute 'xpath' string '.' --mode xpath

        # Token from the query string of the MP3 download link. Everything
        # after "token=" is taken, so the expression does not depend on the
        # length of the script name in front of it.
        --relation 'download_token'
            --records '//a[@class="mp3"]'
            --attribute 'value' string 'substring-after(@href, "token=")'

        # Comments that have no element after them, i.e. the trailing footer.
        --relation 'hidden_footer'
            --records '//comment()[count(following::*)=0]'
            --attribute 'text' string 'normalize-space(.)'
    )

    relpipe-in-htmltable "${relation_args[@]}"
}

# Present the relational data arriving on standard input.
#
# When standard output is a terminal, the data is passed through
# relpipe-out-tabular; otherwise it is copied unchanged with cat so it can be
# piped to another command or redirected to a file.
format_result() {
    # An explicit if/else is used instead of `test && a || b`: with the
    # short-circuit form, cat would also run whenever relpipe-out-tabular
    # fails, and the failure status would be hidden.
    if [[ -t 1 ]]; then
        relpipe-out-tabular
    else
        cat
    fi
}

# Debugging aid (disabled): pipe the fetched page through html2xml and print
# the result on standard error.
# fetch_html | html2xml 1>&2

# Report a failure of any stage (e.g. relpipe-in-htmltable missing or failing)
# through the exit status; without pipefail only the status of the last stage,
# format_result, would be visible.
set -o pipefail

fetch_html | extract_relations | format_result