relpipe-data/examples/html-tagsoup-1.sh
author František Kučera <franta-hg@frantovo.cz>
Mon, 21 Feb 2022 00:43:11 +0100
branchv_0
changeset 329 5bc2bb8b7946
permissions -rwxr-xr-x
Release v0.18
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
329
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     1
#!/bin/bash
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     2
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     3
HTML='
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     4
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     5
<p>Our company is focused on:
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     6
<ul>
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     7
<li>video game arcades
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     8
<li>laundry</li>
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     9
<LI>cigarette machines and trucking
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    10
<li>personal loans and politics
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    11
</ul>
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    12
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    13
<!-- TODO: add more GIFs -->
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    14
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    15
<P>Visit our <a href="index.htm">front page</a> and check the <A href="news.php">news</a>!!!
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    16
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    17
<!--
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    18
Yes, this HTML tagsoup is total mess.
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    19
But do you still remember the pure joy when you put your first website on the internet?
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    20
-->
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    21
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    22
<p>
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    23
download <a href="mp3.cgi?token=61686f6a-e1ab-4b92-c357-474e552f4c69" class="mp3">free MP3</a> now
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    24
</P>
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    25
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    26
<!--
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    27
Best viewed in Netscape Navigator and resolution 800×600
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    28
(c) Fvkgrra Pnaqyrf ltd. 1984-2022 -->
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    29
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    30
';
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    31
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    32
fetch_html() {
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    33
	# there might be a wget/curl call to download a fresh version of the web page
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    34
	echo "$HTML";
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    35
}
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    36
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    37
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    38
extract_relations() {
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    39
	relpipe-in-htmltable \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    40
		--relation 'field_of_business' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    41
			--records '//li' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    42
			--attribute 'priority'   integer  'count(preceding::li)+1' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    43
			--attribute 'name'       string   '.' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    44
			--attribute 'normalized' string   'normalize-space(.)' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    45
		--relation 'hyperlink' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    46
			--records '//a' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    47
			--attribute 'url'        string   '@href' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    48
			--attribute 'name'       string   '.' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    49
			--attribute 'xpath'      string   '.' --mode xpath \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    50
		--relation 'download_token' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    51
			--records '//a[@class="mp3"]' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    52
			--attribute 'value'      string   'substring(@href, 15)' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    53
		--relation 'hidden_footer' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    54
			--records '//comment()[count(following::*)=0]' \
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    55
			--attribute 'text'       string   'normalize-space(.)'
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    56
}
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    57
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    58
format_result() { [[ -t 1 ]] && relpipe-out-tabular || cat; }
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    59
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    60
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    61
# fetch_html | html2xml 1>&2
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    62
5bc2bb8b7946 Release v0.18
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    63
fetch_html | extract_relations | format_result