relpipe-data/examples/html-tagsoup-1.sh
branchv_0
changeset 329 5bc2bb8b7946
equal deleted inserted replaced
328:cc60c8dd7924 329:5bc2bb8b7946
       
     1 #!/bin/bash
       
     2 
       
     3 HTML='
       
     4 
       
     5 <p>Our company is focused on:
       
     6 <ul>
       
     7 <li>video game arcades
       
     8 <li>laundry</li>
       
     9 <LI>cigarette machines and trucking
       
    10 <li>personal loans and politics
       
    11 </ul>
       
    12 
       
    13 <!-- TODO: add more GIFs -->
       
    14 
       
    15 <P>Visit our <a href="index.htm">front page</a> and check the <A href="news.php">news</a>!!!
       
    16 
       
    17 <!--
       
    18 Yes, this HTML tagsoup is total mess.
       
    19 But do you still remember the pure joy when you put your first website on the internet?
       
    20 -->
       
    21 
       
    22 <p>
       
    23 download <a href="mp3.cgi?token=61686f6a-e1ab-4b92-c357-474e552f4c69" class="mp3">free MP3</a> now
       
    24 </P>
       
    25 
       
    26 <!--
       
    27 Best viewed in Netscape Navigator and resolution 800×600
       
    28 (c) Fvkgrra Pnaqyrf ltd. 1984-2022 -->
       
    29 
       
    30 ';
       
    31 
       
    32 fetch_html() {
       
    33 	# there might be a wget/curl call to download a fresh version of the web page
       
    34 	echo "$HTML";
       
    35 }
       
    36 
       
    37 
       
    38 extract_relations() {
       
    39 	relpipe-in-htmltable \
       
    40 		--relation 'field_of_business' \
       
    41 			--records '//li' \
       
    42 			--attribute 'priority'   integer  'count(preceding::li)+1' \
       
    43 			--attribute 'name'       string   '.' \
       
    44 			--attribute 'normalized' string   'normalize-space(.)' \
       
    45 		--relation 'hyperlink' \
       
    46 			--records '//a' \
       
    47 			--attribute 'url'        string   '@href' \
       
    48 			--attribute 'name'       string   '.' \
       
    49 			--attribute 'xpath'      string   '.' --mode xpath \
       
    50 		--relation 'download_token' \
       
    51 			--records '//a[@class="mp3"]' \
       
    52 			--attribute 'value'      string   'substring(@href, 15)' \
       
    53 		--relation 'hidden_footer' \
       
    54 			--records '//comment()[count(following::*)=0]' \
       
    55 			--attribute 'text'       string   'normalize-space(.)'
       
    56 }
       
    57 
       
    58 format_result() { [[ -t 1 ]] && relpipe-out-tabular || cat; }
       
    59 
       
    60 
       
    61 # fetch_html | html2xml 1>&2
       
    62 
       
    63 fetch_html | extract_relations | format_result