author | František Kučera <franta-hg@frantovo.cz> |
Mon, 27 Jul 2020 16:17:13 +0200 | |
branch | v_0 |
changeset 307 | 3b6638149349 |
parent 294 | abbc9bcfbcc4 |
permissions | -rwxr-xr-x |
294
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
1 |
#!/bin/bash |
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
2 |
|
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
3 |
# XHTML namespace URI; bound to the "h" prefix in the XPath expressions
# and used as the SQL parameter for matching the root element namespace.
XMLNS_H='http://www.w3.org/1999/xhtml'
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
4 |
|
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
5 |
# If we set xmlns_h="…", we can omit: --option xmlns_h "$XMLNS_H" |
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
6 |
# because XML namespaces can be provided either as an option or as an environment variable. |
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
7 |
# Options have precedence. |
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
8 |
|
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
9 |
# Lists every entry under the current working directory (the directory
# itself included), each path terminated by a NUL byte instead of a newline.
#
# The starting point "." is given explicitly: omitting it is a GNU find
# extension, while other find implementations require at least one path.
# With GNU find the output is the same as for a bare "find -print0".
findFiles() {
	find . -print0
}
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
12 |
|
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
13 |
# Runs relpipe-in-filesystem on whatever arrives on standard input (in this
# script: the NUL-separated path list produced by findFiles) and asks for
# the file name plus several values computed by the xpath streamlet:
#   valid_xml  – XPath '.' with the option "mode boolean"
#   root_xmlns – namespace-uri()
#   title      – /h:html/h:head/h:title
#   h1_count, h2_count, h3_count – count(//h:h1), count(//h:h2), count(//h:h3)
# The namespace prefix "h" used in these expressions is supplied through
# the xmlns_h option, whose value is $XMLNS_H.
fetchAttributes() {
	# XPath expressions and the attribute names they are exported as.
	local -a xpathColumns=(
		--option attribute '.' --option mode boolean --as 'valid_xml'
		--option attribute 'namespace-uri()'        --as 'root_xmlns'
		--option attribute '/h:html/h:head/h:title' --as 'title'
		--option attribute 'count(//h:h1)'          --as 'h1_count'
		--option attribute 'count(//h:h2)'          --as 'h2_count'
		--option attribute 'count(//h:h3)'          --as 'h3_count'
	)

	relpipe-in-filesystem \
		--parallel 8 \
		--file name \
		--streamlet xpath \
			--option xmlns_h "$XMLNS_H" \
			"${xpathColumns[@]}"
}
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
26 |
|
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
27 |
# Runs relpipe-tr-sql on standard input and defines the relation "pages"
# from an SQL query over the "filesystem" relation:
#   - keeps only rows whose root_xmlns equals $XMLNS_H (bound to the "?"
#     placeholder through --parameter),
#   - orders them by h1_count + h2_count + h3_count, highest first,
#   - keeps the first 5 rows.
# The three count attributes of the result are type-cast to integer.
filterAndOrder() {
	local topPagesQuery="SELECT
			name,
			title,
			h1_count,
			h2_count,
			h3_count
		FROM filesystem WHERE root_xmlns = ?
		ORDER BY h1_count + h2_count + h3_count DESC
		LIMIT 5"

	relpipe-tr-sql \
		--relation "pages" "$topPagesQuery" \
		--type-cast 'h1_count' integer \
		--type-cast 'h2_count' integer \
		--type-cast 'h3_count' integer \
		--parameter "$XMLNS_H"
}
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
44 |
|
abbc9bcfbcc4
Release v0.15 – streamlets, parallel processing
František Kučera <franta-hg@frantovo.cz>
parents:
diff
changeset
|
45 |
# Main pipeline: list files, extract their attributes, select and order
# the rows, and hand the result to relpipe-out-gui with a window title.
findFiles \
	| fetchAttributes \
	| filterAndOrder \
	| relpipe-out-gui -title "Pages and titles"