relpipe-data/streamlets-preview.xml
author František Kučera <franta-hg@frantovo.cz>
Fri, 17 Jan 2020 19:56:22 +0100
branchv_0
changeset 292 c4b4864225de
child 326 ab7f333f1225
permissions -rw-r--r--
streamlets preview
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
292
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     1
<stránka
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     2
	xmlns="https://trac.frantovo.cz/xml-web-generator/wiki/xmlns/strana"
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     3
	xmlns:m="https://trac.frantovo.cz/xml-web-generator/wiki/xmlns/makro">
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     4
	
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     5
	<nadpis>Streamlets preview</nadpis>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     6
	<perex>an early example of streamlets in relpipe-in-filesystem</perex>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     7
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     8
	<text xmlns="http://www.w3.org/1999/xhtml">
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     9
		
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    10
		<p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    11
			<em>This is an early preview published on 2020-01-17 before the v0.15 release.</em>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    12
		</p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    13
		
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    14
		<p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    15
			First prepare some files:
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    16
		</p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    17
		
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    18
		<m:pre jazyk="shell"><![CDATA[$ wget --xattr https://upload.wikimedia.org/wikipedia/commons/d/d4/HURD_Live_CD.png
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    19
$ wget --xattr https://sane-software.globalcode.info/v_0/ssm.en.pdf
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    20
$ wget --xattr https://alt2xml.globalcode.info/sql-api_alt2xml_talk_2014.pdf
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    21
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    22
$ ls -1
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    23
HURD_Live_CD.png
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    24
search.sh
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    25
sql-api_alt2xml_talk_2014.pdf
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    26
ssm.en.pdf]]></m:pre>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    27
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    28
		<p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    29
			Collect metadata (file path, extended attributes, image size, number of PDF pages, number of text lines, OCR-recognized text extracted from images and plain text extracted from PDF files),
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    30
			filter the results (do restriction), select only certain attributes (do projection)
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    31
			and format the result as a table:
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    32
		</p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    33
		
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    34
		<m:pre jazyk="shell"><![CDATA[find -print0 \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    35
	| relpipe-in-filesystem \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    36
		--file path \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    37
		--xattr xdg.origin.url --as 'url' \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    38
		--streamlet exiftool \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    39
			--option 'attribute' 'PNG:ImageWidth'  --as 'width' \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    40
			--option 'attribute' 'PNG:ImageHeight' --as 'height' \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    41
			--option 'attribute' 'PDF:PageCount'   --as 'page_count' \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    42
		--streamlet lines_count \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    43
		--streamlet tesseract \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    44
			--option 'language' 'eng' \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    45
			--as 'ocr_text' \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    46
		--streamlet pdftotext --as 'pdf_text' \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    47
	| relpipe-tr-awk \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    48
		--relation filesystem \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    49
		--where 'path ~ /\.sh$/ || url ~ /alt2xml\.globalcode\.info/ || ocr_text ~ /GNU/ || pdf_text ~ /Sane/' \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    50
	| relpipe-tr-cut filesystem 'path|url|width|height|page_count|lines_count' \
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    51
	| relpipe-out-tabular
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    52
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    53
# if too wide, add: | less -RSi]]></m:pre>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    54
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    55
		<p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    56
			Which will print:
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    57
		</p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    58
		
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    59
		<m:pre jazyk="text"><![CDATA[filesystem:
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    60
 ╭─────────────────────────────────┬──────────────────────────────────────────────────────────────────────┬────────────────┬─────────────────┬─────────────────────┬───────────────────────╮
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    61
 │ path                   (string) │ url                                                         (string) │ width (string) │ height (string) │ page_count (string) │ lines_count (integer) │
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    62
 ├─────────────────────────────────┼──────────────────────────────────────────────────────────────────────┼────────────────┼─────────────────┼─────────────────────┼───────────────────────┤
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    63
 │ ./HURD_Live_CD.png              │ https://upload.wikimedia.org/wikipedia/commons/d/d4/HURD_Live_CD.png │ 720            │ 400             │                     │                     8 │
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    64
 │ ./ssm.en.pdf                    │ https://sane-software.globalcode.info/v_0/ssm.en.pdf                 │                │                 │ 6                   │                   568 │
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    65
 │ ./sql-api_alt2xml_talk_2014.pdf │ https://alt2xml.globalcode.info/sql-api_alt2xml_talk_2014.pdf        │                │                 │ 21                  │                   696 │
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    66
 │ ./search.sh                     │                                                                      │                │                 │                     │                    21 │
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    67
 ╰─────────────────────────────────┴──────────────────────────────────────────────────────────────────────┴────────────────┴─────────────────┴─────────────────────┴───────────────────────╯
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    68
Record count: 4]]></m:pre>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    69
		
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    70
		<p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    71
			How it looks in the terminal:
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    72
		</p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    73
		
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    74
		<m:img src="img/streamlets-preview.png"/>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    75
		
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    76
		<p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    77
			OCR and PDF text extractions (and also other metadata extractions) are done on-the-fly in the pipeline.
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    78
			Especially the OCR may take some time, so it is usually better in such cases to break the pipe in the middle, 
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    79
			redirect the intermediate result to a file (which serves as an index or cache) and then use it multiple times 
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    80
			(just <code>cat</code> the file and continue the original pipeline; BTW: multiple files can be simply concatenated, the format is designed for such use). 
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    81
			But in most cases, it is not necessary and we work with live data.
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    82
		</p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    83
		
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    84
		<p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    85
			Please note that this is really fresh, it has not been released and can be seen only in the Mercurial repository.
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    86
			The streamlets used can be seen here: <a href="https://hg.globalcode.info/relpipe/relpipe-in-filesystem.cpp/file/tip/streamlet-examples">streamlet-examples</a>. 
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    87
			And even the upcoming release v0.15 is still a development version (it will work, but the API might change in the future – until we release v1.0 which will be stable and production ready).
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    88
		</p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    89
		
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    90
		<p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    91
			Regarding performance:
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    92
			currently it is parallelized only over attributes (each streamlet instance runs in a separate process). 
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    93
			In v0.15 it will also be parallelized over records (files in this case).
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    94
		</p>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    95
		
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    96
	</text>
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    97
c4b4864225de streamlets preview
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    98
</stránka>