author | František Kučera <franta-hg@frantovo.cz> |
Sun, 21 Feb 2021 22:11:49 +0100 | |
branch | v_0 |
changeset 1 | db5418b04c63 |
parent 0 | 2f783f0573fa |
child 2 | 4b938619848b |
permissions | -rw-r--r-- |
0 | 1 |
/**
 * Relational pipes
 * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
|
17 |
#pragma once |
|
18 |
||
19 |
#include <istream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>

#include <libxml++-2.6/libxml++/libxml++.h>
#include <gumbo.h>

namespace relpipe {
namespace in {
namespace xmltable {

class XMLDocumentConstructor { |
|
30 |
private: |
|
31 |
std::istream* input = nullptr; |
|
32 |
xmlpp::DomParser* parser = nullptr; |
|
1
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
33 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
34 |
std::string readInput() { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
35 |
std::string result(std::istreambuf_iterator<char>(*input),{}); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
36 |
return result; |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
37 |
} |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
38 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
39 |
class GumboTree { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
40 |
private: |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
41 |
std::string htmlText; |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
42 |
GumboOptions options = kGumboDefaultOptions; |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
43 |
GumboOutput* tree = nullptr; |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
44 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
45 |
void html2xml(xmlpp::Document* document, xmlpp::Element* parent, GumboNode* htmlNode) { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
46 |
if (htmlNode->type == GUMBO_NODE_ELEMENT || htmlNode->type == GUMBO_NODE_TEMPLATE) { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
47 |
std::string elementName = gumbo_normalized_tagname(htmlNode->v.element.tag); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
48 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
49 |
if (htmlNode->v.element.tag == GUMBO_TAG_UNKNOWN || elementName.empty()) { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
50 |
elementName = "UNKNOWN"; |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
51 |
// TODO: find original element name, maybe gumbo_normalize_svg_tagname() |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
52 |
} |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
53 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
54 |
// TODO: optional support for namespaces |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
55 |
xmlpp::Element* element; |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
56 |
if (parent == nullptr) { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
57 |
element = document->create_root_node(elementName); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
58 |
} else { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
59 |
element = parent->add_child(elementName); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
60 |
} |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
61 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
62 |
for (size_t i = 0; i < htmlNode->v.element.attributes.length; i++) { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
63 |
GumboAttribute* htmlAttribute = static_cast<GumboAttribute*> (htmlNode->v.element.attributes.data[i]); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
64 |
element->set_attribute(htmlAttribute->name, htmlAttribute->value); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
65 |
} |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
66 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
67 |
for (size_t i = 0; i < htmlNode->v.element.children.length; i++) { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
68 |
html2xml(document, element, static_cast<GumboNode*> (htmlNode->v.element.children.data[i])); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
69 |
} |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
70 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
71 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
72 |
} else if (htmlNode->type == GUMBO_NODE_TEXT || htmlNode->type == GUMBO_NODE_WHITESPACE) { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
73 |
std::string value = htmlNode->v.text.text; |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
74 |
parent->add_child_text(value); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
75 |
} else if (htmlNode->type == GUMBO_NODE_CDATA) { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
76 |
parent->add_child_cdata(htmlNode->v.text.text); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
77 |
} else if (htmlNode->type == GUMBO_NODE_COMMENT) { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
78 |
parent->add_child_comment(htmlNode->v.text.text); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
79 |
} else { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
80 |
throw std::logic_error("Unable to convert HTML to XML: Unsupported GumboNode type: " + std::to_string(htmlNode->type)); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
81 |
} |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
82 |
} |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
83 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
84 |
public: |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
85 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
86 |
GumboTree(std::string htmlText) : htmlText(htmlText) { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
87 |
tree = gumbo_parse_with_options(&options, this->htmlText.c_str(), this->htmlText.size()); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
88 |
if (tree == nullptr) throw std::logic_error("Unable to parse HTML text."); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
89 |
} |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
90 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
91 |
virtual ~GumboTree() { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
92 |
if (tree) gumbo_destroy_output(&options, tree); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
93 |
} |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
94 |
|
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
95 |
void exportTo(xmlpp::Document* document) { |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
96 |
auto doctype = tree->document->v.document; |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
97 |
if (strlen(doctype.name)) document->set_internal_subset(doctype.name, doctype.public_identifier, doctype.system_identifier); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
98 |
html2xml(document, nullptr, tree->root); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
99 |
} |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
100 |
}; |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
101 |
|
0 | 102 |
public: |
103 |
||
104 |
XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) { |
|
105 |
} |
|
1
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
106 |
|
0 | 107 |
void setOption(const std::string& uri, const std::string& value) { |
108 |
} |
|
109 |
||
110 |
void process() { |
|
1
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
111 |
GumboTree gumboTree(readInput()); |
db5418b04c63
first working version
František Kučera <franta-hg@frantovo.cz>
parents:
0
diff
changeset
|
112 |
gumboTree.exportTo(parser->get_document()); |
0 | 113 |
} |
114 |
}; |
|
115 |
||
116 |
} |
|
117 |
} |
|
118 |
} |