18 |
18 |
19 namespace relpipe { |
19 namespace relpipe { |
20 namespace in { |
20 namespace in { |
21 namespace xmltable { |
21 namespace xmltable { |
22 |
22 |
|
23 #include <sstream> |
|
24 #include <stdexcept> |
|
25 |
23 #include <libxml++-2.6/libxml++/libxml++.h> |
26 #include <libxml++-2.6/libxml++/libxml++.h> |
|
27 #include <gumbo.h> |
24 |
28 |
25 class XMLDocumentConstructor { |
29 class XMLDocumentConstructor { |
26 private: |
30 private: |
27 std::istream* input = nullptr; |
31 std::istream* input = nullptr; |
28 xmlpp::DomParser* parser = nullptr; |
32 xmlpp::DomParser* parser = nullptr; |
|
33 |
|
34 std::string readInput() { |
|
35 std::string result(std::istreambuf_iterator<char>(*input),{}); |
|
36 return result; |
|
37 } |
|
38 |
|
39 class GumboTree { |
|
40 private: |
|
41 std::string htmlText; |
|
42 GumboOptions options = kGumboDefaultOptions; |
|
43 GumboOutput* tree = nullptr; |
|
44 |
|
45 void html2xml(xmlpp::Document* document, xmlpp::Element* parent, GumboNode* htmlNode) { |
|
46 if (htmlNode->type == GUMBO_NODE_ELEMENT || htmlNode->type == GUMBO_NODE_TEMPLATE) { |
|
47 std::string elementName = gumbo_normalized_tagname(htmlNode->v.element.tag); |
|
48 |
|
49 if (htmlNode->v.element.tag == GUMBO_TAG_UNKNOWN || elementName.empty()) { |
|
50 elementName = "UNKNOWN"; |
|
51 // TODO: find original element name, maybe gumbo_normalize_svg_tagname() |
|
52 } |
|
53 |
|
54 // TODO: optional support for namespaces |
|
55 xmlpp::Element* element; |
|
56 if (parent == nullptr) { |
|
57 element = document->create_root_node(elementName); |
|
58 } else { |
|
59 element = parent->add_child(elementName); |
|
60 } |
|
61 |
|
62 for (size_t i = 0; i < htmlNode->v.element.attributes.length; i++) { |
|
63 GumboAttribute* htmlAttribute = static_cast<GumboAttribute*> (htmlNode->v.element.attributes.data[i]); |
|
64 element->set_attribute(htmlAttribute->name, htmlAttribute->value); |
|
65 } |
|
66 |
|
67 for (size_t i = 0; i < htmlNode->v.element.children.length; i++) { |
|
68 html2xml(document, element, static_cast<GumboNode*> (htmlNode->v.element.children.data[i])); |
|
69 } |
|
70 |
|
71 |
|
72 } else if (htmlNode->type == GUMBO_NODE_TEXT || htmlNode->type == GUMBO_NODE_WHITESPACE) { |
|
73 std::string value = htmlNode->v.text.text; |
|
74 parent->add_child_text(value); |
|
75 } else if (htmlNode->type == GUMBO_NODE_CDATA) { |
|
76 parent->add_child_cdata(htmlNode->v.text.text); |
|
77 } else if (htmlNode->type == GUMBO_NODE_COMMENT) { |
|
78 parent->add_child_comment(htmlNode->v.text.text); |
|
79 } else { |
|
80 throw std::logic_error("Unable to convert HTML to XML: Unsupported GumboNode type: " + std::to_string(htmlNode->type)); |
|
81 } |
|
82 } |
|
83 |
|
84 public: |
|
85 |
|
86 GumboTree(std::string htmlText) : htmlText(htmlText) { |
|
87 tree = gumbo_parse_with_options(&options, this->htmlText.c_str(), this->htmlText.size()); |
|
88 if (tree == nullptr) throw std::logic_error("Unable to parse HTML text."); |
|
89 } |
|
90 |
|
91 virtual ~GumboTree() { |
|
92 if (tree) gumbo_destroy_output(&options, tree); |
|
93 } |
|
94 |
|
95 void exportTo(xmlpp::Document* document) { |
|
96 auto doctype = tree->document->v.document; |
|
97 if (strlen(doctype.name)) document->set_internal_subset(doctype.name, doctype.public_identifier, doctype.system_identifier); |
|
98 html2xml(document, nullptr, tree->root); |
|
99 } |
|
100 }; |
|
101 |
29 public: |
102 public: |
30 |
103 |
31 XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) { |
104 XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) { |
32 } |
105 } |
33 |
106 |
34 void setOption(const std::string& uri, const std::string& value) { |
107 void setOption(const std::string& uri, const std::string& value) { |
35 } |
108 } |
36 |
109 |
37 void process() { |
110 void process() { |
38 parser->parse_stream(*input); |
111 GumboTree gumboTree(readInput()); |
|
112 gumboTree.exportTo(parser->get_document()); |
39 } |
113 } |
40 }; |
114 }; |
41 |
115 |
42 } |
116 } |
43 } |
117 } |