--- a/src/CMakeLists.txt Sat Feb 20 20:32:56 2021 +0100
+++ b/src/CMakeLists.txt Sun Feb 21 22:11:49 2021 +0100
@@ -17,7 +17,8 @@
# Relpipe libraries:
INCLUDE(FindPkgConfig)
-pkg_check_modules (RELPIPE_LIBS relpipe-lib-writer.cpp relpipe-lib-cli.cpp libxml++-2.6)
+# gumbo.h is included unconditionally by XMLDocumentConstructor.h, so a missing module has to stop the configure step (REQUIRED) instead of showing up later as a compiler error.
+pkg_check_modules (RELPIPE_LIBS REQUIRED relpipe-lib-writer.cpp relpipe-lib-cli.cpp libxml++-2.6 gumbo)
include_directories(${RELPIPE_LIBS_INCLUDE_DIRS})
link_directories(${RELPIPE_LIBS_LIBRARY_DIRS})
--- a/src/XMLDocumentConstructor.h Sat Feb 20 20:32:56 2021 +0100
+++ b/src/XMLDocumentConstructor.h Sun Feb 21 22:11:49 2021 +0100
@@ -20,22 +20,160 @@
namespace in {
namespace xmltable {
+// NOTE(review): the headers below are included while the namespaces above are already open;
+// presumably this relies on them being included at global scope earlier - consider moving them up.
+#include <sstream>
+#include <stdexcept>
+
#include <libxml++-2.6/libxml++/libxml++.h>
+#include <gumbo.h>
class XMLDocumentConstructor {
private:
std::istream* input = nullptr;
xmlpp::DomParser* parser = nullptr;
+
+	/**
+	 * Reads everything that is left in the input stream into memory,
+	 * because Gumbo is given the document as one complete buffer.
+	 *
+	 * @return the remaining content of the input stream
+	 */
+	std::string readInput() {
+		return std::string(std::istreambuf_iterator<char>(*input), {});
+	}
+
+	/**
+	 * Holds the tree that Gumbo parsed from one HTML text and copies it into a libxml++ document.
+	 * Not copyable, so that the tree is released exactly once (in the destructor).
+	 */
+	class GumboTree {
+	private:
+		/** Buffer handed to Gumbo; a member, so it lives as long as the tree does. */
+		std::string htmlText;
+		GumboOptions options = kGumboDefaultOptions;
+		GumboOutput* tree = nullptr;
+
+		/**
+		 * Text, CDATA and comment nodes can only be appended to an existing element.
+		 *
+		 * @return the given parent
+		 * @throws std::logic_error if parent is nullptr
+		 */
+		static xmlpp::Element* requireParent(xmlpp::Element* parent, const GumboNode* htmlNode) {
+			if (parent == nullptr) throw std::logic_error("Unable to convert HTML to XML: GumboNode of type " + std::to_string(htmlNode->type) + " has no parent element.");
+			return parent;
+		}
+
+		/**
+		 * Converts an element node: its name, its attributes and (recursively) its children.
+		 *
+		 * @param document target document, used only to create the root element
+		 * @param parent element that receives the new element; nullptr means that the new element becomes the root
+		 * @param htmlNode node of type GUMBO_NODE_ELEMENT or GUMBO_NODE_TEMPLATE
+		 */
+		void element2xml(xmlpp::Document* document, xmlpp::Element* parent, GumboNode* htmlNode) {
+			const GumboElement& htmlElement = htmlNode->v.element;
+			std::string elementName = gumbo_normalized_tagname(htmlElement.tag);
+
+			if (htmlElement.tag == GUMBO_TAG_UNKNOWN || elementName.empty()) {
+				elementName = "UNKNOWN";
+				// TODO: find original element name, maybe gumbo_normalize_svg_tagname()
+			}
+
+			// TODO: optional support for namespaces
+			xmlpp::Element* element = parent == nullptr ? document->create_root_node(elementName) : parent->add_child(elementName);
+
+			for (size_t i = 0; i < htmlElement.attributes.length; i++) {
+				const GumboAttribute* htmlAttribute = static_cast<const GumboAttribute*> (htmlElement.attributes.data[i]);
+				element->set_attribute(htmlAttribute->name, htmlAttribute->value);
+			}
+
+			for (size_t i = 0; i < htmlElement.children.length; i++) {
+				html2xml(document, element, static_cast<GumboNode*> (htmlElement.children.data[i]));
+			}
+		}
+
+		/**
+		 * Recursively converts one Gumbo node and its subtree to XML.
+		 *
+		 * @param document target document, used only to create the root element
+		 * @param parent element that receives the converted node; nullptr means that htmlNode becomes the root element
+		 * @param htmlNode node to be converted
+		 * @throws std::logic_error if the node type is not supported or if a text, CDATA or comment node has no parent
+		 */
+		void html2xml(xmlpp::Document* document, xmlpp::Element* parent, GumboNode* htmlNode) {
+			switch (htmlNode->type) {
+				case GUMBO_NODE_ELEMENT:
+				case GUMBO_NODE_TEMPLATE:
+					element2xml(document, parent, htmlNode);
+					break;
+				case GUMBO_NODE_TEXT:
+				case GUMBO_NODE_WHITESPACE:
+					requireParent(parent, htmlNode)->add_child_text(htmlNode->v.text.text);
+					break;
+				case GUMBO_NODE_CDATA:
+					requireParent(parent, htmlNode)->add_child_cdata(htmlNode->v.text.text);
+					break;
+				case GUMBO_NODE_COMMENT:
+					requireParent(parent, htmlNode)->add_child_comment(htmlNode->v.text.text);
+					break;
+				default:
+					throw std::logic_error("Unable to convert HTML to XML: Unsupported GumboNode type: " + std::to_string(htmlNode->type));
+			}
+		}
+
+	public:
+
+		/**
+		 * Parses the given HTML text.
+		 *
+		 * @param htmlText whole HTML document
+		 * @throws std::logic_error if Gumbo returns no tree
+		 */
+		explicit GumboTree(std::string htmlText) {
+			// Take over the buffer of the by-value parameter instead of copying the whole document once more.
+			this->htmlText.swap(htmlText);
+			tree = gumbo_parse_with_options(&options, this->htmlText.c_str(), this->htmlText.size());
+			if (tree == nullptr) throw std::logic_error("Unable to parse HTML text.");
+		}
+
+		// The tree is owned by this object; a copy would release it twice.
+		GumboTree(const GumboTree&) = delete;
+		GumboTree& operator=(const GumboTree&) = delete;
+
+		~GumboTree() {
+			if (tree) gumbo_destroy_output(&options, tree);
+		}
+
+		/**
+		 * Copies the document type (if there is one) and the element tree into the given document.
+		 *
+		 * @param document target XML document
+		 * @throws std::invalid_argument if document is nullptr
+		 * @throws std::logic_error if a node can not be converted
+		 */
+		void exportTo(xmlpp::Document* document) {
+			if (document == nullptr) throw std::invalid_argument("Unable to convert HTML to XML: missing target document.");
+			const GumboDocument& doctype = tree->document->v.document;
+			// The doctype name is only read when Gumbo reports a DOCTYPE and the name is set and not empty.
+			if (doctype.has_doctype && doctype.name != nullptr && doctype.name[0] != '\0') document->set_internal_subset(doctype.name, doctype.public_identifier, doctype.system_identifier);
+			html2xml(document, nullptr, tree->root);
+		}
+	};
+
public:
XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) {
}
-
+
void setOption(const std::string& uri, const std::string& value) {
}
void process() {
- parser->parse_stream(*input);
+		// Parse the input as HTML (Gumbo) and copy the result into the document of the XML parser.
+		GumboTree gumboTree(readInput());
+		gumboTree.exportTo(parser->get_document());
}
};