# HG changeset patch
# User František Kučera
# Date 1613941909 -3600
# Node ID db5418b04c63769c97618efbdcfec3df020a2923
# Parent  2f783f0573fa8aa0eeb74b8ae9377efecdb868f4
first working version

diff -r 2f783f0573fa -r db5418b04c63 src/CMakeLists.txt
--- a/src/CMakeLists.txt	Sat Feb 20 20:32:56 2021 +0100
+++ b/src/CMakeLists.txt	Sun Feb 21 22:11:49 2021 +0100
@@ -17,7 +17,7 @@
 
 # Relpipe libraries:
 INCLUDE(FindPkgConfig)
-pkg_check_modules (RELPIPE_LIBS relpipe-lib-writer.cpp relpipe-lib-cli.cpp libxml++-2.6)
+pkg_check_modules (RELPIPE_LIBS relpipe-lib-writer.cpp relpipe-lib-cli.cpp libxml++-2.6 gumbo)
 include_directories(${RELPIPE_LIBS_INCLUDE_DIRS})
 link_directories(${RELPIPE_LIBS_LIBRARY_DIRS})
 
diff -r 2f783f0573fa -r db5418b04c63 src/XMLDocumentConstructor.h
--- a/src/XMLDocumentConstructor.h	Sat Feb 20 20:32:56 2021 +0100
+++ b/src/XMLDocumentConstructor.h	Sun Feb 21 22:11:49 2021 +0100
@@ -20,22 +20,140 @@
 namespace in {
 namespace xmltable {
 
+#include <iterator>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
 #include <libxml++-2.6/libxml++/libxml++.h>
+#include <gumbo.h>
 
 class XMLDocumentConstructor {
 private:
 	std::istream* input = nullptr;
 	xmlpp::DomParser* parser = nullptr;
+
+	/**
+	 * Reads the whole remaining input stream into memory, because
+	 * gumbo_parse_with_options() is called with one complete buffer.
+	 *
+	 * @return all characters that could be read from the input stream
+	 */
+	std::string readInput() {
+		return std::string(std::istreambuf_iterator<char>(*input), std::istreambuf_iterator<char>());
+	}
+
+	/**
+	 * Holds the HTML text and the Gumbo parse tree built from it
+	 * and copies that tree into a libxml++ document.
+	 */
+	class GumboTree {
+	private:
+		std::string htmlText;
+		GumboOptions options = kGumboDefaultOptions;
+		GumboOutput* tree = nullptr;
+
+		/** Maps a null C string to an empty one, so it can be passed on as a string safely. */
+		static const char* nullToEmpty(const char* value) {
+			return value == nullptr ? "" : value;
+		}
+
+		/**
+		 * Recursively converts one Gumbo node including its subtree to XML.
+		 *
+		 * @param document target document; used to create the root element
+		 * @param parent XML element that receives the converted node; nullptr means: create the root element
+		 * @param htmlNode Gumbo node to be converted
+		 * @throws std::logic_error if the node type is not supported or a non-element node has no parent
+		 */
+		void html2xml(xmlpp::Document* document, xmlpp::Element* parent, GumboNode* htmlNode) {
+			if (htmlNode->type == GUMBO_NODE_ELEMENT || htmlNode->type == GUMBO_NODE_TEMPLATE) {
+				std::string elementName = gumbo_normalized_tagname(htmlNode->v.element.tag);
+
+				if (htmlNode->v.element.tag == GUMBO_TAG_UNKNOWN || elementName.empty()) {
+					elementName = "UNKNOWN";
+					// TODO: find original element name, maybe gumbo_normalize_svg_tagname()
+				}
+
+				// TODO: optional support for namespaces
+				xmlpp::Element* element;
+				if (parent == nullptr) {
+					element = document->create_root_node(elementName);
+				} else {
+					element = parent->add_child(elementName);
+				}
+
+				for (size_t i = 0; i < htmlNode->v.element.attributes.length; i++) {
+					GumboAttribute* htmlAttribute = static_cast<GumboAttribute*> (htmlNode->v.element.attributes.data[i]);
+					element->set_attribute(htmlAttribute->name, htmlAttribute->value);
+				}
+
+				for (size_t i = 0; i < htmlNode->v.element.children.length; i++) {
+					html2xml(document, element, static_cast<GumboNode*> (htmlNode->v.element.children.data[i]));
+				}
+			} else if (parent == nullptr) {
+				// Text, CDATA and comments are attached to a parent element, so they cannot come first.
+				throw std::logic_error("Unable to convert HTML to XML: Node without parent element is not an element: " + std::to_string(htmlNode->type));
+			} else if (htmlNode->type == GUMBO_NODE_TEXT || htmlNode->type == GUMBO_NODE_WHITESPACE) {
+				parent->add_child_text(htmlNode->v.text.text);
+			} else if (htmlNode->type == GUMBO_NODE_CDATA) {
+				parent->add_child_cdata(htmlNode->v.text.text);
+			} else if (htmlNode->type == GUMBO_NODE_COMMENT) {
+				parent->add_child_comment(htmlNode->v.text.text);
+			} else {
+				throw std::logic_error("Unable to convert HTML to XML: Unsupported GumboNode type: " + std::to_string(htmlNode->type));
+			}
+		}
+
+	public:
+
+		/**
+		 * Parses the given HTML text with the default Gumbo options.
+		 *
+		 * @param htmlText text of the HTML document
+		 * @throws std::logic_error if Gumbo returns no parse tree
+		 */
+		explicit GumboTree(std::string htmlText) : htmlText(std::move(htmlText)) {
+			tree = gumbo_parse_with_options(&options, this->htmlText.c_str(), this->htmlText.size());
+			if (tree == nullptr) throw std::logic_error("Unable to parse HTML text.");
+		}
+
+		// The tree is owned through a raw pointer, so a copy would destroy it twice.
+		GumboTree(const GumboTree&) = delete;
+		GumboTree& operator=(const GumboTree&) = delete;
+
+		virtual ~GumboTree() {
+			if (tree) gumbo_destroy_output(&options, tree);
+		}
+
+		/**
+		 * Copies the parsed HTML into the given XML document:
+		 * first the DOCTYPE (if the HTML has a named one), then the element tree.
+		 *
+		 * @param document XML document to be filled
+		 * @throws std::logic_error if a node cannot be converted
+		 */
+		void exportTo(xmlpp::Document* document) {
+			const GumboDocument& doctype = tree->document->v.document;
+			// Check has_doctype and null before reading the name: without a DOCTYPE there may be no string to read.
+			if (doctype.has_doctype && doctype.name != nullptr && doctype.name[0] != '\0') {
+				document->set_internal_subset(doctype.name, nullToEmpty(doctype.public_identifier), nullToEmpty(doctype.system_identifier));
+			}
+			html2xml(document, nullptr, tree->root);
+		}
+	};
+
 public:
 
 	XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) {
 	}
-	
+
 	void setOption(const std::string& uri, const std::string& value) {
 	}
 
 	void process() {
-		parser->parse_stream(*input);
+		GumboTree gumboTree(readInput());
+		gumboTree.exportTo(parser->get_document());
 	}
 };
 