first working version v_0
author František Kučera <franta-hg@frantovo.cz>
Sun, 21 Feb 2021 22:11:49 +0100
branch v_0
changeset 1 db5418b04c63
parent 0 2f783f0573fa
child 2 4b938619848b
first working version
src/CMakeLists.txt
src/XMLDocumentConstructor.h
--- a/src/CMakeLists.txt	Sat Feb 20 20:32:56 2021 +0100
+++ b/src/CMakeLists.txt	Sun Feb 21 22:11:49 2021 +0100
@@ -17,7 +17,7 @@
 
 # Relpipe libraries:
 INCLUDE(FindPkgConfig)
-pkg_check_modules (RELPIPE_LIBS relpipe-lib-writer.cpp relpipe-lib-cli.cpp libxml++-2.6)
+pkg_check_modules (RELPIPE_LIBS relpipe-lib-writer.cpp relpipe-lib-cli.cpp libxml++-2.6 gumbo)
 include_directories(${RELPIPE_LIBS_INCLUDE_DIRS})
 link_directories(${RELPIPE_LIBS_LIBRARY_DIRS})
 
--- a/src/XMLDocumentConstructor.h	Sat Feb 20 20:32:56 2021 +0100
+++ b/src/XMLDocumentConstructor.h	Sun Feb 21 22:11:49 2021 +0100
@@ -20,22 +20,96 @@
 namespace in {
 namespace xmltable {
 
+#include <sstream>
+#include <stdexcept>
+
 #include <libxml++-2.6/libxml++/libxml++.h>
+#include <gumbo.h>
 
 class XMLDocumentConstructor {
 private:
 	std::istream* input = nullptr;
 	xmlpp::DomParser* parser = nullptr;
+
+	std::string readInput() {
+		std::string result(std::istreambuf_iterator<char>(*input),{});
+		return result;
+	}
+
+	class GumboTree {
+	private:
+		std::string htmlText;
+		GumboOptions options = kGumboDefaultOptions;
+		GumboOutput* tree = nullptr;
+
+		void html2xml(xmlpp::Document* document, xmlpp::Element* parent, GumboNode* htmlNode) {
+			if (htmlNode->type == GUMBO_NODE_ELEMENT || htmlNode->type == GUMBO_NODE_TEMPLATE) {
+				std::string elementName = gumbo_normalized_tagname(htmlNode->v.element.tag);
+
+				if (htmlNode->v.element.tag == GUMBO_TAG_UNKNOWN || elementName.empty()) {
+					elementName = "UNKNOWN";
+					// TODO: find original element name, maybe gumbo_normalize_svg_tagname()
+				}
+
+				// TODO: optional support for namespaces
+				xmlpp::Element* element;
+				if (parent == nullptr) {
+					element = document->create_root_node(elementName);
+				} else {
+					element = parent->add_child(elementName);
+				}
+
+				for (size_t i = 0; i < htmlNode->v.element.attributes.length; i++) {
+					GumboAttribute* htmlAttribute = static_cast<GumboAttribute*> (htmlNode->v.element.attributes.data[i]);
+					element->set_attribute(htmlAttribute->name, htmlAttribute->value);
+				}
+
+				for (size_t i = 0; i < htmlNode->v.element.children.length; i++) {
+					html2xml(document, element, static_cast<GumboNode*> (htmlNode->v.element.children.data[i]));
+				}
+
+
+			} else if (htmlNode->type == GUMBO_NODE_TEXT || htmlNode->type == GUMBO_NODE_WHITESPACE) {
+				std::string value = htmlNode->v.text.text;
+				parent->add_child_text(value);
+			} else if (htmlNode->type == GUMBO_NODE_CDATA) {
+				parent->add_child_cdata(htmlNode->v.text.text);
+			} else if (htmlNode->type == GUMBO_NODE_COMMENT) {
+				parent->add_child_comment(htmlNode->v.text.text);
+			} else {
+				throw std::logic_error("Unable to convert HTML to XML: Unsupported GumboNode type: " + std::to_string(htmlNode->type));
+			}
+		}
+
+	public:
+
+		GumboTree(std::string htmlText) : htmlText(htmlText) {
+			tree = gumbo_parse_with_options(&options, this->htmlText.c_str(), this->htmlText.size());
+			if (tree == nullptr) throw std::logic_error("Unable to parse HTML text.");
+		}
+
+		virtual ~GumboTree() {
+			if (tree) gumbo_destroy_output(&options, tree);
+		}
+
+		void exportTo(xmlpp::Document* document) {
+			auto doctype = tree->document->v.document;
+			if (strlen(doctype.name)) document->set_internal_subset(doctype.name, doctype.public_identifier, doctype.system_identifier);
+			html2xml(document, nullptr, tree->root);
+		}
+	};
+
 public:
 
 	XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) {
 	}
-	
+
 	void setOption(const std::string& uri, const std::string& value) {
 	}
 
 	void process() {
-		parser->parse_stream(*input);
+		GumboTree gumboTree(readInput());
+		gumboTree.exportTo(parser->get_document());
 	}
 };