src/XMLDocumentConstructor.h
branchv_0
changeset 1 db5418b04c63
parent 0 2f783f0573fa
child 2 4b938619848b
equal deleted inserted replaced
0:2f783f0573fa 1:db5418b04c63
    18 
    18 
    19 namespace relpipe {
    19 namespace relpipe {
    20 namespace in {
    20 namespace in {
    21 namespace xmltable {
    21 namespace xmltable {
    22 
    22 
       
    23 #include <sstream>
       
    24 #include <stdexcept>
       
    25 
    23 #include <libxml++-2.6/libxml++/libxml++.h>
    26 #include <libxml++-2.6/libxml++/libxml++.h>
       
    27 #include <gumbo.h>
    24 
    28 
    25 class XMLDocumentConstructor {
    29 class XMLDocumentConstructor {
    26 private:
    30 private:
    27 	std::istream* input = nullptr;
    31 	std::istream* input = nullptr;
    28 	xmlpp::DomParser* parser = nullptr;
    32 	xmlpp::DomParser* parser = nullptr;
       
    33 
       
    34 	std::string readInput() {
       
    35 		std::string result(std::istreambuf_iterator<char>(*input),{});
       
    36 		return result;
       
    37 	}
       
    38 
       
    39 	class GumboTree {
       
    40 	private:
       
    41 		std::string htmlText;
       
    42 		GumboOptions options = kGumboDefaultOptions;
       
    43 		GumboOutput* tree = nullptr;
       
    44 
       
    45 		void html2xml(xmlpp::Document* document, xmlpp::Element* parent, GumboNode* htmlNode) {
       
    46 			if (htmlNode->type == GUMBO_NODE_ELEMENT || htmlNode->type == GUMBO_NODE_TEMPLATE) {
       
    47 				std::string elementName = gumbo_normalized_tagname(htmlNode->v.element.tag);
       
    48 
       
    49 				if (htmlNode->v.element.tag == GUMBO_TAG_UNKNOWN || elementName.empty()) {
       
    50 					elementName = "UNKNOWN";
       
    51 					// TODO: find original element name, maybe gumbo_normalize_svg_tagname()
       
    52 				}
       
    53 
       
    54 				// TODO: optional support for namespaces
       
    55 				xmlpp::Element* element;
       
    56 				if (parent == nullptr) {
       
    57 					element = document->create_root_node(elementName);
       
    58 				} else {
       
    59 					element = parent->add_child(elementName);
       
    60 				}
       
    61 
       
    62 				for (size_t i = 0; i < htmlNode->v.element.attributes.length; i++) {
       
    63 					GumboAttribute* htmlAttribute = static_cast<GumboAttribute*> (htmlNode->v.element.attributes.data[i]);
       
    64 					element->set_attribute(htmlAttribute->name, htmlAttribute->value);
       
    65 				}
       
    66 
       
    67 				for (size_t i = 0; i < htmlNode->v.element.children.length; i++) {
       
    68 					html2xml(document, element, static_cast<GumboNode*> (htmlNode->v.element.children.data[i]));
       
    69 				}
       
    70 
       
    71 
       
    72 			} else if (htmlNode->type == GUMBO_NODE_TEXT || htmlNode->type == GUMBO_NODE_WHITESPACE) {
       
    73 				std::string value = htmlNode->v.text.text;
       
    74 				parent->add_child_text(value);
       
    75 			} else if (htmlNode->type == GUMBO_NODE_CDATA) {
       
    76 				parent->add_child_cdata(htmlNode->v.text.text);
       
    77 			} else if (htmlNode->type == GUMBO_NODE_COMMENT) {
       
    78 				parent->add_child_comment(htmlNode->v.text.text);
       
    79 			} else {
       
    80 				throw std::logic_error("Unable to convert HTML to XML: Unsupported GumboNode type: " + std::to_string(htmlNode->type));
       
    81 			}
       
    82 		}
       
    83 
       
    84 	public:
       
    85 
       
    86 		GumboTree(std::string htmlText) : htmlText(htmlText) {
       
    87 			tree = gumbo_parse_with_options(&options, this->htmlText.c_str(), this->htmlText.size());
       
    88 			if (tree == nullptr) throw std::logic_error("Unable to parse HTML text.");
       
    89 		}
       
    90 
       
    91 		virtual ~GumboTree() {
       
    92 			if (tree) gumbo_destroy_output(&options, tree);
       
    93 		}
       
    94 
       
    95 		void exportTo(xmlpp::Document* document) {
       
    96 			auto doctype = tree->document->v.document;
       
    97 			if (strlen(doctype.name)) document->set_internal_subset(doctype.name, doctype.public_identifier, doctype.system_identifier);
       
    98 			html2xml(document, nullptr, tree->root);
       
    99 		}
       
   100 	};
       
   101 
    29 public:
   102 public:
    30 
   103 
    31 	XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) {
   104 	XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) {
    32 	}
   105 	}
    33 	
   106 
    34 	void setOption(const std::string& uri, const std::string& value) {
   107 	void setOption(const std::string& uri, const std::string& value) {
    35 	}
   108 	}
    36 
   109 
    37 	void process() {
   110 	void process() {
    38 		parser->parse_stream(*input);
   111 		GumboTree gumboTree(readInput());
       
   112 		gumboTree.exportTo(parser->get_document());
    39 	}
   113 	}
    40 };
   114 };
    41 
   115 
    42 }
   116 }
    43 }
   117 }