src/XMLDocumentConstructor.h
author František Kučera <franta-hg@frantovo.cz>
Sun, 21 Feb 2021 23:52:46 +0100
branch v_0
changeset 2 4b938619848b
parent 1 db5418b04c63
permissions -rw-r--r--
comment on xmlns and DOCTYPE bug

/**
 * Relational pipes
 * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#pragma once

// Headers must be included outside of any namespace so that their
// declarations land in the scopes they were written for.
#include <istream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>

#include <libxml++-2.6/libxml++/libxml++.h>
#include <gumbo.h>

namespace relpipe {
namespace in {
namespace xmltable {

class XMLDocumentConstructor {
private:
	std::istream* input = nullptr;
	xmlpp::DomParser* parser = nullptr;

	std::string readInput() {
		std::string result(std::istreambuf_iterator<char>(*input),{});
		return result;
	}

	class GumboTree {
	private:
		std::string htmlText;
		GumboOptions options = kGumboDefaultOptions;
		GumboOutput* tree = nullptr;

		void html2xml(xmlpp::Document* document, xmlpp::Element* parent, GumboNode* htmlNode) {
			if (htmlNode->type == GUMBO_NODE_ELEMENT || htmlNode->type == GUMBO_NODE_TEMPLATE) {
				std::string elementName = gumbo_normalized_tagname(htmlNode->v.element.tag);

				if (htmlNode->v.element.tag == GUMBO_TAG_UNKNOWN || elementName.empty()) {
					elementName = "UNKNOWN";
					// TODO: find original element name, maybe gumbo_normalize_svg_tagname()
				}

				// TODO: optional support for namespaces
				xmlpp::Element* element;
				if (parent == nullptr) {
					element = document->create_root_node(elementName);
				} else {
					element = parent->add_child(elementName);
				}
				
				for (size_t i = 0; i < htmlNode->v.element.attributes.length; i++) {
					// FIXME: if the xmlns was set in exportTo() according to the DOCTYPE, we should not add xmlns attribute here (otherwise we would have two xmlns attributes and thus invalid xml)
					GumboAttribute* htmlAttribute = static_cast<GumboAttribute*> (htmlNode->v.element.attributes.data[i]);
					element->set_attribute(htmlAttribute->name, htmlAttribute->value);
				}

				for (size_t i = 0; i < htmlNode->v.element.children.length; i++) {
					html2xml(document, element, static_cast<GumboNode*> (htmlNode->v.element.children.data[i]));
				}


			} else if (htmlNode->type == GUMBO_NODE_TEXT || htmlNode->type == GUMBO_NODE_WHITESPACE) {
				std::string value = htmlNode->v.text.text;
				parent->add_child_text(value);
			} else if (htmlNode->type == GUMBO_NODE_CDATA) {
				parent->add_child_cdata(htmlNode->v.text.text);
			} else if (htmlNode->type == GUMBO_NODE_COMMENT) {
				parent->add_child_comment(htmlNode->v.text.text);
			} else {
				throw std::logic_error("Unable to convert HTML to XML: Unsupported GumboNode type: " + std::to_string(htmlNode->type));
			}
		}

	public:

		GumboTree(std::string htmlText) : htmlText(htmlText) {
			tree = gumbo_parse_with_options(&options, this->htmlText.c_str(), this->htmlText.size());
			if (tree == nullptr) throw std::logic_error("Unable to parse HTML text.");
		}

		virtual ~GumboTree() {
			if (tree) gumbo_destroy_output(&options, tree);
		}

		void exportTo(xmlpp::Document* document) {
			auto doctype = tree->document->v.document;
			if (strlen(doctype.name)) document->set_internal_subset(doctype.name, doctype.public_identifier, doctype.system_identifier);
			html2xml(document, nullptr, tree->root);
		}
	};

public:

	XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) {
	}

	void setOption(const std::string& uri, const std::string& value) {
	}

	void process() {
		GumboTree gumboTree(readInput());
		gumboTree.exportTo(parser->get_document());
	}
};

}
}
}