src/XMLDocumentConstructor.h
author František Kučera <franta-hg@frantovo.cz>
Sun, 21 Feb 2021 22:11:49 +0100
branchv_0
changeset 1 db5418b04c63
parent 0 2f783f0573fa
child 2 4b938619848b
permissions -rw-r--r--
first working version
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
0
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     1
/**
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     2
 * Relational pipes
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     3
 * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info)
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     4
 *
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     5
 * This program is free software: you can redistribute it and/or modify
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     6
 * it under the terms of the GNU General Public License as published by
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     7
 * the Free Software Foundation, version 3 of the License.
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     8
 *
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
     9
 * This program is distributed in the hope that it will be useful,
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    10
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    11
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    12
 * GNU General Public License for more details.
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    13
 *
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    14
 * You should have received a copy of the GNU General Public License
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    15
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    16
 */
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    17
#pragma once
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    18
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    19
#include <istream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>

#include <libxml++-2.6/libxml++/libxml++.h>
#include <gumbo.h>

namespace relpipe {
namespace in {
namespace xmltable {

0
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    28
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    29
class XMLDocumentConstructor {
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    30
private:
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    31
	std::istream* input = nullptr;
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
    32
	xmlpp::DomParser* parser = nullptr;
1
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    33
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    34
	std::string readInput() {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    35
		std::string result(std::istreambuf_iterator<char>(*input),{});
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    36
		return result;
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    37
	}
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    38
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    39
	class GumboTree {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    40
	private:
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    41
		std::string htmlText;
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    42
		GumboOptions options = kGumboDefaultOptions;
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    43
		GumboOutput* tree = nullptr;
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    44
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    45
		void html2xml(xmlpp::Document* document, xmlpp::Element* parent, GumboNode* htmlNode) {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    46
			if (htmlNode->type == GUMBO_NODE_ELEMENT || htmlNode->type == GUMBO_NODE_TEMPLATE) {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    47
				std::string elementName = gumbo_normalized_tagname(htmlNode->v.element.tag);
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    48
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    49
				if (htmlNode->v.element.tag == GUMBO_TAG_UNKNOWN || elementName.empty()) {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    50
					elementName = "UNKNOWN";
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    51
					// TODO: find original element name, maybe gumbo_normalize_svg_tagname()
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    52
				}
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    53
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    54
				// TODO: optional support for namespaces
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    55
				xmlpp::Element* element;
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    56
				if (parent == nullptr) {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    57
					element = document->create_root_node(elementName);
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    58
				} else {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    59
					element = parent->add_child(elementName);
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    60
				}
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    61
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    62
				for (size_t i = 0; i < htmlNode->v.element.attributes.length; i++) {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    63
					GumboAttribute* htmlAttribute = static_cast<GumboAttribute*> (htmlNode->v.element.attributes.data[i]);
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    64
					element->set_attribute(htmlAttribute->name, htmlAttribute->value);
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    65
				}
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    66
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    67
				for (size_t i = 0; i < htmlNode->v.element.children.length; i++) {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    68
					html2xml(document, element, static_cast<GumboNode*> (htmlNode->v.element.children.data[i]));
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    69
				}
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    70
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    71
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    72
			} else if (htmlNode->type == GUMBO_NODE_TEXT || htmlNode->type == GUMBO_NODE_WHITESPACE) {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    73
				std::string value = htmlNode->v.text.text;
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    74
				parent->add_child_text(value);
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    75
			} else if (htmlNode->type == GUMBO_NODE_CDATA) {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    76
				parent->add_child_cdata(htmlNode->v.text.text);
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    77
			} else if (htmlNode->type == GUMBO_NODE_COMMENT) {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    78
				parent->add_child_comment(htmlNode->v.text.text);
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    79
			} else {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    80
				throw std::logic_error("Unable to convert HTML to XML: Unsupported GumboNode type: " + std::to_string(htmlNode->type));
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    81
			}
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    82
		}
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    83
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    84
	public:
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    85
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    86
		GumboTree(std::string htmlText) : htmlText(htmlText) {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    87
			tree = gumbo_parse_with_options(&options, this->htmlText.c_str(), this->htmlText.size());
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    88
			if (tree == nullptr) throw std::logic_error("Unable to parse HTML text.");
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    89
		}
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    90
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    91
		virtual ~GumboTree() {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    92
			if (tree) gumbo_destroy_output(&options, tree);
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    93
		}
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    94
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    95
		void exportTo(xmlpp::Document* document) {
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    96
			auto doctype = tree->document->v.document;
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    97
			if (strlen(doctype.name)) document->set_internal_subset(doctype.name, doctype.public_identifier, doctype.system_identifier);
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    98
			html2xml(document, nullptr, tree->root);
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
    99
		}
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
   100
	};
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
   101
0
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   102
public:
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   103
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   104
	XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) {
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   105
	}
1
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
   106
0
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   107
	void setOption(const std::string& uri, const std::string& value) {
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   108
	}
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   109
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   110
	void process() {
1
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
   111
		GumboTree gumboTree(readInput());
db5418b04c63 first working version
František Kučera <franta-hg@frantovo.cz>
parents: 0
diff changeset
   112
		gumboTree.exportTo(parser->get_document());
0
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   113
	}
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   114
};
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   115
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   116
}
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   117
}
2f783f0573fa project skeleton
František Kučera <franta-hg@frantovo.cz>
parents:
diff changeset
   118
}