/**
* Relational pipes
* Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <istream>
#include <iterator>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>

#include <libxml++-2.6/libxml++/libxml++.h>
#include <gumbo.h>

namespace relpipe {
namespace in {
namespace xmltable {
class XMLDocumentConstructor {
private:
std::istream* input = nullptr;
xmlpp::DomParser* parser = nullptr;
std::string readInput() {
std::string result(std::istreambuf_iterator<char>(*input),{});
return result;
}
class GumboTree {
private:
std::string htmlText;
GumboOptions options = kGumboDefaultOptions;
GumboOutput* tree = nullptr;
void html2xml(xmlpp::Document* document, xmlpp::Element* parent, GumboNode* htmlNode) {
if (htmlNode->type == GUMBO_NODE_ELEMENT || htmlNode->type == GUMBO_NODE_TEMPLATE) {
std::string elementName = gumbo_normalized_tagname(htmlNode->v.element.tag);
if (htmlNode->v.element.tag == GUMBO_TAG_UNKNOWN || elementName.empty()) {
elementName = "UNKNOWN";
// TODO: find original element name, maybe gumbo_normalize_svg_tagname()
}
// TODO: optional support for namespaces
xmlpp::Element* element;
if (parent == nullptr) {
element = document->create_root_node(elementName);
} else {
element = parent->add_child(elementName);
}
for (size_t i = 0; i < htmlNode->v.element.attributes.length; i++) {
GumboAttribute* htmlAttribute = static_cast<GumboAttribute*> (htmlNode->v.element.attributes.data[i]);
element->set_attribute(htmlAttribute->name, htmlAttribute->value);
}
for (size_t i = 0; i < htmlNode->v.element.children.length; i++) {
html2xml(document, element, static_cast<GumboNode*> (htmlNode->v.element.children.data[i]));
}
} else if (htmlNode->type == GUMBO_NODE_TEXT || htmlNode->type == GUMBO_NODE_WHITESPACE) {
std::string value = htmlNode->v.text.text;
parent->add_child_text(value);
} else if (htmlNode->type == GUMBO_NODE_CDATA) {
parent->add_child_cdata(htmlNode->v.text.text);
} else if (htmlNode->type == GUMBO_NODE_COMMENT) {
parent->add_child_comment(htmlNode->v.text.text);
} else {
throw std::logic_error("Unable to convert HTML to XML: Unsupported GumboNode type: " + std::to_string(htmlNode->type));
}
}
public:
GumboTree(std::string htmlText) : htmlText(htmlText) {
tree = gumbo_parse_with_options(&options, this->htmlText.c_str(), this->htmlText.size());
if (tree == nullptr) throw std::logic_error("Unable to parse HTML text.");
}
virtual ~GumboTree() {
if (tree) gumbo_destroy_output(&options, tree);
}
void exportTo(xmlpp::Document* document) {
auto doctype = tree->document->v.document;
if (strlen(doctype.name)) document->set_internal_subset(doctype.name, doctype.public_identifier, doctype.system_identifier);
html2xml(document, nullptr, tree->root);
}
};
public:
XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) {
}
void setOption(const std::string& uri, const std::string& value) {
}
void process() {
GumboTree gumboTree(readInput());
gumboTree.exportTo(parser->get_document());
}
};
}
}
}