streamlet-examples/xpath.cpp
author František Kučera <franta-hg@frantovo.cz>
Fri, 13 May 2022 21:35:30 +0200
branchv_0
changeset 96 c34106244a54
parent 78 5a63bf594f53
permissions -rw-r--r--
portable order of (i++) parameters

/**
 * Relational pipes
 * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include "streamlet-common.h"

#include <unistd.h>
#include <regex>
#include <libxml++-2.6/libxml++/libxml++.h>

/**
 * This streamlet provides values from XML files.
 * It uses the XPath language to define which portion of the XML should be returned.
 * 
 * With no options it does not provide any attributes.
 * 
 * XPath expressions are passed as 'attribute' options.
 * e.g. --option 'attribute' 'name()' will return a single attribute with the name of the root node.
 * 
 * Attributes can be renamed using aliases: --option 'attribute' 'name()' --as 'name'. Otherwise the full XPath expression is used as a name.
 * Number of aliases should match the number of attributes (otherwise only first attributes are renamed, because aliases are global, not relative to the --option).
 * 
 * Like relpipe-in-xmltable, this streamlet supports several modes:
 *  - string
 *  - boolean
 *  - raw-xml
 *  - line-number
 *  - xpath
 * 
 * The raw-xml mode provides a portion of the original XML defined by the XPath and can be further parametrized by options:
 *  - raw-xml-nodelist-wrapper-name
 *  - raw-xml-nodelist-wrapper-uri
 *  - raw-xml-nodelist-wrapper-prefix
 *  - raw-xml-attribute-wrapper-name
 *  - raw-xml-attribute-wrapper-uri
 *  - raw-xml-attribute-wrapper-prefix
 * 
 * XInclude processing may be turned on using: --option xinclude true
 * 
 * TODO: more OOP, move to separate repository, proper CMake project, clean-up, stabilize API
 */
class XPathStreamlet : public Streamlet {
private:
	xmlpp::Node::PrefixNsMap ns;

	void findXmlnsInEnvironment() {
		std::regex xmlnsEnvPattern("xmlns_(.*)=(.*)");
		std::cmatch match;
		for (char **env = environ; *env; env++) if (std::regex_match(*env, match, xmlnsEnvPattern)) ns[std::string(match[1])] = match[2];
	}

	void findXmlnsInOptions() {
		for (Option o : getOptions(std::wregex(L"xmlns[:_](.*)"))) ns[toBytes(o.nameMatch[1])] = toBytes(o.value);
		for (Option o : getOptions(std::wregex(L"xmlns"), std::wregex(L"([^:]+):(.*)"))) ns[toBytes(o.valueMatch[1])] = toBytes(o.valueMatch[2]);
	}

	bool xinclude = false;

	void findXIncludeOptions() {
		for (Option o : getOptions(L"xinclude")) xinclude = o.value == L"true";
	}

	std::wstring rawXmlNodeListWrapperName;
	std::wstring rawXmlNodeListWrapperUri;
	std::wstring rawXmlNodeListWrapperPrefix;

	std::wstring rawXmlAttributeWrapperName = L"attribute";
	std::wstring rawXmlAttributeWrapperUri;
	std::wstring rawXmlAttributeWrapperPrefix;

	void findRawXmlOptions() {
		for (Option o : getOptions(L"raw-xml-nodelist-wrapper-name")) rawXmlNodeListWrapperName = o.value;
		for (Option o : getOptions(L"raw-xml-nodelist-wrapper-uri")) rawXmlNodeListWrapperUri = o.value;
		for (Option o : getOptions(L"raw-xml-nodelist-wrapper-prefix")) rawXmlNodeListWrapperPrefix = o.value;
		for (Option o : getOptions(L"raw-xml-attribute-wrapper-name")) rawXmlAttributeWrapperName = o.value;
		for (Option o : getOptions(L"raw-xml-attribute-wrapper-uri")) rawXmlAttributeWrapperUri = o.value;
		for (Option o : getOptions(L"raw-xml-attribute-wrapper-prefix")) rawXmlAttributeWrapperPrefix = o.value;
	}

	// Modes should share the logic of relpipe-in-xmltable

	enum class Mode {
		STRING,
		BOOLEAN,
		// TODO: support also XML number, when we have a rational or decimal numbers in Relational pipes
		RAW_XML,
		LINE_NUMBER,
		XPATH
	};

	Mode toMode(std::wstring modeName) {
		if (modeName == L"string") return Mode::STRING;
		else if (modeName == L"boolean") return Mode::BOOLEAN;
		else if (modeName == L"raw-xml") return Mode::RAW_XML;
		else if (modeName == L"line-number") return Mode::LINE_NUMBER;
		else if (modeName == L"xpath") return Mode::XPATH;
		else throw std::invalid_argument("Unsupported mode: " + toBytes(modeName));
	}

	std::wstring toType(Mode mode) {
		if (mode == Mode::BOOLEAN) return BOOLEAN;
		else if (mode == Mode::LINE_NUMBER) return INTEGER;
		else return STRING;
	}

	std::wstring formatRawXML(std::wstring rawXML) {
		std::wregex pattern(L"^<\\?xml version=\"1.0\" encoding=\"UTF-8\"\\?>\n|\n$");
		return std::regex_replace(rawXML, pattern, L"");
	}

	void importNode(xmlpp::Node* parent, xmlpp::Node* child) {
		if (dynamic_cast<xmlpp::AttributeNode*> (child)) parent->add_child_with_new_ns(
				toBytes(rawXmlAttributeWrapperName),
				toBytes(rawXmlAttributeWrapperUri),
				toBytes(rawXmlAttributeWrapperPrefix))->import_node(child);
		else parent->import_node(child, true);
	}

	void importNode(xmlpp::Document* document, xmlpp::Node* child) {
		if (dynamic_cast<xmlpp::AttributeNode*> (child)) document->create_root_node(
				toBytes(rawXmlAttributeWrapperName),
				toBytes(rawXmlAttributeWrapperUri),
				toBytes(rawXmlAttributeWrapperPrefix))->import_node(child);
		else document->create_root_node_by_import(child, true);
	}

	std::wstring toRawXML(xmlpp::Node* parent, std::string xpath, xmlpp::Node::PrefixNsMap ns) {
		xmlpp::Document d;
		xmlpp::NodeSet nodes = parent->find(xpath, ns);

		if (rawXmlNodeListWrapperName.size()) {
			d.create_root_node(
					toBytes(rawXmlNodeListWrapperName),
					toBytes(rawXmlNodeListWrapperUri),
					toBytes(rawXmlNodeListWrapperPrefix));
			for (xmlpp::Node* node : nodes) importNode(d.get_root_node(), node);
		} else {
			if (nodes.size() == 1) importNode(&d, nodes[0]);
			else if (nodes.size() > 1) throw std::invalid_argument("Multiple nodes found where only one was expected. Use nodelist wrapper."); // TODO: better relpipe exception
			else return L"";
		}
		return formatRawXML(fromBytes(d.write_to_string()));
	}

	class XPathAttribute {
	public:

		std::wstring name;
		std::wstring xpath;
		Mode mode = Mode::STRING;
	};

	std::vector<XPathAttribute> xpathAttributes;

protected:

	std::vector<AttributeMetadata> getOutputAttributesMetadata() override {
		findXmlnsInEnvironment();
		findXmlnsInOptions();
		findRawXmlOptions();
		findXIncludeOptions();

		std::vector<AttributeMetadata> oam;

		std::vector<Option> modeOptions = getOptions(L"mode");
		std::vector<Option> attributeOptions = getOptions(L"attribute");
		for (int i = 0, limit = attributeOptions.size(); i < limit; i++) {
			Mode mode = i < modeOptions.size() ? toMode(modeOptions[i].value) : Mode::STRING;
			std::wstring alias = getAlias(i, attributeOptions[i].value);
			xpathAttributes.push_back({alias, attributeOptions[i].value, mode});
			oam.push_back({alias, toType(mode)});
		}

		return oam;
	}

	std::vector<OutputAttribute> getOutputAttributes() override {
		std::vector<OutputAttribute> oa;

		try {
			xmlpp::DomParser parser;
			parser.parse_file(toBytes(getCurrentFile()));
			if (xinclude) parser.get_document()->process_xinclude(true);
			xmlpp::Element* root = parser.get_document()->get_root_node();

			for (XPathAttribute xpathAttribute : xpathAttributes) {
				std::string xpath = toBytes(xpathAttribute.xpath);
				std::wstring result;
				bool isNull = false;

				if (xpathAttribute.mode == Mode::STRING) {
					result = fromBytes(root->eval_to_string(xpath, ns));
				} else if (xpathAttribute.mode == Mode::BOOLEAN) {
					result = root->eval_to_boolean(xpath, ns) ? L"true" : L"false";
				} else if (xpathAttribute.mode == Mode::LINE_NUMBER) {
					xmlpp::NodeSet attributeNodes = root->find(xpath, ns);
					if (attributeNodes.size()) result = std::to_wstring(attributeNodes[0]->get_line());
					else isNull = true;
				} else if (xpathAttribute.mode == Mode::XPATH) {
					xmlpp::NodeSet attributeNodes = root->find(xpath, ns);
					if (attributeNodes.size()) result = fromBytes(attributeNodes[0]->get_path());
					else isNull = true;
				} else if (xpathAttribute.mode == Mode::RAW_XML) {
					result = toRawXML(root, xpath, ns);
				} else {
					throw std::logic_error("Unsupported mode."); // should never happer
				}

				oa.push_back({result, isNull});
			}
		} catch (xmlpp::parse_error& e) {
			for (XPathAttribute xpathAttribute : xpathAttributes) oa.push_back({L"", true});
			// invalid XML → xmlpp::parse_error → just skip this file
			// invalid XPath → xmlpp::exception → failure
		}


		return oa;
	}
};

// NOTE(review): STREAMLET_RUN is not defined in this file – presumably it comes from streamlet-common.h
// and generates the program entry point that runs the given streamlet class; confirm there.
STREAMLET_RUN(XPathStreamlet)