add --xml-attribute support (both input and output attributes) v_0
author František Kučera <franta-hg@frantovo.cz>
Wed, 30 Dec 2020 17:09:15 +0100
branch v_0
changeset 7 7f2d09c3b1de
parent 6 e498b3466342
child 8 9f95cfd68f25
add --xml-attribute support (both input and output attributes)
src/XPathHandler.h
--- a/src/XPathHandler.h	Wed Dec 30 10:33:24 2020 +0100
+++ b/src/XPathHandler.h	Wed Dec 30 17:09:15 2020 +0100
@@ -21,6 +21,7 @@
 #include <vector>
 #include <codecvt>
 #include <regex>
+#include <stdexcept>
 
 #include <libxml++-2.6/libxml++/libxml++.h>
 
@@ -83,19 +84,54 @@
 		return convertor.from_bytes(value);
 	}
 
+	bool isXmlAttribute(const relpipe::common::type::StringX& attributeName) { // true when the whole name matches any regular expression listed in the current relation's xmlAttributes
+		for (const auto& pattern : currentRelationConfiguration->xmlAttributes) if (std::regex_match(attributeName, std::wregex(pattern))) return true; // bound by const reference: the pattern is only read, so no per-iteration copy is needed
+		return false;
+	}
+
+	const relpipe::common::type::StringX formatRawXML(const relpipe::common::type::StringX& rawXML) { // returns rawXML without the leading XML declaration line and without a trailing newline
+		// TODO: move to a common library (used also in relpipe-in-xmltable)
+		static const std::wregex pattern(L"^<\\?xml version=\"1.0\" encoding=\"UTF-8\"\\?>\n|\n$"); // constant expression: compiled once instead of on every call
+		return std::regex_replace(rawXML, pattern, L"");
+	}
+
+	/** Converts an element to a relational value: XML markup when asXml is set, otherwise its child text; a null element gives an empty string. */
+	const relpipe::common::type::StringX serialize(xmlpp::Element* element, bool asXml) {
+		if (!element) return L"";
+		if (asXml) {
+			// the element is imported (recursively) as the root of a temporary document so that it can be written out on its own
+			xmlpp::Document standalone;
+			standalone.create_root_node_by_import(element, true);
+			return formatRawXML(x2s(standalone.write_to_string()));
+		}
+		// plain-text mode: content of the node returned by get_child_text(), or an empty string when there is none
+		auto* text = element->get_child_text();
+		return text ? x2s(text->get_content()) : L"";
+	}
+
+	xmlpp::Element* findSingleElement(const xmlpp::NodeSet& nodeset) { // reduces an XPath result to one element: nullptr when empty, std::invalid_argument when ambiguous or not an element
+		if (nodeset.empty()) return nullptr;
+		if (nodeset.size() > 1) throw std::invalid_argument("XPath should find one or zero elements.");
+		if (auto* found = dynamic_cast<xmlpp::Element*> (nodeset[0])) return found;
+		if (nodeset[0]->get_path() == "/") return findSingleElement(nodeset[0]->find("*")); // the "/" node was selected: retry with its child elements, so "/" works like "/*" (root element in both cases)
+		throw std::invalid_argument("XPath should find an element, not other kinds of nodes.");
+	}
+
 	void writeInputAttributes() {
 		for (xmlpp::Node* attributeNode : recordElement->get_children()) {
 			if (xmlpp::Element * attributeElement = dynamic_cast<xmlpp::Element*> (attributeNode)) {
-				auto value = attributeElement->get_child_text()->get_content();
-				relationalWriter->writeAttribute(x2s(value));
+				bool asXml = isXmlAttribute(x2s(attributeElement->get_attribute("name")->get_value())); // decided by the element's "name" attribute; NOTE(review): the get_attribute() result is dereferenced unchecked, so the attribute is assumed to be always present
+				relationalWriter->writeAttribute(serialize(asXml ? dynamic_cast<xmlpp::Element*> (attributeElement->get_first_child()) : attributeElement, asXml)); // XML attribute: the first child node is written as markup (empty value when it is not an element); otherwise the element's own child text is written
 			}
 		}
 	}
 
 	void writeOutputAttributes() {
 		for (auto oa : currentRelationConfiguration->outputAttributes) {
-			auto value = recordElement->eval_to_string(s2x(oa.xpath), xmlns);
-			relationalWriter->writeAttribute(x2s(value));
+			relpipe::common::type::StringX value; // one value is written per configured output attribute
+			if (isXmlAttribute(oa.name)) value = serialize(findSingleElement(recordElement->find(s2x(oa.xpath), xmlns)), true); // XML attribute: the XPath may select at most one element, which is written as markup (nothing found gives an empty value)
+			else value = x2s(recordElement->eval_to_string(s2x(oa.xpath), xmlns)); // ordinary attribute: the XPath is evaluated to a string
+			relationalWriter->writeAttribute(value);
 		}
 	}
 
@@ -146,10 +182,20 @@
 		if (currentRelationConfiguration) {
 			relpipe::reader::handlers::AttributeMetadata attributeMetadata = currentReaderMetadata[currentAttributeIndex];
 
+			// TODO: Parallel processing of records like in relpipe-in-filesystem? Or share common code with the XPath streamlet? (streamlets are parallelized)
+
 			xmlpp::Element* attributeElement = recordElement->add_child(xmlNameCodec.encode(s2x(attributeMetadata.getAttributeName())));
 			attributeElement->set_attribute("name", s2x(attributeMetadata.getAttributeName()));
 			attributeElement->set_attribute("type", s2x(attributeMetadata.getTypeName()));
-			attributeElement->add_child_text(s2x(value));
+			if (isXmlAttribute(attributeMetadata.getAttributeName())) {
+				if (value.size()) {
+					xmlpp::DomParser attributeParser;
+					attributeParser.parse_memory(s2x(value));
+					attributeElement->import_node(attributeParser.get_document()->get_root_node(), true);
+				}
+			} else {
+				attributeElement->add_child_text(s2x(value));
+			}
 
 			if (currentAttributeIndex == 0) {
 				recordElement->set_attribute("number", std::to_string(currentRecordNumber));