build DOM, filter records and evaluate XPath expressions for additional output attributes v_0
author František Kučera <franta-hg@frantovo.cz>
Wed, 30 Dec 2020 01:28:45 +0100
branch v_0
changeset 2 426054465916
parent 1 d6dbd5d50d43
child 3 709abeb5f6d1
build DOM, filter records and evaluate XPath expressions for additional output attributes
src/XMLNameCodec.h
src/XPathHandler.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/XMLNameCodec.h	Wed Dec 30 01:28:45 2020 +0100
@@ -0,0 +1,127 @@
+/**
+ * Relational pipes
+ * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+
+#include <glibmm-2.4/glibmm/ustring.h>
+
+namespace relpipe {
+namespace in {
+namespace xmltable {
+
+/**
+ * Encodes arbitrary strings as valid XML element or attribute names.
+ *
+ * Encoding rules (see encode()):
+ *  - an empty string is encoded as a single escaping character,
+ *  - the escaping character itself is doubled,
+ *  - characters that are valid at their position in a XML name are copied as they are,
+ *  - any other character is replaced by its hexadecimal code point (at least two digits)
+ *    enclosed between two escaping characters.
+ */
+class XMLNameCodec {
+private:
+	static const char DEFAULT_ESCAPING_CHARACTER = '_';
+	const char esc;
+	const bool namespaceAware;
+
+	/**
+	 * @return whether the code point lies in the closed interval [start, end]
+	 */
+	static bool between(gunichar codepoint, gunichar start, gunichar end) {
+		return codepoint >= start && codepoint <= end;
+	}
+
+	/**
+	 * @param c character to be mentioned in an error message
+	 * @return the character itself if it is printable ASCII, otherwise its numeric code
+	 */
+	static std::string describeCharacter(char c) {
+		const unsigned char byte = static_cast<unsigned char> (c);
+		// std::to_string(char) promotes to int and prints the number, which is useful only for unprintable characters
+		return (byte >= 0x20 && byte < 0x7F) ? std::string(1, c) : std::to_string(static_cast<int> (c));
+	}
+
+	/**
+	 * https://www.w3.org/TR/REC-xml/#NT-NameStartChar
+	 *
+	 * The colon is accepted only if this codec is not namespace-aware.
+	 *
+	 * @param codepoint unicode character
+	 * @return whether this character is allowed at the beginning of a XML name
+	 */
+	bool isValidNameStartChar(gunichar codepoint) const {
+		// NameStartChar  ::= ":" | [A-Z] | "_" | [a-z] 
+		//   | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
+		//   | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
+		//   | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+		return (codepoint == ':' && !namespaceAware) || between(codepoint, 'A', 'Z') || codepoint == '_' || between(codepoint, 'a', 'z')
+				|| between(codepoint, 0xC0, 0xD6) || between(codepoint, 0xD8, 0xF6) || between(codepoint, 0xF8, 0x2FF) || between(codepoint, 0x370, 0x37D) || between(codepoint, 0x37F, 0x1FFF)
+				|| between(codepoint, 0x200C, 0x200D) || between(codepoint, 0x2070, 0x218F) || between(codepoint, 0x2C00, 0x2FEF) || between(codepoint, 0x3001, 0xD7FF)
+				|| between(codepoint, 0xF900, 0xFDCF) || between(codepoint, 0xFDF0, 0xFFFD) || between(codepoint, 0x10000, 0xEFFFF);
+	}
+
+	/**
+	 * https://www.w3.org/TR/REC-xml/#NT-NameChar
+	 *
+	 * @param codepoint unicode character
+	 * @return whether this character is allowed in a XML name
+	 */
+	bool isValidNameChar(gunichar codepoint) const {
+		// NameChar       ::= NameStartChar | "-" | "." | [0-9] 
+		//   | #xB7
+		//   | [#x0300-#x036F] | [#x203F-#x2040]
+		return isValidNameStartChar(codepoint) || codepoint == '-' || codepoint == '.' || between(codepoint, '0', '9')
+				|| codepoint == 0xB7
+				|| between(codepoint, 0x0300, 0x036F) || between(codepoint, 0x203F, 0x2040);
+	}
+
+public:
+
+	/**
+	 * Creates a namespace-aware codec that uses the default escaping character (underscore).
+	 */
+	XMLNameCodec() : XMLNameCodec(DEFAULT_ESCAPING_CHARACTER, true) {
+	}
+
+	/**
+	 * @param escapingCharacter must be valid character allowed not only in the middle of the XML name but also as the
+	 * first character of the name; only ASCII characters are accepted
+	 * @param namespaceAware colon character is reserved as a separator of the prefix and the local name, see
+	 * https://www.w3.org/TR/REC-xml-names/#NT-NCName
+	 * @throws std::invalid_argument if escapingCharacter is not valid
+	 */
+	XMLNameCodec(const char escapingCharacter, const bool namespaceAware) : esc(escapingCharacter), namespaceAware(namespaceAware) {
+		// TODO: allow also characters like #xB7 and add another escaping if they occur at the beginning of the name?
+		// The escaping character is appended to the UTF-8 encoded result as a single byte,
+		// so a non-ASCII byte must be rejected regardless of whether plain char is signed or unsigned on the platform.
+		const unsigned char escByte = static_cast<unsigned char> (esc);
+		if (escByte > 0x7F || !isValidNameStartChar(escByte)) {
+			throw std::invalid_argument("The character „" + describeCharacter(escapingCharacter) + "“ is not allowed at the beginning of a XML name and thus not usable for escaping");
+		}
+	}
+
+	virtual ~XMLNameCodec() = default;
+
+	/**
+	 * @param name any string
+	 * @return valid name of XML element or attribute
+	 */
+	Glib::ustring encode(const Glib::ustring& name) const {
+		if (name.empty()) return Glib::ustring(1, esc);
+
+		// The result is built directly as Glib::ustring: writing a Glib::ustring to a std::ostream converts it
+		// to the encoding of the current locale and fails for characters that the locale can not represent.
+		Glib::ustring result;
+		bool first = true;
+
+		// Iterators are used instead of operator[], because indexed access to a variable-width string is not a constant-time operation.
+		for (const gunichar codepoint : name) {
+			if (codepoint == static_cast<gunichar> (esc)) {
+				// the escaping character itself is doubled
+				result.push_back(esc);
+				result.push_back(esc);
+			} else if (first ? isValidNameStartChar(codepoint) : isValidNameChar(codepoint)) {
+				result.push_back(codepoint);
+			} else {
+				// not allowed at this position: hexadecimal code point enclosed between two escaping characters
+				result.push_back(esc);
+				result.append(Glib::ustring::format(std::hex, std::setfill(L'0'), std::setw(2), codepoint));
+				result.push_back(esc);
+			}
+			first = false;
+		}
+
+		return result;
+	}
+
+};
+
+}
+}
+}
--- a/src/XPathHandler.h	Wed Dec 30 00:39:03 2020 +0100
+++ b/src/XPathHandler.h	Wed Dec 30 01:28:45 2020 +0100
@@ -34,6 +34,7 @@
 #include <relpipe/cli/RelpipeCLIException.h>
 
 #include "Configuration.h"
+#include "XMLNameCodec.h"
 
 namespace relpipe {
 namespace tr {
@@ -41,14 +42,18 @@
 
 class XPathHandler : public relpipe::reader::handlers::RelationalReaderStringHandler {
 private:
+	std::wstring_convert<codecvt_utf8<wchar_t>> convertor; // XML is in UTF-8
 	shared_ptr<relpipe::writer::RelationalWriter> relationalWriter;
 	Configuration configuration;
-	RelationConfiguration* currentRelationConfiguration;
+	RelationConfiguration* currentRelationConfiguration = nullptr;
 	std::vector<relpipe::reader::handlers::AttributeMetadata> currentReaderMetadata;
 	std::vector<relpipe::writer::AttributeMetadata> currentWriterMetadata;
 	size_t currentAttributeIndex = 0;
-	
+
+
 	xmlpp::DomParser dom;
+	xmlpp::Element* recordElement = nullptr;
+	relpipe::in::xmltable::XMLNameCodec xmlNameCodec; // TODO: move to a common library
 
 	void copyInputAttributesToOutput() {
 		for (auto rm : currentReaderMetadata) currentWriterMetadata.push_back({rm.getAttributeName(), relationalWriter->toTypeId(rm.getTypeName())});
@@ -63,6 +68,35 @@
 		return currentRelationConfiguration->inputAttributePolicy == InputAttributePolicy::Append;
 	}
 
+	/**
+	 * Starts a new empty "record" element under the root node of the DOM document;
+	 * the element of the previous record (if there is any) is removed from the root node first.
+	 */
+	void resetRecordElement() {
+		xmlpp::Element* rootElement = dom.get_document()->get_root_node();
+		if (recordElement != nullptr) rootElement->remove_child(recordElement);
+		recordElement = rootElement->add_child("record");
+	}
+
+	/**
+	 * Converts a relpipe string to a Glib::ustring (through the bytes produced by the UTF-8 convertor).
+	 *
+	 * @param value relpipe string
+	 * @return the same text as Glib::ustring
+	 */
+	const Glib::ustring s2x(relpipe::common::type::StringX value) {
+		const auto utf8Bytes = convertor.to_bytes(value);
+		return Glib::ustring(utf8Bytes);
+	}
+
+	/**
+	 * Converts a Glib::ustring to a relpipe string (its bytes are decoded by the UTF-8 convertor).
+	 *
+	 * @param value text as Glib::ustring
+	 * @return the same text as relpipe string
+	 */
+	const relpipe::common::type::StringX x2s(const Glib::ustring& value) {
+		const auto& utf8Bytes = value.raw();
+		return convertor.from_bytes(utf8Bytes);
+	}
+
+	/**
+	 * Writes the values of the input attributes of the current record to the output:
+	 * one value for each child element of the record element.
+	 */
+	void writeInputAttributes() {
+		for (xmlpp::Node* attributeNode : recordElement->get_children()) {
+			if (xmlpp::Element * attributeElement = dynamic_cast<xmlpp::Element*> (attributeNode)) {
+				// get_child_text() returns null if the element has no text node;
+				// an empty value is written in such case, so every attribute element still yields exactly one value
+				const auto* textNode = attributeElement->get_child_text();
+				relationalWriter->writeAttribute(x2s(textNode ? textNode->get_content() : Glib::ustring()));
+			}
+		}
+	}
+
+	/**
+	 * Evaluates the XPath expression of each configured output attribute (as a string)
+	 * in the context of the current record element and writes the results to the output.
+	 */
+	void writeOutputAttributes() {
+		// const reference: the configured attributes are only read here, so there is no need to copy them on each call
+		for (const auto& outputAttribute : currentRelationConfiguration->outputAttributes) {
+			const Glib::ustring value = recordElement->eval_to_string(s2x(outputAttribute.xpath));
+			relationalWriter->writeAttribute(x2s(value));
+		}
+	}
+
 public:
 
 	XPathHandler(shared_ptr<relpipe::writer::RelationalWriter> relationalWriter, Configuration configuration) : relationalWriter(relationalWriter), configuration(configuration) {
@@ -92,6 +126,8 @@
 			if (isAppendingInputAttributes()) copyInputAttributesToOutput();
 
 			// TODO: prepare DOM
+			dom.get_document()->create_root_node("relpipe-tr-xpath");
+			resetRecordElement();
 		}
 
 		relationalWriter->startRelation(name, currentWriterMetadata, true);
@@ -101,15 +137,21 @@
 		if (currentRelationConfiguration) {
 			relpipe::reader::handlers::AttributeMetadata attributeMetadata = currentReaderMetadata[currentAttributeIndex];
 
-			// TODO: add attribute to DOM		
+			xmlpp::Element* attributeElement = recordElement->add_child(xmlNameCodec.encode(s2x(attributeMetadata.getAttributeName())));
+			attributeElement->set_attribute("name", s2x(attributeMetadata.getAttributeName()));
+			attributeElement->set_attribute("type", s2x(attributeMetadata.getTypeName()));
+			attributeElement->add_child_text(s2x(value));
 
 			currentAttributeIndex++;
 
 			if (currentAttributeIndex == currentReaderMetadata.size()) {
+				if (currentRelationConfiguration->where.empty() || recordElement->eval_to_boolean(s2x(currentRelationConfiguration->where))) {
+					if (isPrependingInputAttributes()) writeInputAttributes();
+					writeOutputAttributes();
+					if (isAppendingInputAttributes()) writeInputAttributes();
+				}
 
-				// TODO: evaluate XPath expression
-				// TODO: write record to output, if the XPath condition was met
-				// TODO: clean record node in DOM
+				resetRecordElement();
 				currentAttributeIndex = 0;
 			}
 		} else {