# HG changeset patch # User František Kučera # Date 1609288125 -3600 # Node ID 426054465916cbe8bba2b06475c71bfa4dea177e # Parent d6dbd5d50d4326cc1b90c7d123d078e55634f004 build DOM, filter records and evaluate XPath expressions for additional output attributes diff -r d6dbd5d50d43 -r 426054465916 src/XMLNameCodec.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/XMLNameCodec.h Wed Dec 30 01:28:45 2020 +0100 @@ -0,0 +1,127 @@ +/** + * Relational pipes + * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */
+#pragma once
+
+#include <stdexcept> // NOTE(review): the include targets were lost from this patch text; restored from the names used below - confirm
+#include <sstream>
+#include <iomanip>
+
+#include <glibmm/ustring.h>
+
+namespace relpipe {
+namespace in {
+namespace xmltable {
+
+/**
+ * Turns an arbitrary string into a valid XML element/attribute name: a literal escaping character is doubled,
+ * a character not allowed at its position becomes esc + hexadecimal code point + esc, an empty name becomes a single esc.
+ */
+class XMLNameCodec {
+private:
+	static const char DEFAULT_ESCAPING_CHARACTER = '_';
+	const char esc;
+	const bool namespaceAware;
+
+	/** @return whether the codepoint lies in the closed interval [start, end] */
+	static bool between(gunichar codepoint, gunichar start, gunichar end) {
+		return codepoint >= start && codepoint <= end;
+	}
+
+	/**
+	 * https://www.w3.org/TR/REC-xml/#NT-NameStartChar
+	 *
+	 * @param codepoint unicode character
+	 * @return whether this character is allowed at the beginning of a XML name (the colon only if not namespace-aware)
+	 */
+	bool isValidNameStartChar(gunichar codepoint) const {
+		// NameStartChar ::= ":" | [A-Z] | "_" | [a-z]
+		//   | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
+		//   | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
+		//   | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+		return (codepoint == ':' && !namespaceAware) || between(codepoint, 'A', 'Z') || codepoint == '_' || between(codepoint, 'a', 'z')
+				|| between(codepoint, 0xC0, 0xD6) || between(codepoint, 0xD8, 0xF6) || between(codepoint, 0xF8, 0x2FF) || between(codepoint, 0x370, 0x37D) || between(codepoint, 0x37F, 0x1FFF)
+				|| between(codepoint, 0x200C, 0x200D) || between(codepoint, 0x2070, 0x218F) || between(codepoint, 0x2C00, 0x2FEF) || between(codepoint, 0x3001, 0xD7FF)
+				|| between(codepoint, 0xF900, 0xFDCF) || between(codepoint, 0xFDF0, 0xFFFD) || between(codepoint, 0x10000, 0xEFFFF);
+	}
+
+	/**
+	 * https://www.w3.org/TR/REC-xml/#NT-NameChar
+	 *
+	 * @param codepoint unicode character
+	 * @return whether this character is allowed in a XML name
+	 */
+	bool isValidNameChar(gunichar codepoint) const {
+		// NameChar ::= NameStartChar | "-" | "." | [0-9]
+		//   | #xB7
+		//   | [#x0300-#x036F] | [#x203F-#x2040]
+		return isValidNameStartChar(codepoint) || codepoint == '-' || codepoint == '.' || between(codepoint, '0', '9')
+				|| codepoint == 0xB7
+				|| between(codepoint, 0x0300, 0x036F) || between(codepoint, 0x203F, 0x2040);
+	}
+
+public:
+	XMLNameCodec() : XMLNameCodec(DEFAULT_ESCAPING_CHARACTER, true) {
+	}
+
+	/**
+	 * @param escapingCharacter must be valid character allowed not only in the middle of the XML name but also as the
+	 * first character of the name
+	 * @param namespaceAware colon character is reserved as a separator of the prefix and the local name, see
+	 * https://www.w3.org/TR/REC-xml-names/#NT-NCName
+	 * @throws std::invalid_argument if escapingCharacter is not valid
+	 */
+	XMLNameCodec(const char escapingCharacter, const bool namespaceAware) : esc(escapingCharacter), namespaceAware(namespaceAware) {
+		// TODO: allow also characters like #xB7 and add another escaping if they occur at the beginning of the name?
+		if (!isValidNameStartChar(esc)) {
+			// std::string(1, c) puts the character itself into the message; std::to_string(c) would print its numeric code.
+			throw std::invalid_argument("The character „" + std::string(1, escapingCharacter) + "“ is not allowed at the beginning of a XML name and thus not usable for escaping");
+		}
+	}
+
+	virtual ~XMLNameCodec() {
+	}
+
+	/**
+	 * @param name any string
+	 * @return valid name of XML element or attribute
+	 */
+	Glib::ustring encode(const Glib::ustring& name) const {
+		if (name.empty()) return Glib::ustring(1, esc);
+		// Appending to a Glib::ustring keeps the data in UTF-8; operator<< on a std::ostream would convert to the locale charset.
+		Glib::ustring result;
+		bool first = true;
+		// Iterating decodes the UTF-8 sequentially; name[i] would rescan the string for every index.
+		for (const gunichar codepoint : name) {
+			if (codepoint == static_cast<gunichar> (esc)) {
+				result.push_back(esc);
+				result.push_back(esc);
+			} else if (first ? isValidNameStartChar(codepoint) : isValidNameChar(codepoint)) {
+				result.push_back(codepoint);
+			} else {
+				result.push_back(esc);
+				result += Glib::ustring::format(std::hex, std::setfill(L'0'), std::setw(2), codepoint);
+				result.push_back(esc);
+			}
+			first = false;
+		}
+		return result;
+	}
+};
+
+}
+}
+}
diff -r d6dbd5d50d43 -r 426054465916 src/XPathHandler.h
--- a/src/XPathHandler.h	Wed Dec 30 00:39:03 2020 +0100
+++ b/src/XPathHandler.h	Wed Dec 30 01:28:45 2020 +0100
@@ 
-34,6 +34,7 @@ #include #include "Configuration.h" +#include "XMLNameCodec.h" namespace relpipe { namespace tr { @@ -41,14 +42,18 @@ class XPathHandler : public relpipe::reader::handlers::RelationalReaderStringHandler { private: + std::wstring_convert> convertor; // XML is in UTF-8 shared_ptr relationalWriter; Configuration configuration; - RelationConfiguration* currentRelationConfiguration; + RelationConfiguration* currentRelationConfiguration = nullptr; std::vector currentReaderMetadata; std::vector currentWriterMetadata; size_t currentAttributeIndex = 0; - + + xmlpp::DomParser dom; + xmlpp::Element* recordElement = nullptr; + relpipe::in::xmltable::XMLNameCodec xmlNameCodec; // TODO: move to a common library void copyInputAttributesToOutput() { for (auto rm : currentReaderMetadata) currentWriterMetadata.push_back({rm.getAttributeName(), relationalWriter->toTypeId(rm.getTypeName())}); @@ -63,6 +68,35 @@ return currentRelationConfiguration->inputAttributePolicy == InputAttributePolicy::Append; } + void resetRecordElement() { + if (recordElement) dom.get_document()->get_root_node()->remove_child(recordElement); + recordElement = dom.get_document()->get_root_node()->add_child("record"); + } + + const Glib::ustring s2x(relpipe::common::type::StringX value) { + return Glib::ustring(convertor.to_bytes(value)); + } + + const relpipe::common::type::StringX x2s(const Glib::ustring& value) { + return convertor.from_bytes(value); + } + + void writeInputAttributes() { + for (xmlpp::Node* attributeNode : recordElement->get_children()) { + if (xmlpp::Element * attributeElement = dynamic_cast (attributeNode)) { + auto value = attributeElement->get_child_text()->get_content(); + relationalWriter->writeAttribute(x2s(value)); + } + } + } + + void writeOutputAttributes() { + for (auto oa : currentRelationConfiguration->outputAttributes) { + auto value = recordElement->eval_to_string(s2x(oa.xpath)); + relationalWriter->writeAttribute(x2s(value)); + } + } + public: 
XPathHandler(shared_ptr relationalWriter, Configuration configuration) : relationalWriter(relationalWriter), configuration(configuration) { @@ -92,6 +126,8 @@ if (isAppendingInputAttributes()) copyInputAttributesToOutput(); // TODO: prepare DOM + dom.get_document()->create_root_node("relpipe-tr-xpath"); + resetRecordElement(); } relationalWriter->startRelation(name, currentWriterMetadata, true); @@ -101,15 +137,21 @@ if (currentRelationConfiguration) { relpipe::reader::handlers::AttributeMetadata attributeMetadata = currentReaderMetadata[currentAttributeIndex]; - // TODO: add attribute to DOM + xmlpp::Element* attributeElement = recordElement->add_child(xmlNameCodec.encode(s2x(attributeMetadata.getAttributeName()))); + attributeElement->set_attribute("name", s2x(attributeMetadata.getAttributeName())); + attributeElement->set_attribute("type", s2x(attributeMetadata.getTypeName())); + attributeElement->add_child_text(s2x(value)); currentAttributeIndex++; if (currentAttributeIndex == currentReaderMetadata.size()) { + if (currentRelationConfiguration->where.empty() || recordElement->eval_to_boolean(s2x(currentRelationConfiguration->where))) { + if (isPrependingInputAttributes()) writeInputAttributes(); + writeOutputAttributes(); + if (isAppendingInputAttributes()) writeInputAttributes(); + } - // TODO: evaluate XPath expression - // TODO: write record to output, if the XPath condition was met - // TODO: clean record node in DOM + resetRecordElement(); currentAttributeIndex = 0; } } else {