# HG changeset patch # User František Kučera # Date 1605827399 -3600 # Node ID 053054f9f702bfbb5db59ed0daea183f9dbb8c66 # Parent e67584a06be610cde2625a2678988f277635e0c0 XML name escaping diff -r e67584a06be6 -r 053054f9f702 nbproject/configurations.xml --- a/nbproject/configurations.xml Wed Oct 28 16:30:58 2020 +0100 +++ b/nbproject/configurations.xml Fri Nov 20 00:09:59 2020 +0100 @@ -102,6 +102,8 @@ true + + @@ -143,6 +145,8 @@ + + diff -r e67584a06be6 -r 053054f9f702 src/XMLDocumentConstructor.h --- a/src/XMLDocumentConstructor.h Wed Oct 28 16:30:58 2020 +0100 +++ b/src/XMLDocumentConstructor.h Fri Nov 20 00:09:59 2020 +0100 @@ -16,21 +16,24 @@ */ #pragma once -namespace relpipe { -namespace in { -namespace xmltable { - #include #include #include #include +#include "XMLNameCodec.h" + +namespace relpipe { +namespace in { +namespace xmltable { + class XMLDocumentConstructor { private: std::istream* input = nullptr; xmlpp::DomParser* parser = nullptr; yaml_parser_t yamlParser; + XMLNameCodec nameCodec; enum class Mode { ROOT, @@ -48,7 +51,7 @@ *length = input->gcount(); return (input->good() || input->eof()) ? 1 : 0; } - + /** * Both YAML and XML strings are in UTF-8. */ @@ -56,10 +59,8 @@ return value ? 
(const char*) value : ""; } - const std::string y2xname(yaml_char_t* value) { - // FIXME: escaping, assure valid XML names - //return std::string("name_") + y2x(value); - return y2x(value); + const Glib::ustring y2xname(yaml_char_t* value) { + return nameCodec.encode(y2x(value)); } xmlpp::Element* parentOrSelf(xmlpp::Element* current) { diff -r e67584a06be6 -r 053054f9f702 src/XMLNameCodec.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/XMLNameCodec.h Fri Nov 20 00:09:59 2020 +0100 @@ -0,0 +1,106 @@ +/** + * Relational pipes + * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */
+#pragma once
+
+// NOTE(review): the header names were missing from the reviewed text; the three
+// below are inferred from the names used in this file (std::stringstream was used
+// by the previous implementation, std::setfill/std::setw, Glib::ustring) — confirm.
+#include <sstream>
+#include <iomanip>
+
+#include <glibmm/ustring.h>
+
+namespace relpipe {
+namespace in {
+namespace xmltable {
+
+/**
+ * Encodes arbitrary strings so that they can be used as XML names.
+ *
+ * Rules applied by encode():
+ *  - a code point that is allowed at its position is copied as is,
+ *  - the escaping character is doubled,
+ *  - any other code point is written as its hexadecimal number (at least two digits)
+ *    enclosed between two escaping characters,
+ *  - if the first code point is not allowed at the start of a name, a literal '_' is put before its escape sequence,
+ *  - an empty string is encoded as a literal "_".
+ *
+ * Examples for the default escaping character '_':
+ *  "a b" → "a_20_b", "a_b" → "a__b", "1x" → "__31_x", "" → "_"
+ */
+class XMLNameCodec {
+private:
+	static const char DEFAULT_ESCAPING_CHARACTER = '_';
+	// The escaping character is written as a single byte, so it should be an ASCII character.
+	const char esc;
+
+	/**
+	 * @return true if codepoint lies in the closed interval [start, end]
+	 */
+	static bool between(gunichar codepoint, gunichar start, gunichar end) {
+		return codepoint >= start && codepoint <= end;
+	}
+
+	/**
+	 * @return true if the codepoint may stand at the first position of an XML name
+	 */
+	static bool isValidNameStartChar(gunichar codepoint) {
+		// NameStartChar ::= ":" | [A-Z] | "_" | [a-z]
+		//   | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
+		//   | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
+		//   | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+		return codepoint == ':' || between(codepoint, 'A', 'Z') || codepoint == '_' || between(codepoint, 'a', 'z')
+				|| between(codepoint, 0xC0, 0xD6) || between(codepoint, 0xD8, 0xF6) || between(codepoint, 0xF8, 0x2FF) || between(codepoint, 0x370, 0x37D) || between(codepoint, 0x37F, 0x1FFF)
+				|| between(codepoint, 0x200C, 0x200D) || between(codepoint, 0x2070, 0x218F) || between(codepoint, 0x2C00, 0x2FEF) || between(codepoint, 0x3001, 0xD7FF)
+				|| between(codepoint, 0xF900, 0xFDCF) || between(codepoint, 0xFDF0, 0xFFFD) || between(codepoint, 0x10000, 0xEFFFF);
+	}
+
+	/**
+	 * @return true if the codepoint may stand at the second or any later position of an XML name
+	 */
+	static bool isValidNameChar(gunichar codepoint) {
+		// NameChar ::= NameStartChar | "-" | "." | [0-9]
+		//   | #xB7
+		//   | [#x0300-#x036F] | [#x203F-#x2040]
+		return isValidNameStartChar(codepoint) || codepoint == '-' || codepoint == '.' || between(codepoint, '0', '9')
+				|| codepoint == 0xB7
+				|| between(codepoint, 0x0300, 0x036F) || between(codepoint, 0x203F, 0x2040);
+	}
+
+public:
+
+	/**
+	 * Creates a codec that uses the default escaping character '_'.
+	 */
+	XMLNameCodec() : esc(DEFAULT_ESCAPING_CHARACTER) {
+	}
+
+	/**
+	 * @param esc escaping character used for the doubled escape and for enclosing the hexadecimal sequences
+	 */
+	XMLNameCodec(const char esc) : esc(esc) {
+	}
+
+	virtual ~XMLNameCodec() {
+	}
+
+	/**
+	 * @param name any string in UTF-8, may be empty
+	 * @return the name encoded according to the rules described at this class
+	 */
+	Glib::ustring encode(const Glib::ustring& name) const {
+		if (name.empty()) {
+			return "_";
+		}
+
+		// The result is collected in a Glib::ustring instead of a std::stringstream,
+		// because the glibmm stream operators convert the text to the charset of the current locale
+		// and that fails or damages non-ASCII names unless the locale uses UTF-8.
+		Glib::ustring result;
+		bool first = true;
+
+		// Iterating over the code points is linear while name[i] has to scan the string from its beginning each time.
+		for (const gunichar codepoint : name) {
+			const bool atStart = first;
+			first = false;
+
+			if (codepoint == static_cast<gunichar>(esc)) {
+				// the escaping character itself is doubled at any position, including the first one
+				result.push_back(esc);
+				result.push_back(esc);
+				continue;
+			}
+
+			if (atStart ? isValidNameStartChar(codepoint) : isValidNameChar(codepoint)) {
+				result.push_back(codepoint);
+				continue;
+			}
+
+			if (atStart) {
+				// a literal '_' (not the configured escaping character) makes the name start with a valid character
+				result.push_back('_');
+			}
+
+			result.push_back(esc);
+			result += Glib::ustring::format(std::hex, std::setfill(L'0'), std::setw(2), codepoint);
+			result.push_back(esc);
+		}
+
+		return result;
+	}
+
+};
+
+}
+}
+}