XML name escaping v_0
author František Kučera <franta-hg@frantovo.cz>
Fri, 20 Nov 2020 00:09:59 +0100
branch v_0
changeset 21 053054f9f702
parent 20 e67584a06be6
child 22 53f1f3a5649a
XML name escaping
nbproject/configurations.xml
src/XMLDocumentConstructor.h
src/XMLNameCodec.h
--- a/nbproject/configurations.xml	Wed Oct 28 16:30:58 2020 +0100
+++ b/nbproject/configurations.xml	Fri Nov 20 00:09:59 2020 +0100
@@ -102,6 +102,8 @@
           <preBuildFirst>true</preBuildFirst>
         </preBuild>
       </makefileType>
+      <item path="src/XMLNameCodec.h" ex="false" tool="3" flavor2="0">
+      </item>
       <item path="src/relpipe-in-xmltable.cpp" ex="false" tool="1" flavor2="0">
         <ccTool flags="0">
         </ccTool>
@@ -143,6 +145,8 @@
       </makefileType>
       <item path="src/XMLDocumentConstructor.h" ex="false" tool="3" flavor2="0">
       </item>
+      <item path="src/XMLNameCodec.h" ex="false" tool="3" flavor2="0">
+      </item>
       <item path="src/relpipe-in-xmltable.cpp" ex="false" tool="1" flavor2="0">
         <ccTool flags="0">
         </ccTool>
--- a/src/XMLDocumentConstructor.h	Wed Oct 28 16:30:58 2020 +0100
+++ b/src/XMLDocumentConstructor.h	Fri Nov 20 00:09:59 2020 +0100
@@ -16,21 +16,24 @@
  */
 #pragma once
 
-namespace relpipe {
-namespace in {
-namespace xmltable {
-
 #include <codecvt>
 #include <vector>
 
 #include <libxml++-2.6/libxml++/libxml++.h>
 #include <yaml.h>
 
+#include "XMLNameCodec.h"
+
+namespace relpipe {
+namespace in {
+namespace xmltable {
+
 class XMLDocumentConstructor {
 private:
 	std::istream* input = nullptr;
 	xmlpp::DomParser* parser = nullptr;
 	yaml_parser_t yamlParser;
+	XMLNameCodec nameCodec;
 
 	enum class Mode {
 		ROOT,
@@ -48,7 +51,7 @@
 		*length = input->gcount();
 		return (input->good() || input->eof()) ? 1 : 0;
 	}
-
+	
 	/**
 	 * Both YAML and XML strings are in UTF-8.
 	 */
@@ -56,10 +59,8 @@
 		return value ? (const char*) value : "";
 	}
 
-	const std::string y2xname(yaml_char_t* value) {
-		// FIXME: escaping, assure valid XML names
-		//return std::string("name_") + y2x(value);
-		return y2x(value);
+	const Glib::ustring y2xname(yaml_char_t* value) {
+		return nameCodec.encode(y2x(value));
 	}
 
 	xmlpp::Element* parentOrSelf(xmlpp::Element* current) {
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/XMLNameCodec.h	Fri Nov 20 00:09:59 2020 +0100
@@ -0,0 +1,106 @@
+/**
+ * Relational pipes
+ * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+#include <iomanip>
+
+#include <glibmm-2.4/glibmm/ustring.h>
+
+namespace relpipe {
+namespace in {
+namespace xmltable {
+
+class XMLNameCodec {
+private:
+	static const char DEFAULT_ESCAPING_CHARACTER = '_';
+	const char esc;
+
+	bool between(gunichar codepoint, gunichar start, gunichar end) {
+		return codepoint >= start && codepoint <= end;
+	}
+
+	bool isValidNameStartChar(gunichar codepoint) {
+		// NameStartChar  ::= ":" | [A-Z] | "_" | [a-z] 
+		//   | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
+		//   | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
+		//   | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+		return codepoint == ':' || between(codepoint, 'A', 'Z') || codepoint == '_' || between(codepoint, 'a', 'z')
+				|| between(codepoint, 0xC0, 0xD6) || between(codepoint, 0xD8, 0xF6) || between(codepoint, 0xF8, 0x2FF) || between(codepoint, 0x370, 0x37D) || between(codepoint, 0x37F, 0x1FFF)
+				|| between(codepoint, 0x200C, 0x200D) || between(codepoint, 0x2070, 0x218F) || between(codepoint, 0x2C00, 0x2FEF) || between(codepoint, 0x3001, 0xD7FF)
+				|| between(codepoint, 0xF900, 0xFDCF) || between(codepoint, 0xFDF0, 0xFFFD) || between(codepoint, 0x10000, 0xEFFFF);
+	}
+
+	bool isValidNameChar(gunichar codepoint) {
+		// NameChar       ::= NameStartChar | "-" | "." | [0-9] 
+		//   | #xB7
+		//   | [#x0300-#x036F] | [#x203F-#x2040]
+		return isValidNameStartChar(codepoint) || codepoint == '-' || codepoint == '.' || between(codepoint, '0', '9')
+				|| codepoint == 0xB7
+				|| between(codepoint, 0x0300, 0x036F) || between(codepoint, 0x203F, 0x2040);
+	}
+
+public:
+
+	XMLNameCodec() : esc(DEFAULT_ESCAPING_CHARACTER) {
+	}
+
+	XMLNameCodec(const char esc) : esc(esc) {
+	}
+
+	virtual ~XMLNameCodec() {
+	}
+
+	Glib::ustring encode(Glib::ustring name) {
+		if (name.empty()) {
+			return "_";
+		} else {
+			std::stringstream result;
+
+			for (int i = 0; i < name.size(); i++) {
+				gunichar codepoint = name[i];
+				if (codepoint == esc) {
+					result.put(esc);
+					result.put(esc);
+					continue;
+				} else if (i == 0) {
+					if (isValidNameStartChar(codepoint)) {
+						result << Glib::ustring(1, codepoint);
+						continue;
+					} else {
+						result.put('_');
+					}
+				} else if (isValidNameChar(codepoint)) {
+					result << Glib::ustring(1, codepoint);
+					continue;
+				}
+
+				result.put(esc);
+				result << Glib::ustring::format(std::hex, std::setfill(L'0'), std::setw(2), codepoint);
+				result.put(esc);
+			}
+
+			return result.str();
+		}
+	}
+
+
+};
+
+}
+}
+}