XML name escaping: documentation, refactoring v_0
author: František Kučera <franta-hg@frantovo.cz>
Fri, 20 Nov 2020 11:59:15 +0100
branch: v_0
changeset 23 b25404ff2b2b
parent 22 53f1f3a5649a
child 24 ee72fccc5267
XML name escaping: documentation, refactoring
src/XMLNameCodec.h
--- a/src/XMLNameCodec.h	Fri Nov 20 10:03:07 2020 +0100
+++ b/src/XMLNameCodec.h	Fri Nov 20 11:59:15 2020 +0100
@@ -18,6 +18,7 @@
 
 #include <sstream>
 #include <iomanip>
+#include <stdexcept>
 
 #include <glibmm-2.4/glibmm/ustring.h>
 
@@ -29,12 +30,19 @@
 private:
 	static const char DEFAULT_ESCAPING_CHARACTER = '_';
 	const char esc;
+	const bool namespaceAware;
 
 	bool between(gunichar codepoint, gunichar start, gunichar end) {
 		return codepoint >= start && codepoint <= end;
 	}
 
-	bool isValidNameStartChar(gunichar codepoint, bool namespaceAware = true) {
+	/**
+	 * https://www.w3.org/TR/REC-xml/#NT-NameStartChar
+	 *
+	 * @param codepoint unicode character
+	 * @return whether this character is allowed at the beginning of an XML name
+	 */
+	bool isValidNameStartChar(gunichar codepoint) {
 		// NameStartChar  ::= ":" | [A-Z] | "_" | [a-z] 
 		//   | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
 		//   | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
@@ -45,29 +53,50 @@
 				|| between(codepoint, 0xF900, 0xFDCF) || between(codepoint, 0xFDF0, 0xFFFD) || between(codepoint, 0x10000, 0xEFFFF);
 	}
 
-	bool isValidNameChar(gunichar codepoint, bool namespaceAware = true) {
+	/**
+	 * Checks a character against the NameChar production: https://www.w3.org/TR/REC-xml/#NT-NameChar
+	 *
+	 * @param codepoint Unicode code point to be tested
+	 * @return whether this character is allowed in an XML name; everything accepted by isValidNameStartChar() is accepted here as well
+	 */
+	bool isValidNameChar(gunichar codepoint) {
 		// NameChar       ::= NameStartChar | "-" | "." | [0-9] 
 		//   | #xB7
 		//   | [#x0300-#x036F] | [#x203F-#x2040]
-		return isValidNameStartChar(codepoint, namespaceAware) || codepoint == '-' || codepoint == '.' || between(codepoint, '0', '9')
+		return isValidNameStartChar(codepoint) || codepoint == '-' || codepoint == '.' || between(codepoint, '0', '9')
 				|| codepoint == 0xB7
 				|| between(codepoint, 0x0300, 0x036F) || between(codepoint, 0x203F, 0x2040);
 	}
 
 public:
 
-	XMLNameCodec() : esc(DEFAULT_ESCAPING_CHARACTER) {
+	XMLNameCodec() : XMLNameCodec(DEFAULT_ESCAPING_CHARACTER, true) { // delegates to the two-argument constructor: escaping character '_', namespace-aware
 	}
 
-	XMLNameCodec(const char esc) : esc(esc) {
+	/**
+	 * @param escapingCharacter must be a character that is allowed not only in the middle of an XML name but also as
+	 * its first character
+	 * @param namespaceAware if true, the colon character is reserved as a separator of the prefix and the local name, see
+	 * https://www.w3.org/TR/REC-xml-names/#NT-NCName
+	 * @throws std::invalid_argument if escapingCharacter is not valid
+	 */
+	XMLNameCodec(const char escapingCharacter, const bool namespaceAware) : esc(escapingCharacter), namespaceAware(namespaceAware) {
+		// TODO: allow also characters like #xB7 and add another escaping if they occur at the beginning of the name?
+		if (!isValidNameStartChar(esc)) { // the message must show the character itself: std::to_string() would promote the char to int and print its numeric code
+			throw std::invalid_argument("The character „" + std::string(1, escapingCharacter) + "“ is not allowed at the beginning of a XML name and thus not usable for escaping");
+		}
 	}
 
 	virtual ~XMLNameCodec() {
 	}
 
+	/**
+	 * @param name any string
+	 * @return valid name of XML element or attribute
+	 */
 	Glib::ustring encode(Glib::ustring name) {
 		if (name.empty()) {
-			return "_";
+			return Glib::ustring(1, esc);
 		} else {
 			std::stringstream result;
 
@@ -77,14 +106,7 @@
 					result.put(esc);
 					result.put(esc);
 					continue;
-				} else if (i == 0) {
-					if (isValidNameStartChar(codepoint)) {
-						result << Glib::ustring(1, codepoint);
-						continue;
-					} else {
-						result.put('_');
-					}
-				} else if (isValidNameChar(codepoint)) {
+				} else if ((i == 0 && isValidNameStartChar(codepoint)) || (i > 0 && isValidNameChar(codepoint))) {
 					result << Glib::ustring(1, codepoint);
 					continue;
 				}
@@ -98,7 +120,6 @@
 		}
 	}
 
-
 };
 
 }