src/XMLNameCodec.h
author František Kučera <franta-hg@frantovo.cz>
Wed, 30 Dec 2020 01:28:45 +0100
branch v_0
changeset 2 426054465916
permissions -rw-r--r--
build DOM, filter records and evaluate XPath expressions for additional output attributes

/**
 * Relational pipes
 * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#pragma once

#include <sstream>
#include <iomanip>
#include <stdexcept>

#include <glibmm-2.4/glibmm/ustring.h>

namespace relpipe {
namespace in {
namespace xmltable {

class XMLNameCodec {
private:
	static const char DEFAULT_ESCAPING_CHARACTER = '_';
	const char esc;
	const bool namespaceAware;

	bool between(gunichar codepoint, gunichar start, gunichar end) {
		return codepoint >= start && codepoint <= end;
	}

	/**
	 * https://www.w3.org/TR/REC-xml/#NT-NameStartChar
	 *
	 * @param codepoint unicode character
	 * @return whether this character is allowed at the beginning of a XML name
	 */
	bool isValidNameStartChar(gunichar codepoint) {
		// NameStartChar  ::= ":" | [A-Z] | "_" | [a-z] 
		//   | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
		//   | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
		//   | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
		return (codepoint == ':' && !namespaceAware) || between(codepoint, 'A', 'Z') || codepoint == '_' || between(codepoint, 'a', 'z')
				|| between(codepoint, 0xC0, 0xD6) || between(codepoint, 0xD8, 0xF6) || between(codepoint, 0xF8, 0x2FF) || between(codepoint, 0x370, 0x37D) || between(codepoint, 0x37F, 0x1FFF)
				|| between(codepoint, 0x200C, 0x200D) || between(codepoint, 0x2070, 0x218F) || between(codepoint, 0x2C00, 0x2FEF) || between(codepoint, 0x3001, 0xD7FF)
				|| between(codepoint, 0xF900, 0xFDCF) || between(codepoint, 0xFDF0, 0xFFFD) || between(codepoint, 0x10000, 0xEFFFF);
	}

	/**
	 * https://www.w3.org/TR/REC-xml/#NT-NameChar
	 *
	 * @param codepoint unicode character
	 * @return whether this character is allowed in a XML name
	 */
	bool isValidNameChar(gunichar codepoint) {
		// NameChar       ::= NameStartChar | "-" | "." | [0-9] 
		//   | #xB7
		//   | [#x0300-#x036F] | [#x203F-#x2040]
		return isValidNameStartChar(codepoint) || codepoint == '-' || codepoint == '.' || between(codepoint, '0', '9')
				|| codepoint == 0xB7
				|| between(codepoint, 0x0300, 0x036F) || between(codepoint, 0x203F, 0x2040);
	}

public:

	XMLNameCodec() : XMLNameCodec(DEFAULT_ESCAPING_CHARACTER, true) {
	}

	/**
	 * @param escapingCharacter must be valid character allowed not only in the middle of the XML name but also as the
	 * first character of the name
	 * @param namespaceAware colon character is reserved as a separator of the prefix and the local name, see
	 * https://www.w3.org/TR/REC-xml-names/#NT-NCName
	 * @throws std::invalid_argument if escapingCharacter is not valid
	 */
	XMLNameCodec(const char escapingCharacter, const bool namespaceAware) : esc(escapingCharacter), namespaceAware(namespaceAware) {
		// TODO: allow also characters like #xB7 and add another escaping if they occur at the beginning of the name?
		if (!isValidNameStartChar(esc)) {
			throw std::invalid_argument("The character „" + std::to_string(escapingCharacter) + "“ is not allowed at the beginning of a XML name and thus not usable for escaping");
		}
	}

	virtual ~XMLNameCodec() {
	}

	/**
	 * @param name any string
	 * @return valid name of XML element or attribute
	 */
	Glib::ustring encode(Glib::ustring name) {
		if (name.empty()) {
			return Glib::ustring(1, esc);
		} else {
			std::stringstream result;

			for (int i = 0; i < name.size(); i++) {
				gunichar codepoint = name[i];
				if (codepoint == esc) {
					result.put(esc);
					result.put(esc);
					continue;
				} else if ((i == 0 && isValidNameStartChar(codepoint)) || (i > 0 && isValidNameChar(codepoint))) {
					result << Glib::ustring(1, codepoint);
					continue;
				}

				result.put(esc);
				result << Glib::ustring::format(std::hex, std::setfill(L'0'), std::setw(2), codepoint);
				result.put(esc);
			}

			return result.str();
		}
	}

};

}
}
}