src/XMLNameCodec.h
author František Kučera <franta-hg@frantovo.cz>
Fri, 20 Nov 2020 10:03:07 +0100
branch v_0
changeset 22 53f1f3a5649a
parent 21 053054f9f702
child 23 b25404ff2b2b
permissions -rw-r--r--
XML name escaping: in namespace aware mode: escape also the colon

/**
 * Relational pipes
 * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#pragma once

#include <sstream>
#include <iomanip>

#include <glibmm-2.4/glibmm/ustring.h>

namespace relpipe {
namespace in {
namespace xmltable {

/**
 * Encodes arbitrary strings so that they can be used as XML names.
 *
 * Encoding rules implemented by encode():
 *  - an empty name becomes "_";
 *  - every occurrence of the escaping character is doubled;
 *  - a character that is valid at its position (NameStartChar for the first one,
 *    NameChar for the others) is copied verbatim;
 *  - any other character is written as: escaping character, hexadecimal code point
 *    (zero-padded to at least two digits), escaping character;
 *    if this happens at the first position, the sequence is additionally prefixed with '_'.
 *
 * The validity checks default to the namespace-aware mode, in which the colon is not
 * considered a valid name character and therefore gets escaped.
 *
 * NOTE(review): the escaping character is presumably expected to be an ASCII character
 * that is itself a valid XML NameStartChar (like the default '_'); nothing here verifies that.
 */
class XMLNameCodec {
private:
	static const char DEFAULT_ESCAPING_CHARACTER = '_';
	const char esc;

	/**
	 * @return true if the code point lies in the closed interval [start, end]
	 */
	static bool between(gunichar codepoint, gunichar start, gunichar end) {
		return codepoint >= start && codepoint <= end;
	}

	/**
	 * @param codepoint Unicode code point to be tested
	 * @param namespaceAware if true, the colon is rejected (it would be interpreted as a prefix separator)
	 * @return true if the code point may be the first character of an XML name
	 */
	static bool isValidNameStartChar(gunichar codepoint, bool namespaceAware = true) {
		// NameStartChar  ::= ":" | [A-Z] | "_" | [a-z] 
		//   | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
		//   | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
		//   | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
		return (codepoint == ':' && !namespaceAware) || between(codepoint, 'A', 'Z') || codepoint == '_' || between(codepoint, 'a', 'z')
				|| between(codepoint, 0xC0, 0xD6) || between(codepoint, 0xD8, 0xF6) || between(codepoint, 0xF8, 0x2FF) || between(codepoint, 0x370, 0x37D) || between(codepoint, 0x37F, 0x1FFF)
				|| between(codepoint, 0x200C, 0x200D) || between(codepoint, 0x2070, 0x218F) || between(codepoint, 0x2C00, 0x2FEF) || between(codepoint, 0x3001, 0xD7FF)
				|| between(codepoint, 0xF900, 0xFDCF) || between(codepoint, 0xFDF0, 0xFFFD) || between(codepoint, 0x10000, 0xEFFFF);
	}

	/**
	 * @param codepoint Unicode code point to be tested
	 * @param namespaceAware if true, the colon is rejected (it would be interpreted as a prefix separator)
	 * @return true if the code point may appear in an XML name at other than the first position
	 */
	static bool isValidNameChar(gunichar codepoint, bool namespaceAware = true) {
		// NameChar       ::= NameStartChar | "-" | "." | [0-9] 
		//   | #xB7
		//   | [#x0300-#x036F] | [#x203F-#x2040]
		return isValidNameStartChar(codepoint, namespaceAware) || codepoint == '-' || codepoint == '.' || between(codepoint, '0', '9')
				|| codepoint == 0xB7
				|| between(codepoint, 0x0300, 0x036F) || between(codepoint, 0x203F, 0x2040);
	}

public:

	/**
	 * Creates a codec that uses the default escaping character ('_').
	 */
	XMLNameCodec() : esc(DEFAULT_ESCAPING_CHARACTER) {
	}

	/**
	 * @param esc character used for escaping the characters that are not allowed in XML names
	 */
	XMLNameCodec(const char esc) : esc(esc) {
	}

	virtual ~XMLNameCodec() {
	}

	/**
	 * Converts the given string to a form usable as an XML name (see the class description for the rules).
	 *
	 * @param name any string, may be empty
	 * @return the encoded name; "_" for an empty input
	 */
	Glib::ustring encode(Glib::ustring name) const {
		if (name.empty()) {
			return "_";
		}

		// The result is assembled directly as a Glib::ustring:
		// streaming a ustring into a std::ostream would convert it to the locale encoding (and might throw).
		Glib::ustring result;

		// Iterators are used instead of name[i], because indexed access to an UTF-8 string is not a constant-time operation.
		const Glib::ustring::const_iterator begin = name.begin();
		const Glib::ustring::const_iterator end = name.end();

		for (Glib::ustring::const_iterator it = begin; it != end; ++it) {
			const gunichar codepoint = *it;
			const bool first = (it == begin);

			if (codepoint == static_cast<gunichar>(esc)) {
				// the escaping character itself is escaped by doubling
				result += esc;
				result += esc;
			} else if (first ? isValidNameStartChar(codepoint) : isValidNameChar(codepoint)) {
				result += codepoint;
			} else {
				// the first character of the result must be a valid NameStartChar, so the escape sequence is prefixed
				if (first) result += '_';
				result += esc;
				result += Glib::ustring::format(std::hex, std::setfill(L'0'), std::setw(2), codepoint);
				result += esc;
			}
		}

		return result;
	}


};

}
}
}