temporary copy INIReader.h, INIReader.cpp, INIContentHandler.h from relpipe-in-ini + XMLNameCodec.h from relpipe-in-yamltable (will be moved to alt2xml and shared) v_0
author František Kučera <franta-hg@frantovo.cz>
Sun, 22 Nov 2020 17:11:12 +0100
branch v_0
changeset 16 b9a3c806468a
parent 15 f4fb07ed8753
child 17 786977554fc3
temporary copy INIReader.h, INIReader.cpp, INIContentHandler.h from relpipe-in-ini + XMLNameCodec.h from relpipe-in-yamltable (will be moved to alt2xml and shared)
nbproject/configurations.xml
src/CMakeLists.txt
src/lib/INIContentHandler.h
src/lib/INIReader.cpp
src/lib/INIReader.h
src/lib/XMLNameCodec.h
--- a/nbproject/configurations.xml	Sun Nov 22 17:06:17 2020 +0100
+++ b/nbproject/configurations.xml	Sun Nov 22 17:11:12 2020 +0100
@@ -42,6 +42,9 @@
   <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT">
     <df root="." name="0">
       <df name="src">
+        <df name="lib">
+          <in>INIReader.cpp</in>
+        </df>
         <in>XMLDocumentConstructor.h</in>
         <in>relpipe-in-xmltable.cpp</in>
       </df>
@@ -102,6 +105,10 @@
           <preBuildFirst>true</preBuildFirst>
         </preBuild>
       </makefileType>
+      <item path="src/lib/INIReader.cpp" ex="false" tool="1" flavor2="0">
+        <ccTool flags="0">
+        </ccTool>
+      </item>
       <item path="src/relpipe-in-xmltable.cpp" ex="false" tool="1" flavor2="0">
         <ccTool flags="0">
         </ccTool>
--- a/src/CMakeLists.txt	Sun Nov 22 17:06:17 2020 +0100
+++ b/src/CMakeLists.txt	Sun Nov 22 17:11:12 2020 +0100
@@ -29,6 +29,7 @@
 # Executable output:
 add_executable(
 	${EXECUTABLE_FILE}
+	lib/INIReader.cpp
 	relpipe-in-xmltable.cpp
 )
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/INIContentHandler.h	Sun Nov 22 17:11:12 2020 +0100
@@ -0,0 +1,50 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+#include <cstdint>
+#include <string>
+
+/** Callback interface through which an INI reader reports the parsed content. */
+class INIContentHandler {
+public:
+	/** Data common to all events; both numbers stay -1 until a reader assigns them. */
+	struct Event {
+		std::int64_t eventNumber = -1;
+		std::int64_t lineNumber = -1;
+		std::string comment;
+	};
+
+	/** Passed to startSection(); name is the text between the square brackets. */
+	struct SectionStartEvent : Event {
+		std::string name;
+	};
+
+	/** Passed to entry() for a key = value line. */
+	struct EntryEvent : Event {
+		std::string key; // key without the optional [subKey] suffix
+		std::string subKey; // text inside the brackets following the key; empty if there are none
+		std::string fullKey; // key including the [subKey] suffix
+		std::string value;
+	};
+
+	virtual ~INIContentHandler() = default;
+	virtual void startDocument() = 0;
+	virtual void endDocument() = 0;
+	virtual void startSection(const SectionStartEvent& event) = 0;
+	virtual void endSection() = 0;
+	virtual void entry(const EntryEvent& event) = 0;
+};
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/INIReader.cpp	Sun Nov 22 17:11:12 2020 +0100
@@ -0,0 +1,105 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <vector>
+#include <regex>
+
+#include "INIReader.h"
+
+/** Line-oriented INI parser: matches every line against regular expressions and notifies the handlers. */
+class INIReaderImpl : public INIReader {
+private:
+	std::istream& input;
+	std::vector<INIContentHandler*> handlers; // borrowed pointers, never deleted by this class
+public:
+	explicit INIReaderImpl(std::istream& input) : input(input) {
+	}
+
+	void addHandler(INIContentHandler* handler) override {
+		handlers.push_back(handler);
+	}
+
+	/** Reads the input to its end; blank, comment-only and unrecognized lines produce no event. */
+	void process() override {
+		for (INIContentHandler* handler : handlers) handler->startDocument();
+		// Groups of the entry patterns: 1 = full key, 2 = key, 4 = sub-key, 5 = value, 8 = comment (quoted variants only).
+		const std::regex whitespacePattern("\\s*");
+		const std::regex commentPattern("\\s*(;|#)\\s*(.*)");
+		const std::regex sectionPattern("\\s*\\[\\s*([^\\]]+)\\s*\\]\\s*");
+		// A quoted value ends at the next quote; the pattern used to exclude apostrophes instead of quotes.
+		const std::regex entryQuotesPattern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*\"([^\"]+)\"\\s*((;|#)\\s*(.*))?");
+		const std::regex entryApostrophesPattern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*'([^']+)'\\s*((;|#)\\s*(.*))?");
+		const std::regex entryPlainPattern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(.*)");
+
+		std::smatch match;
+		bool inSection = false;
+		std::string line;
+		int lineNumber = 0;
+		int eventNumber = 0;
+
+		while (std::getline(input, line)) {
+			lineNumber++;
+			// Strip the CR of a CRLF line end: '.' does not match CR, so plain entries of such files were silently skipped.
+			if (!line.empty() && line.back() == '\r') line.pop_back();
+
+			if (std::regex_match(line, match, whitespacePattern)) {
+				// TODO: support also whitespace
+			} else if (std::regex_match(line, match, commentPattern)) {
+				// TODO: support also comments + emit also the comment style (;/#)
+			} else if (std::regex_match(line, match, sectionPattern)) {
+				if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
+				inSection = true;
+				INIContentHandler::SectionStartEvent event;
+				event.lineNumber = lineNumber;
+				event.eventNumber = ++eventNumber;
+				event.name = match[1];
+				// TODO: support also comments + emit also the comment style (;/#)
+				for (INIContentHandler* handler : handlers) handler->startSection(event);
+			} else if (std::regex_match(line, match, entryQuotesPattern) || std::regex_match(line, match, entryApostrophesPattern) || std::regex_match(line, match, entryPlainPattern)) {
+				INIContentHandler::EntryEvent event;
+				event.lineNumber = lineNumber;
+				event.eventNumber = ++eventNumber;
+				event.key = match[2];
+				event.subKey = match[4];
+				event.fullKey = match[1];
+				event.value = match[5];
+				if (match.size() == 9) event.comment = match[8]; // the plain pattern has no comment group
+				// TODO: emit also the quote style ('/"/) and surrounding whitespace
+				for (INIContentHandler* handler : handlers) handler->entry(event);
+			} else {
+				// TODO: warning, error, or support unknown content
+			}
+
+			// TODO: probably switch to state-machine approach instead of regular expressions
+			// TODO: warning/error handler
+			// TODO: support also multiline content (\ + \n)
+			// TODO: support also quoted or multiline keys?
+			// TODO: support also escaped characters
+			// TODO: support also Java .properties and manifest.mf formats?
+			// TODO: support also nested sections – hierarchy
+			// TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections
+			// TODO: support also option for alternative key-value separator (: instead of =)
+			// TODO: support also other encodings (currently only UTF-8 is supported)
+		}
+		if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
+		for (INIContentHandler* handler : handlers) handler->endDocument();
+	}
+};
+
+INIReader* INIReader::create(std::istream& input) { // factory hiding the implementation class
+	return new INIReaderImpl(input); // heap-allocated; nothing in this file deletes it, so the caller has to
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/INIReader.h	Sun Nov 22 17:11:12 2020 +0100
@@ -0,0 +1,33 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <string>
+#include <istream>
+
+#include "INIContentHandler.h"
+
+/**
+ * TODO: Files in the src/lib directory will be moved to alt2xml and used as a shared library.
+ */
+class INIReader { // abstract INI reader; instances are obtained from create()
+public:
+	virtual ~INIReader() = default;
+	virtual void addHandler(INIContentHandler* handler) = 0; // registers a handler that process() will notify
+	virtual void process() = 0; // parses the input and reports its content to the registered handlers
+	static INIReader* create(std::istream& input); // factory; returns a reader that will read from input
+};
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/XMLNameCodec.h	Sun Nov 22 17:11:12 2020 +0100
@@ -0,0 +1,127 @@
+/**
+ * Relational pipes
+ * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+
+#include <glibmm-2.4/glibmm/ustring.h>
+
+namespace relpipe {
+namespace in {
+namespace xmltable {
+
+/**
+ * Encodes arbitrary strings as valid XML names: a character not allowed at its position is replaced by its
+ * hexadecimal code point enclosed in the escaping character; the escaping character itself is doubled.
+ */
+class XMLNameCodec {
+private:
+	static const char DEFAULT_ESCAPING_CHARACTER = '_';
+	const char esc;
+	const bool namespaceAware;
+
+	bool between(gunichar codepoint, gunichar start, gunichar end) const {
+		return codepoint >= start && codepoint <= end;
+	}
+
+	/**
+	 * https://www.w3.org/TR/REC-xml/#NT-NameStartChar
+	 *
+	 * @param codepoint unicode character
+	 * @return whether this character is allowed at the beginning of a XML name
+	 */
+	bool isValidNameStartChar(gunichar codepoint) const {
+		// NameStartChar  ::= ":" | [A-Z] | "_" | [a-z]
+		//   | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
+		//   | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF]
+		//   | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+		return (codepoint == ':' && !namespaceAware) || between(codepoint, 'A', 'Z') || codepoint == '_' || between(codepoint, 'a', 'z')
+				|| between(codepoint, 0xC0, 0xD6) || between(codepoint, 0xD8, 0xF6) || between(codepoint, 0xF8, 0x2FF) || between(codepoint, 0x370, 0x37D) || between(codepoint, 0x37F, 0x1FFF)
+				|| between(codepoint, 0x200C, 0x200D) || between(codepoint, 0x2070, 0x218F) || between(codepoint, 0x2C00, 0x2FEF) || between(codepoint, 0x3001, 0xD7FF)
+				|| between(codepoint, 0xF900, 0xFDCF) || between(codepoint, 0xFDF0, 0xFFFD) || between(codepoint, 0x10000, 0xEFFFF);
+	}
+
+	/**
+	 * https://www.w3.org/TR/REC-xml/#NT-NameChar
+	 *
+	 * @param codepoint unicode character
+	 * @return whether this character is allowed in a XML name
+	 */
+	bool isValidNameChar(gunichar codepoint) const {
+		// NameChar       ::= NameStartChar | "-" | "." | [0-9]
+		//   | #xB7
+		//   | [#x0300-#x036F] | [#x203F-#x2040]
+		return isValidNameStartChar(codepoint) || codepoint == '-' || codepoint == '.' || between(codepoint, '0', '9')
+				|| codepoint == 0xB7
+				|| between(codepoint, 0x0300, 0x036F) || between(codepoint, 0x203F, 0x2040);
+	}
+
+public:
+	XMLNameCodec() : XMLNameCodec(DEFAULT_ESCAPING_CHARACTER, true) {
+	}
+
+	/**
+	 * @param escapingCharacter must be valid character allowed not only in the middle of the XML name but also as the
+	 * first character of the name
+	 * @param namespaceAware colon character is reserved as a separator of the prefix and the local name, see
+	 * https://www.w3.org/TR/REC-xml-names/#NT-NCName
+	 * @throws std::invalid_argument if escapingCharacter is not valid
+	 */
+	XMLNameCodec(const char escapingCharacter, const bool namespaceAware) : esc(escapingCharacter), namespaceAware(namespaceAware) {
+		// TODO: allow also characters like #xB7 and add another escaping if they occur at the beginning of the name?
+		if (!isValidNameStartChar(esc)) {
+			// std::to_string(char) would print the numeric code, so a one-character string is built instead.
+			throw std::invalid_argument("The character „" + std::string(1, escapingCharacter) + "“ is not allowed at the beginning of a XML name and thus not usable for escaping");
+		}
+	}
+
+	virtual ~XMLNameCodec() {
+	}
+
+	/**
+	 * @param name any string
+	 * @return valid name of XML element or attribute
+	 */
+	Glib::ustring encode(Glib::ustring name) const {
+		if (name.empty()) return Glib::ustring(1, esc);
+
+		// Appending to a ustring keeps UTF-8; streaming a ustring into std::ostream converts it to the locale charset.
+		Glib::ustring result;
+		bool first = true;
+		// Iterating visits each code point once, whereas indexing a UTF-8 ustring is linear per access.
+		for (const gunichar codepoint : name) {
+			if (codepoint == esc) {
+				result.push_back(esc);
+				result.push_back(esc);
+			} else if (first ? isValidNameStartChar(codepoint) : isValidNameChar(codepoint)) {
+				result.push_back(codepoint);
+			} else {
+				result.push_back(esc);
+				result += Glib::ustring::format(std::hex, std::setfill(L'0'), std::setw(2), codepoint);
+				result.push_back(esc);
+			}
+			first = false;
+		}
+		return result;
+	}
+};
+
+}
+}
+}