unescape also Java .properties encoding (\uXXXX) v_0
author František Kučera <franta-hg@frantovo.cz>
Thu, 26 Nov 2020 11:42:26 +0100
branch v_0
changeset 27 fd669e73d39a
parent 26 80e129ec3408
child 28 0e7c57d48d1e
unescape also Java .properties encoding (\uXXXX)
src/XMLDocumentConstructor.h
src/lib/JavaPropertiesUnescapingINIHandler.h
src/lib/UnescapingINIHandler.h
--- a/src/XMLDocumentConstructor.h	Wed Nov 25 21:50:26 2020 +0100
+++ b/src/XMLDocumentConstructor.h	Thu Nov 26 11:42:26 2020 +0100
@@ -21,6 +21,7 @@
 
 #include "lib/INIReader.h"
 #include "lib/BasicUnescapingINIHandler.h"
+#include "lib/JavaPropertiesUnescapingINIHandler.h"
 #include "lib/XMLNameCodec.h"
 
 using namespace relpipe::in::ini::lib;
@@ -110,8 +111,9 @@
 	void process() {
 		HierarchicalINIContentHandler handler(parser);
 		std::shared_ptr<INIReader> reader(INIReader::create(*input));
-		BasicUnescapingINIContentHandler unescapingHandler(handler, true);
-		reader->addHandler(&unescapingHandler);
+		BasicUnescapingINIContentHandler unescapingHandler(handler, false);
+		JavaPropertiesUnescapingINIContentHandler javaHandler(handler, true);
+		reader->addHandler(&javaHandler);
 		reader->process();
 	}
 };
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/JavaPropertiesUnescapingINIHandler.h	Thu Nov 26 11:42:26 2020 +0100
@@ -0,0 +1,105 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+#include <codecvt>
+#include <arpa/inet.h>
+
+#include "UnescapingINIHandler.h"
+
+using namespace std;
+using namespace relpipe::writer;
+
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
+/** Unescapes Java .properties sequences; \uXXXX escapes are decoded to UTF-8 and UTF-16 surrogate pairs are joined. */
+class JavaPropertiesUnescapingINIContentHandler : public UnescapingINIContentHandler {
+private:
+	wstring_convert < codecvt_utf8<wchar_t>> convertor; // INI parser works with UTF-8
+
+	/**
+	 * Parses the four hexadecimal digits of a \uXXXX escape sequence.
+	 * @param s escaped text
+	 * @param pos index of the first hexadecimal digit in s
+	 * @return value of the digits, i.e. one UTF-16 code unit (0x0000 – 0xFFFF)
+	 * @throws std::logic_error if fewer than four characters follow or one of them is not a hexadecimal digit
+	 */
+	static uint32_t readCodeUnit(const std::string& s, int pos) {
+		const int hexLength = 4;
+		if (pos + hexLength > static_cast<int> (s.size())) throw std::logic_error(std::string("Invalid unicode escape sequence: missing characters"));
+		uint32_t unit = 0;
+		for (int i = pos; i < pos + hexLength; i++) {
+			const char c = s[i];
+			if (c >= '0' && c <= '9') unit = unit * 16 + (c - '0');
+			else if (c >= 'a' && c <= 'f') unit = unit * 16 + (c - 'a' + 10);
+			else if (c >= 'A' && c <= 'F') unit = unit * 16 + (c - 'A' + 10);
+			else throw std::logic_error(std::string("Invalid unicode escape sequence: invalid HEX"));
+		}
+		return unit;
+	}
+
+protected:
+	/**
+	 * Decodes \uXXXX escapes to UTF-8; a high surrogate directly followed by an escaped low surrogate yields a single character.
+	 * Other sequences: \\ is unescaped in the last phase; in earlier phases escape sequences are kept for later unescaping.
+	 * @throws std::logic_error on a malformed \uXXXX, a trailing lone backslash or (in the last phase) an unsupported sequence
+	 */
+	virtual std::string unescape(const std::string& s) {
+		std::stringstream result;
+		for (int i = 0, length = s.size(); i < length; i++) {
+			char ch = s[i];
+			if (i + 1 < length && ch == ESC) {
+				ch = s[i + 1];
+				if (ch == 'u') {
+					uint32_t codePoint = readCodeUnit(s, i + 2);
+					i += 5; // i now points at the last hexadecimal digit of the escape sequence
+					// Java writes characters above U+FFFF as two escapes (UTF-16 surrogate pair); join them before the conversion to UTF-8
+					if (codePoint >= 0xD800 && codePoint <= 0xDBFF && i + 6 < length && s[i + 1] == ESC && s[i + 2] == 'u') {
+						uint32_t low = readCodeUnit(s, i + 3);
+						if (low >= 0xDC00 && low <= 0xDFFF) {
+							codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (low - 0xDC00);
+							i += 6; // skip the second escape sequence, it was consumed as the low surrogate
+						}
+					}
+					result << convertor.to_bytes(static_cast<wchar_t> (codePoint)); // NOTE(review): assumes wchar_t can hold any code point (32-bit wchar_t) – confirm for target platforms
+				} else if (ch == ESC && !lastEscaphingPhase) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle
+				else if (ch == ESC && lastEscaphingPhase) put(result, ESC, i); // unescape \\ to \.
+				else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch);
+				else result.put(ESC); // keep the escape sequence for later unescaping phase
+			} else if (ch == ESC) {
+				throw std::logic_error(std::string("Missing escape sequence")); // this should not happen
+			} else {
+				result.put(ch);
+			}
+		}
+		return result.str();
+	}
+
+public:
+	JavaPropertiesUnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : UnescapingINIContentHandler(output, lastEscaphingPhase, true) {
+	}
+
+};
+
+}
+}
+}
+}
--- a/src/lib/UnescapingINIHandler.h	Wed Nov 25 21:50:26 2020 +0100
+++ b/src/lib/UnescapingINIHandler.h	Thu Nov 26 11:42:26 2020 +0100
@@ -31,6 +31,7 @@
 class UnescapingINIContentHandler : public INIContentHandler {
 private:
 	INIContentHandler& output;
+	bool unescapeComments;
 
 protected:
 	const char ESC = '\\';
@@ -53,7 +54,7 @@
 	 * in the last phase, all remaining sequences (including \\) must be recognized and unescaped
 	 * (otherwise the input is considered invalid and an exception is thrown)
 	 */
-	UnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : output(output), lastEscaphingPhase(lastEscaphingPhase) {
+	UnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase, bool unescapeComments = false) : output(output), lastEscaphingPhase(lastEscaphingPhase), unescapeComments(unescapeComments) {
 	}
 
 	void startDocument() override {
@@ -67,6 +68,7 @@
 	void startSection(const SectionStartEvent& event) override {
 		SectionStartEvent e = event;
 		e.name = unescape(e.name);
+		if (unescapeComments) e.comment = unescape(e.comment);
 		output.startSection(e);
 	}
 
@@ -80,11 +82,18 @@
 		e.fullKey = unescape(e.fullKey);
 		e.subKey = unescape(e.subKey);
 		e.value = unescape(e.value);
+		if (unescapeComments) e.comment = unescape(e.comment);
 		output.entry(e);
 	}
 
 	void comment(const CommentEvent& event) override {
-		output.comment(event);
+		if (unescapeComments) {
+			CommentEvent e = event;
+			e.comment = unescape(e.comment);
+			output.comment(e);
+		} else {
+			output.comment(event);
+		}
 	}
 
 	void whitespace(const WhitespaceEvent& event) override {