configurable unescaping processors v_0
authorFrantišek Kučera <franta-hg@frantovo.cz>
Sat, 28 Nov 2020 18:14:15 +0100
branchv_0
changeset 28 0e7c57d48d1e
parent 27 fd669e73d39a
child 29 06aaad12c207
configurable unescaping processors
src/XMLDocumentConstructor.h
src/lib/BackspaceUnescapingProcessor.h
src/lib/BasicUnescapingINIHandler.h
src/lib/BasicUnescapingProcessor.h
src/lib/INIReader.cpp
src/lib/INIReader.h
src/lib/JavaPropertiesUnescapingINIHandler.h
src/lib/JavaPropertiesUnescapingProcessor.h
src/lib/UnescapingINIHandler.h
src/lib/UnescapingProcessor.h
--- a/src/XMLDocumentConstructor.h	Thu Nov 26 11:42:26 2020 +0100
+++ b/src/XMLDocumentConstructor.h	Sat Nov 28 18:14:15 2020 +0100
@@ -20,8 +20,9 @@
 #include <libxml++-2.6/libxml++/libxml++.h>
 
 #include "lib/INIReader.h"
-#include "lib/BasicUnescapingINIHandler.h"
-#include "lib/JavaPropertiesUnescapingINIHandler.h"
+#include "lib/BasicUnescapingProcessor.h"
+#include "lib/BackspaceUnescapingProcessor.h"
+#include "lib/JavaPropertiesUnescapingProcessor.h"
 #include "lib/XMLNameCodec.h"
 
 using namespace relpipe::in::ini::lib;
@@ -111,9 +112,14 @@
 	void process() {
 		HierarchicalINIContentHandler handler(parser);
 		std::shared_ptr<INIReader> reader(INIReader::create(*input));
-		BasicUnescapingINIContentHandler unescapingHandler(handler, false);
-		JavaPropertiesUnescapingINIContentHandler javaHandler(handler, true);
-		reader->addHandler(&javaHandler);
+		reader->addUnescapingProcessor(std::make_shared<BasicUnescapingProcessor>(), "unescape-basic", true);
+		reader->addUnescapingProcessor(std::make_shared<JavaPropertiesUnescapingProcessor>(), "unescape-java-properties", false);
+		reader->addUnescapingProcessor(std::make_shared<BackspaceUnescapingProcessor>(false), "unescape-backspace-disorder", false);
+		reader->addUnescapingProcessor(std::make_shared<BackspaceUnescapingProcessor>(), "unescape-backspace", true);
+		reader->addHandler(&handler);
+		// TODO: smart pointers vs. references: are we going to call addUnescapingProcessor() dynamically/conditionally or share instances? Then pointers will be better.
+		// TODO: call setOption() according to the configuration
+		// for (ParserOptionRecipe option : configuration.parserOptions) reader->setOption(convertor.to_bytes(option.uri), convertor.to_bytes(option.value));
 		reader->process();
 	}
 };
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/BackspaceUnescapingProcessor.h	Sat Nov 28 18:14:15 2020 +0100
@@ -0,0 +1,70 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+
+#include "UnescapingProcessor.h"
+
+using namespace std;
+using namespace relpipe::writer;
+
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
+class BackspaceUnescapingProcessor : public UnescapingProcessor { // Unescapes only ESC+ESC pairs. NOTE(review): "Backspace" in the name presumably means backslash — confirm naming.
+private:
+	const bool lastEscaphingPhase = true; // when true, any escape sequence other than ESC+ESC is rejected with std::logic_error
+public:
+
+	std::string unescape(const std::string& s, const TextType type) override { // returns s with ESC+ESC collapsed; type is not used by this processor
+		std::stringstream result;
+		for (int i = 0, length = s.size(); i < length; i++) {
+			char ch = s[i];
+			if (i + 1 < length && ch == ESC) {
+				ch = s[i + 1];
+				if (ch == ESC) put(result, ESC, i); // unescape \\ to \. (put() is defined outside this file; presumably it also advances i past the second character — TODO confirm)
+				else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch);
+				else result.put(ESC); // keep the escape sequence for later unescaping phase
+			} else if (ch == ESC) {
+				throw std::logic_error(std::string("Missing escape sequence")); // this should not happen: ESC is the last character, nothing follows it
+			} else {
+				result.put(ch);
+			}
+		}
+		return result.str();
+	}
+
+	/**
+	 * @param lastEscaphingPhase whether this is the final unescaping stage.
+	 * By default it is set to true, thus no unrecognized escape sequences may be left after this stage.
+	 * Setting this to false is dangerous and may lead to errors and ambiguous behavior.
+	 * It should be used only as a last resort.
+	 * Because both "\\ \xxx" and "\ \xxx" will be converted to "\ \xxx" and the information will be lost.
+	 * So, it is usually better to keep the "\" escaped as "\\" and process both the escaped backslashes and unrecognized escape sequences later.
+	 */
+	BackspaceUnescapingProcessor(bool lastEscaphingPhase = true) : lastEscaphingPhase(lastEscaphingPhase) {
+	}
+
+};
+
+}
+}
+}
+}
--- a/src/lib/BasicUnescapingINIHandler.h	Thu Nov 26 11:42:26 2020 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,74 +0,0 @@
-/**
- * Relational pipes
- * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-#pragma once
-
-#include <sstream>
-
-#include "UnescapingINIHandler.h"
-
-using namespace std;
-using namespace relpipe::writer;
-
-namespace relpipe {
-namespace in {
-namespace ini {
-namespace lib {
-
-class BasicUnescapingINIContentHandler : public UnescapingINIContentHandler {
-protected:
-
-	virtual std::string unescape(const std::string& s) {
-		std::stringstream result;
-		for (int i = 0, length = s.size(); i < length; i++) {
-			char ch = s[i];
-			if (i + 1 < length && ch == ESC) {
-				ch = s[i + 1];
-				if (ch == 'n') put(result, '\n', i);
-				else if (ch == 'r') put(result, '\r', i);
-				else if (ch == 't') put(result, '\t', i);
-				else if (ch == 's') put(result, ' ', i); // TODO: Reconsider what is „basic“ escaping and should be supported.
-				else if (ch == '"') put(result, ch, i); //        The delimiters (\n,]",') are already unescaped during the first stage in the INIReader while parsing (the delimiter relevant to given environment is unescaped, e.g. \" in "quoted" value).
-				else if (ch == '\'') put(result, ch, i); //       So it does not necessary to do it here. But someone might write a="xxx\'zzz" however it is superfluous because a="xxx'zzz" will also work.
-				else if (ch == ']') put(result, ch, i);
-				else if (ch == ':') put(result, ch, i);
-				else if (ch == ';') put(result, ch, i);
-				else if (ch == '#') put(result, ch, i);
-				else if (ch == '=') put(result, ch, i);
-				else if (ch == ESC && !lastEscaphingPhase) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle
-				else if (ch == ESC && lastEscaphingPhase) put(result, ESC, i); // unescape \\ to \.
-				else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch);
-				else result.put(ESC); // keep the escape sequence for later unescaping phase
-			} else if (ch == ESC) {
-				throw std::logic_error(std::string("Missing escape sequence")); // this should not happen
-			} else {
-				result.put(ch);
-			}
-		}
-		return result.str();
-	}
-
-public:
-
-	BasicUnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : UnescapingINIContentHandler(output, lastEscaphingPhase) {
-	}
-
-};
-
-}
-}
-}
-}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/BasicUnescapingProcessor.h	Sat Nov 28 18:14:15 2020 +0100
@@ -0,0 +1,67 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+
+#include "UnescapingProcessor.h"
+
+using namespace std;
+using namespace relpipe::writer;
+
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
+class BasicUnescapingProcessor : public UnescapingProcessor { // Unescapes the common single-character sequences listed below; unknown sequences are passed through unchanged.
+public:
+
+	std::string unescape(const std::string& s, const TextType type) override { // type is not used by this processor
+		std::stringstream result;
+		for (int i = 0, length = s.size(); i < length; i++) {
+			char ch = s[i];
+			if (i + 1 < length && ch == ESC) {
+				ch = s[i + 1];
+				if (ch == 'n') put(result, '\n', i);
+				else if (ch == 'r') put(result, '\r', i);
+				else if (ch == 't') put(result, '\t', i);
+				else if (ch == 's') put(result, ' ', i); // TODO: Reconsider what is „basic“ escaping and should be supported.
+				else if (ch == '"') put(result, ch, i); //        The delimiters (\n,]",') are already unescaped during the first stage in the INIReader while parsing (the delimiter relevant to given environment is unescaped, e.g. \" in "quoted" value).
+				else if (ch == '\'') put(result, ch, i); //       So it is not necessary to do it here. But someone might write a="xxx\'zzz" however it is superfluous because a="xxx'zzz" will also work.
+				else if (ch == ']') put(result, ch, i);
+				else if (ch == ':') put(result, ch, i);
+				else if (ch == ';') put(result, ch, i);
+				else if (ch == '#') put(result, ch, i);
+				else if (ch == '=') put(result, ch, i);
+				else if (ch == ESC) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle
+				else result.put(ESC); // keep the escape sequence for later unescaping phase
+			} else if (ch == ESC) {
+				throw std::logic_error(std::string("Missing escape sequence")); // this should not happen: ESC is the last character, nothing follows it
+			} else {
+				result.put(ch);
+			}
+		}
+		return result.str();
+	}
+	
+};
+
+}
+}
+}
+}
--- a/src/lib/INIReader.cpp	Thu Nov 26 11:42:26 2020 +0100
+++ b/src/lib/INIReader.cpp	Sat Nov 28 18:14:15 2020 +0100
@@ -32,21 +32,47 @@
 	std::istream& input;
 	std::vector<INIContentHandler*> handlers;
 
+	class ConfiguredUnescapingProcessor { // one registered unescaping step together with its option name and on/off state
+	public:
+		std::shared_ptr<UnescapingProcessor> processor; // the processor whose unescape() is invoked
+		const std::string uri; // option name compared against the uri passed to setUnescaping()
+		bool enbaled; // [sic] whether unescape() applies this processor; the spelling is kept because other members refer to this name
+
+		ConfiguredUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enbaled) : processor(processor), uri(uri), enbaled(enbaled) {
+		}
+
+	};
+
+	std::vector<ConfiguredUnescapingProcessor> unescapingProcessors;
+
 	/** 
-	 * This might be configurable.
-	 * 
 	 * By default, we ignore all leading whitespace on continuing lines.
 	 * If there should be some spaces or tabs, they should be placed on the previous line before the „\“.
 	 * If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '.
 	 * 
+	 * TODO: several options:
+	 *  - enabled, disabled
+	 *  - if disabled, then: keep backslash, trim backslash, escape backslash
+	 *    (keep requires support in some further unescaping phase, or it will cause an error)
+	 *  - keep or trim the line end
+	 *  - keep or trim the leading spaces
+	 *  - allow comments interleaved with continuing lines (the freaky systemd syntax)
+	 * 
 	 * Related specifications:
 	 *  - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html
+	 *  - https://www.freedesktop.org/software/systemd/man/systemd.syntax.html
 	 */
-	bool consumeLeadingSpacesOnContinuingLines = true;
+	bool trimLeadingSpacesOnContinuingLines = true;
+
 
 	/**
-	 * This might be configurable.
-	 * 
+	 * Some dialects, or configuration files in general, do not support sections.
+	 * Then a line that looks like an INI section should be interpreted as a key
+	 * (or as an error, if it does not have a proper key-value separator).
+	 */
+	bool allowSections = true;
+
+	/**
 	 * KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
 	 * Line „[section_1][$i]“ means that the „section_1“ is „locked“.
 	 * We may emit this information somehow later, but for now, it is just ignored.
@@ -59,8 +85,6 @@
 	bool allowSectionTags = true;
 
 	/**
-	 * This might be configurable.
-	 * 
 	 * If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key.
 	 * No \[ escaping is currently supported, so the key might not contain the bracket character.
 	 * 
@@ -70,6 +94,35 @@
 	 */
 	bool allowSubKeys = true;
 
+	/**
+	 * Classic INI uses „key=value“ syntax.
+	 * But some other formats/dialects might use key:value.
+	 * 
+	 * Only single character separators are supported.
+	 * If multiple separators should be recognized (e.g. both „=“ and „:“), this string will contain all of them,
+	 * i.e. „:=“ does not mean the „key:=value“ syntax, but „key=value“ or „key:value“.
+	 */
+	std::string keyValueSeparators = "=";
+
+	/**
+	 * Classic INI uses „; comment“ syntax.
+	 * But many existing files contain „# comment“ lines.
+	 * 
+	 * Only single character separators are supported (works same as keyValueSeparators).
+	 */
+	std::string commentSeparators = ";#";
+
+	/**
+	 * INI files often support both "quotes" and 'apostrophes' styles.
+	 * But some dialects may support only one of them or not support quoting at all.
+	 * 
+	 * In such a case, e.g. „key="some value"“ would mean that the value is „"some value"“ (including the quotes).
+	 * Thus it is important to allow disabling quote recognition (which is done by setting this parameter to an empty string).
+	 * 
+	 * Only single character quotes are supported (works same as keyValueSeparators).
+	 */
+	std::string quotes = "\"'";
+
 	int lineNumber = 1;
 	int eventNumber = 0;
 
@@ -111,30 +164,35 @@
 	}
 
 	void processContinuingLine(std::stringstream& result) {
-		if (consumeLeadingSpacesOnContinuingLines) readSpacesAndTabs();
+		if (trimLeadingSpacesOnContinuingLines) readSpacesAndTabs();
 		else result.put('\n');
 	}
 
-	std::string readUntil(char until, bool* found = nullptr) {
+	std::string readUntil(const char until, bool* found = nullptr) { // convenience overload for a single delimiter character
+		return readUntil(std::string(1, until), found); // delegates to the std::string overload with a one-character delimiter set
+	}
+
+	std::string readUntil(const std::string& until, bool* found = nullptr) {
 		std::stringstream result;
 
-		for (char ch = peek(); input.good() && ch != until; ch = peek()) {
+		for (char ch = peek(); input.good() && !oneOf(ch, until); ch = peek()) {
 			if (ch == '\\') {
 				get();
 				ch = get();
-				if (ch == until && ch == '\n') processContinuingLine(result);
-				else if (ch == until) result.put(ch);
+				if (oneOf(ch, until) && ch == '\n') processContinuingLine(result);
+				else if (oneOf(ch, until)) result.put(ch);
 				else if (ch == std::istream::traits_type::eof()) break;
 				else result.put('\\').put(ch);
-				// TODO: two-stage and modular unescaping: here unescape only \+LF or more genereally: unescape only the until character and rest leave untouched
-				// second escaping stage move to separate class/wrapper (similar to hierarchical wrappers)
+				// unescaping is done in two phases:
+				// here we unescape just the \n (LF)
+				// other escape sequences are left untouched and will be processed in later phases, see UnescapingINIHandler
 			} else {
 				ch = get();
 				result.put(ch);
 			}
 		}
 
-		if (peek() == until) {
+		if (oneOf(peek(), until)) {
 			get();
 			if (found) *found = true;
 		} else {
@@ -144,13 +202,17 @@
 		return result.str();
 	}
 
-	std::string readToken(char until, char* quote = nullptr, bool* found = nullptr) {
+	std::string readToken(const char until, char* quote = nullptr, bool* found = nullptr) { // convenience overload for a single terminator character
+		return readToken(std::string(1, until), quote, found); // delegates to the std::string overload with a one-character terminator set
+	}
+
+	std::string readToken(const std::string& until, char* quote = nullptr, bool* found = nullptr) {
 		std::string result;
 
 		char ch = peek();
 		if (isQuote(ch)) {
 			if (quote) *quote = ch;
-			result = readUntil(get(), found);
+			result = readUntil(std::string(1, get()), found);
 		} else {
 			if (quote) *quote = 0;
 			result = readUntil(until, found);
@@ -160,35 +222,107 @@
 	}
 
 	std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) {
+		return readTokenAndEatTerminator(std::string(1, until), quote, found);
+	}
+
+	std::string readTokenAndEatTerminator(const std::string& until, char* quote = nullptr, bool* found = nullptr) {
 		std::string result = readToken(until, quote, found);
 		if (*quote) {
 			readAllWhitespace();
-			if (get() != until) throw std::logic_error(std::string("missing „") + std::string(1, until) + "“ after quoted section name");
+			if (!oneOf(get(), until)) throw std::logic_error(std::string("missing „") + until + "“ after quoted section name");
 		}
 		return result;
 	}
 
+	std::string unescape(const std::string& value, UnescapingProcessor::TextType type) { // passes value through every enabled processor, in registration order, and returns the result
+		std::string result = value;
+		for (const ConfiguredUnescapingProcessor& p : unescapingProcessors) if (p.enbaled) result = p.processor->unescape(result, type); // by const reference: avoids copying the shared_ptr and the uri string on every step
+		return result;
+	}
+
 	bool isComment(char ch) {
-		return ch == '#' || ch == ';';
+		return oneOf(ch, commentSeparators);
 	}
 
 	bool isQuote(char ch) {
-		return ch == '"' || ch == '\'';
+		return oneOf(ch, quotes);
+	}
+
+	/**
+	 * @param ch character to be evaluated
+	 * @param options list of options (characters); each character of the string is one option
+	 * @return whether ch is one of options (always false when options is empty)
+	 */
+	bool oneOf(char ch, const std::string& options) {
+		return options.find(ch) != std::string::npos;
+	}
 
 	std::string trim(std::string s) {
 		return std::regex_replace(s, std::regex("^\\s+|\\s+$"), "");
 	}
 
+	/**
+	 * Accepts exactly "true" or "false" (case-sensitive); anything else throws std::invalid_argument. TODO: use a common method
+	 */
+	bool parseBoolean(const std::string& value) {
+		if (value == "true") return true;
+		else if (value == "false") return false;
+		else throw std::invalid_argument(std::string("Unable to parse boolean value: ") + value + " (expecting true or false)");
+	}
+
+	void setDialect(const std::string& name) { // applies a named preset of parser options; throws std::invalid_argument for an unknown name
+		if (name == "default-ini") {
+			// already set (NOTE(review): this branch changes nothing, so options modified earlier are not reset to the defaults)
+		} else if (name == "java-properties") {
+			trimLeadingSpacesOnContinuingLines = true;
+			allowSections = false;
+			allowSectionTags = false;
+			allowSubKeys = false;
+			commentSeparators = "#";
+			keyValueSeparators = "=:";
+			quotes = ""; // empty string: no character is treated as a quote
+			// TODO: enable unicode unescaping
+		} else {
+			throw std::invalid_argument(std::string("Unsupported INI dialect: ") + name);
+		}
+	}
+
+	bool setUnescaping(const std::string& uri, const std::string& value) { // switches the processor registered under uri on/off; returns false when no processor has that uri
+		for (ConfiguredUnescapingProcessor& p : unescapingProcessors) {
+			if (p.uri == uri) {
+				p.enbaled = parseBoolean(value); // parseBoolean throws std::invalid_argument unless value is "true" or "false"
+				return true; // only the first processor with a matching uri is updated
+			}
+		}
+		return false;
+	}
+
 public:
 
 	INIReaderImpl(std::istream& input) : input(input) {
 	}
 
+	void setOption(const std::string& uri, const std::string& value) override { // applies one parser option by name; throws std::invalid_argument for an unknown uri
+		if (uri == "trim-continuing-lines") trimLeadingSpacesOnContinuingLines = parseBoolean(value); // TODO: continuing lines modes (enum), not just boolean
+		else if (uri == "allow-sections") allowSections = parseBoolean(value);
+		else if (uri == "allow-section-tags") allowSectionTags = parseBoolean(value);
+		else if (uri == "allow-sub-keys") allowSubKeys = parseBoolean(value);
+		else if (uri == "comment-separators") commentSeparators = value;
+		else if (uri == "key-value-separators") keyValueSeparators = value;
+		else if (uri == "quotes") quotes = value;
+		else if (uri == "dialect") setDialect(value);
+		else if (setUnescaping(uri, value)) { /* uri named a registered unescaping processor; setUnescaping() has already updated it */ }
+		else throw std::invalid_argument(std::string("Invalid parser option: „") + uri + "“ with value: „" + value + "“");
+	}
+
 	void addHandler(INIContentHandler* handler) override {
 		handlers.push_back(handler);
 	}
 
+	void addUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enabledByDefault) override { // registers a processor; uri is the option name setOption() accepts to switch it on/off
+		unescapingProcessors.push_back({processor, uri, enabledByDefault}); // appended at the end, so processors are applied in the order they were added
+	}
+
 	void process() override {
 		for (INIContentHandler* handler : handlers) handler->startDocument();
 
@@ -196,10 +330,10 @@
 
 		while (input.good()) { // TODO: condition
 			{
+				INIContentHandler::WhitespaceEvent event;
+				event.lineNumber = lineNumber;
 				std::string whitespace = readAllWhitespace();
 				if (whitespace.size()) {
-					INIContentHandler::WhitespaceEvent event;
-					event.lineNumber = lineNumber;
 					event.eventNumber = ++eventNumber;
 					event.whitespace = whitespace;
 					for (INIContentHandler* handler : handlers) handler->whitespace(event);
@@ -213,20 +347,23 @@
 
 			if (ch == std::istream::traits_type::eof()) {
 				break;
-			} else if (ch == '[') {
+			} else if (ch == '[' && allowSections) {
 				if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
 				inSection = true;
-				get();
-				readAllWhitespace();
 				INIContentHandler::SectionStartEvent event;
 				event.lineNumber = lineNumber;
 				event.eventNumber = ++eventNumber;
+				get();
+				readAllWhitespace();
 				event.name = readTokenAndEatTerminator(']', &quote, &found);
+				if (!quote) event.name = trim(event.name);
+				event.name = unescape(event.name, UnescapingProcessor::TextType::SectionName);
 
 				readSpacesAndTabs();
 				if (allowSectionTags && peek() == '[') {
 					get();
 					event.tag = readTokenAndEatTerminator(']', &quote, &found);
+					event.tag = unescape(event.tag, UnescapingProcessor::TextType::SectionTag);
 				}
 
 				readSpacesAndTabs();
@@ -235,6 +372,7 @@
 					get();
 					readSpacesAndTabs();
 					event.comment = readUntil('\n', &found);
+					event.comment = unescape(event.comment, UnescapingProcessor::TextType::SectionComment);
 				} else if (ch == '\n') {
 					get();
 				} else {
@@ -243,31 +381,33 @@
 
 				for (INIContentHandler* handler : handlers) handler->startSection(event);
 			} else if (isComment(ch)) {
-				get();
-				readSpacesAndTabs();
 				INIContentHandler::CommentEvent event;
 				event.lineNumber = lineNumber;
 				event.eventNumber = ++eventNumber;
+				get();
+				readSpacesAndTabs();
 				event.comment = readUntil('\n', &found);
+				event.comment = unescape(event.comment, UnescapingProcessor::TextType::Comment);
 				for (INIContentHandler* handler : handlers) handler->comment(event);
 			} else {
-				std::string fullKey = readToken('=', &quote, &found);
+				INIContentHandler::EntryEvent event;
+				event.lineNumber = lineNumber;
+				event.eventNumber = ++eventNumber;
+
+				std::string fullKey = readToken(keyValueSeparators, &quote, &found);
 				if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'");
 				if (!quote) fullKey = trim(fullKey);
 				readSpacesAndTabs();
 
 				if (quote) {
 					ch = get();
-					if (ch == '=') readSpacesAndTabs();
+					if (oneOf(ch, keyValueSeparators)) readSpacesAndTabs();
 					else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'");
 				}
 
 				std::string value = readToken('\n', &quote, &found);
 				if (!quote) value = trim(value);
 
-				INIContentHandler::EntryEvent event;
-				event.lineNumber = lineNumber;
-				event.eventNumber = ++eventNumber;
 				event.key = fullKey;
 				event.fullKey = fullKey;
 				event.value = value;
@@ -278,9 +418,14 @@
 						event.key = match[1];
 						event.subKey = match[2];
 						event.fullKey = fullKey;
+						event.subKey = unescape(event.subKey, UnescapingProcessor::TextType::EntryKey);
 					}
 				}
 
+				event.key = unescape(event.key, UnescapingProcessor::TextType::EntryKey);
+				event.fullKey = unescape(event.fullKey, UnescapingProcessor::TextType::EntryKey);
+				event.value = unescape(event.value, UnescapingProcessor::TextType::EntryValue);
+
 				if (quote) {
 					readSpacesAndTabs();
 					ch = peek();
@@ -288,9 +433,15 @@
 						get();
 						readSpacesAndTabs();
 						event.comment = readUntil('\n', &found);
+						event.comment = unescape(event.comment, UnescapingProcessor::TextType::EntryComment);
 					} else if (ch == '\n') {
 						get();
 					} else {
+						// TODO: optional support for multiple tokens in a single entry?
+						// modes: array, concatenate
+						// some-array-1 = "item 1" "item 2" 'item 3' item 4
+						// some-array-2 = "item 1" "item 2" 'item 3' item_4 item_5
+						// some-bash-style-string-value = "this "will' be' concatenated → this will be concatenated
 						throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'");
 					}
 				}
--- a/src/lib/INIReader.h	Thu Nov 26 11:42:26 2020 +0100
+++ b/src/lib/INIReader.h	Sat Nov 28 18:14:15 2020 +0100
@@ -20,6 +20,7 @@
 #include <istream>
 
 #include "INIContentHandler.h"
+#include "UnescapingProcessor.h"
 
 namespace relpipe {
 namespace in {
@@ -32,7 +33,20 @@
 class INIReader {
 public:
 	virtual ~INIReader() = default;
+	/**
+	 * TODO: after moving to alt2xml:
+	 *        - option will be identified by globally unique URI/IRI
+	 *        - parsers will provide catalog of supported options (names, enum values, documentation)
+	 *        - options serves as both XML parser features and properties and are mapped to them
+	 */
+	virtual void setOption(const std::string& uri, const std::string& value) = 0;
+	/**
+	 * TODO: after moving to alt2xml:
+	 *        - this will be generic handler for SAX event
+	 *        - but both sides will know the schema (allowed elements and attributes for INI events)
+	 */
 	virtual void addHandler(INIContentHandler* handler) = 0;
+	virtual void addUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enabledByDefault) = 0;
 	virtual void process() = 0;
 	static INIReader* create(std::istream& input);
 };
--- a/src/lib/JavaPropertiesUnescapingINIHandler.h	Thu Nov 26 11:42:26 2020 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,105 +0,0 @@
-/**
- * Relational pipes
- * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-#pragma once
-
-#include <sstream>
-#include <codecvt>
-#include <arpa/inet.h>
-
-#include "UnescapingINIHandler.h"
-
-using namespace std;
-using namespace relpipe::writer;
-
-namespace relpipe {
-namespace in {
-namespace ini {
-namespace lib {
-
-class JavaPropertiesUnescapingINIContentHandler : public UnescapingINIContentHandler {
-private:
-	wstring_convert < codecvt_utf8<wchar_t>> convertor; // INI parser works with UTF-8
-
-	bool readHex(const char* hexadecimal, size_t hexLength, uint8_t* resultBuffer, size_t binLength) {
-		if (hexLength != binLength * 2) return false;
-
-		for (size_t i = 0; i < binLength; i++) {
-			uint8_t value = 0;
-			char a = hexadecimal[i * 2];
-			char b = hexadecimal[i * 2 + 1];
-
-			if (a >= '0' && a <= '9') value += (a - '0')*16;
-			else if (a >= 'a' && a <= 'f') value += (a - 'a' + 10)*16;
-			else if (a >= 'A' && a <= 'F') value += (a - 'A' + 10)*16;
-			else return false;
-
-			if (b >= '0' && b <= '9') value += b - '0';
-			else if (b >= 'a' && b <= 'f') value += b - 'a' + 10;
-			else if (b >= 'A' && b <= 'F') value += b - 'A' + 10;
-			else return false;
-
-			if (resultBuffer) resultBuffer[i] = value;
-		}
-		return true;
-	}
-
-protected:
-
-	virtual std::string unescape(const std::string& s) {
-		std::stringstream result;
-		for (int i = 0, length = s.size(); i < length; i++) {
-			char ch = s[i];
-			if (i + 1 < length && ch == ESC) {
-				ch = s[i + 1];
-				if (ch == 'u') {
-					// TODO: simplify, clean-up, verify (but seems working)
-					i++;
-					int hexLength = 4;
-					if (i + hexLength < length) {
-						uint16_t u16;
-						bool hexOK = readHex(s.c_str() + i + 1, hexLength, (uint8_t*) & u16, sizeof (u16));
-						if (hexOK) result << convertor.to_bytes(ntohs(u16));
-						else throw std::logic_error(std::string("Invalid unicode escape sequence: invalid HEX"));
-						i += hexLength;
-					} else {
-						throw std::logic_error(std::string("Invalid unicode escape sequence: missing characters"));
-					}
-
-				} else if (ch == ESC && !lastEscaphingPhase) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle
-				else if (ch == ESC && lastEscaphingPhase) put(result, ESC, i); // unescape \\ to \.
-				else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch);
-				else result.put(ESC); // keep the escape sequence for later unescaping phase
-			} else if (ch == ESC) {
-				throw std::logic_error(std::string("Missing escape sequence")); // this should not happen
-			} else {
-				result.put(ch);
-			}
-		}
-		return result.str();
-	}
-
-public:
-
-	JavaPropertiesUnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : UnescapingINIContentHandler(output, lastEscaphingPhase, true) {
-	}
-
-};
-
-}
-}
-}
-}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/JavaPropertiesUnescapingProcessor.h	Sat Nov 28 18:14:15 2020 +0100
@@ -0,0 +1,101 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+#include <codecvt>
+#include <arpa/inet.h>
+
+#include "UnescapingProcessor.h"
+
+using namespace std;
+using namespace relpipe::writer;
+
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
+/**
+ * Replaces Java-style unicode escapes (\uXXXX) with UTF-8 encoded characters and keeps all other escape sequences for later unescaping phases.
+ * Should work according to <https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.3> 3.3. Unicode Escapes
+ * Two consecutive escapes forming a surrogate pair (e.g. \uD83D\uDE00) give one character; an unpaired surrogate is rejected.
+ */
+class JavaPropertiesUnescapingProcessor : public UnescapingProcessor {
+private:
+	/** @return the 16-bit value of the four hexadecimal digits s[pos] .. s[pos + 3]; throws std::logic_error if they are missing or invalid */
+	static char32_t readCodeUnit(const std::string& s, size_t pos) {
+		if (pos + 4 > s.size()) throw std::logic_error(std::string("Invalid unicode escape sequence: missing characters"));
+		char32_t unit = 0;
+		for (size_t i = pos; i < pos + 4; i++) {
+			const char c = s[i];
+			if (c >= '0' && c <= '9') unit = unit * 16 + (c - '0');
+			else if (c >= 'a' && c <= 'f') unit = unit * 16 + (c - 'a' + 10);
+			else if (c >= 'A' && c <= 'F') unit = unit * 16 + (c - 'A' + 10);
+			else throw std::logic_error(std::string("Invalid unicode escape sequence: invalid HEX"));
+		}
+		return unit;
+	}
+
+	/** Appends the code point cp (at most 0x10FFFF) to result, encoded as UTF-8 - the INI parser works with UTF-8. */
+	static void putUTF8(std::stringstream& result, char32_t cp) {
+		const int tail = cp < 0x80 ? 0 : cp < 0x800 ? 1 : cp < 0x10000 ? 2 : 3; // number of continuation bytes
+		const char32_t lead = tail == 0 ? 0x00 : tail == 1 ? 0xC0 : tail == 2 ? 0xE0 : 0xF0; // marker bits of the first byte
+		result.put(static_cast<char>(lead | (cp >> (6 * tail))));
+		for (int k = tail - 1; k >= 0; k--) result.put(static_cast<char>(0x80 | ((cp >> (6 * k)) & 0x3F)));
+	}
+
+public:
+	/**
+	 * @param s text that may contain unicode escapes
+	 * @param type not used: all kinds of text are processed the same way
+	 * @return s with the unicode escapes resolved; a doubled backslash and other escape sequences are copied unchanged
+	 * @throws std::logic_error if a unicode escape is malformed or s ends with a single backslash
+	 */
+	std::string unescape(const std::string& s, const TextType type) override {
+		std::stringstream result;
+		for (int i = 0, length = s.size(); i < length; i++) {
+			const char ch = s[i];
+			if (ch != ESC) {
+				result.put(ch);
+			} else if (i + 1 >= length) {
+				throw std::logic_error(std::string("Missing escape sequence")); // this should not happen
+			} else if (s[i + 1] == 'u') {
+				char32_t codePoint = readCodeUnit(s, i + 2);
+				i += 5; // i is now at the last hexadecimal digit; the loop header moves it behind the escape
+				if (codePoint >= 0xD800 && codePoint <= 0xDBFF) { // high surrogate: must be followed by an escaped low surrogate
+					if (i + 2 >= length || s[i + 1] != ESC || s[i + 2] != 'u') throw std::logic_error(std::string("Invalid unicode escape sequence: unpaired surrogate"));
+					const char32_t low = readCodeUnit(s, i + 3);
+					if (low < 0xDC00 || low > 0xDFFF) throw std::logic_error(std::string("Invalid unicode escape sequence: unpaired surrogate"));
+					codePoint = 0x10000 + ((codePoint - 0xD800) << 10) + (low - 0xDC00);
+					i += 6; // skip also the second escape of the pair
+				} else if (codePoint >= 0xDC00 && codePoint <= 0xDFFF) throw std::logic_error(std::string("Invalid unicode escape sequence: unpaired surrogate"));
+				putUTF8(result, codePoint);
+			} else if (s[i + 1] == ESC) {
+				put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle
+			} else {
+				result.put(ESC); // keep the escape sequence for later unescaping phase
+			}
+		}
+		return result.str();
+	}
+};
+
+}
+}
+}
+}
--- a/src/lib/UnescapingINIHandler.h	Thu Nov 26 11:42:26 2020 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,108 +0,0 @@
-/**
- * Relational pipes
- * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-#pragma once
-
-#include <sstream>
-
-#include "INIReader.h"
-
-using namespace std;
-using namespace relpipe::writer;
-
-namespace relpipe {
-namespace in {
-namespace ini {
-namespace lib {
-
-class UnescapingINIContentHandler : public INIContentHandler {
-private:
-	INIContentHandler& output;
-	bool unescapeComments;
-
-protected:
-	const char ESC = '\\';
-	bool lastEscaphingPhase;
-
-	std::stringstream& put(std::stringstream& result, const char& ch, int& i) {
-		result.put(ch);
-		i++;
-		return result;
-	}
-
-	virtual std::string unescape(const std::string& s) = 0;
-
-public:
-
-	/**
-	 * @param output here will be sent events with unescaped values
-	 * @param lastEscaphingPhase instances of UnescapingINIContentHandler might be chained:
-	 * unsupported escaping sequences are kept untouched to be processed in further phases;
-	 * in the last phase, all remaining sequences (including \\) must be recognized and unescaped
-	 * (otherwise the input is considered invalid and an exception is thrown)
-	 */
-	UnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase, bool unescapeComments = false) : output(output), lastEscaphingPhase(lastEscaphingPhase), unescapeComments(unescapeComments) {
-	}
-
-	void startDocument() override {
-		output.startDocument();
-	}
-
-	void endDocument() override {
-		output.endDocument();
-	}
-
-	void startSection(const SectionStartEvent& event) override {
-		SectionStartEvent e = event;
-		e.name = unescape(e.name);
-		if (unescapeComments) e.comment = unescape(e.comment);
-		output.startSection(e);
-	}
-
-	void endSection() override {
-		output.endSection();
-	}
-
-	void entry(const EntryEvent& event) override {
-		EntryEvent e = event;
-		e.key = unescape(e.key);
-		e.fullKey = unescape(e.fullKey);
-		e.subKey = unescape(e.subKey);
-		e.value = unescape(e.value);
-		if (unescapeComments) e.comment = unescape(e.comment);
-		output.entry(e);
-	}
-
-	void comment(const CommentEvent& event) override {
-		if (unescapeComments) {
-			CommentEvent e = event;
-			e.comment = unescape(e.comment);
-			output.comment(e);
-		} else {
-			output.comment(event);
-		}
-	}
-
-	void whitespace(const WhitespaceEvent& event) override {
-		output.whitespace(event);
-	}
-
-};
-
-}
-}
-}
-}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/UnescapingProcessor.h	Sat Nov 28 18:14:15 2020 +0100
@@ -0,0 +1,60 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+
+#include "INIReader.h"
+
+using namespace std;
+
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
+/** Interface of one unescaping phase: implementations replace the escape sequences they support with the characters they stand for. */
+class UnescapingProcessor {
+protected:
+	const char ESC = '\\'; // the backslash that introduces an escape sequence
+	/** Writes ch to result and increments i (the caller's position in its input); returns result to allow chaining. */
+	std::stringstream& put(std::stringstream& result, const char& ch, int& i) {
+		result.put(ch);
+		i++;
+		return result;
+	}
+
+public:
+	/** Tells unescape() which part of the INI document the given text comes from. */
+	enum class TextType {
+		SectionName, SectionComment, SectionTag, EntryKey, EntryValue, EntryComment, Comment,
+	};
+
+	/** Virtual, so that destroying a processor through a pointer to this base class also runs the destructor of the implementation. */
+	virtual ~UnescapingProcessor() = default;
+	/**
+	 * @param s text that may contain escape sequences
+	 * @param type kind of the text s
+	 * @return the processed text
+	 */
+	virtual std::string unescape(const std::string& s, const TextType type) = 0;
+};
+
+}
+}
+}
+}