src/lib/INIReader.cpp
branchv_0
changeset 28 0e7c57d48d1e
parent 26 80e129ec3408
child 29 06aaad12c207
--- a/src/lib/INIReader.cpp	Thu Nov 26 11:42:26 2020 +0100
+++ b/src/lib/INIReader.cpp	Sat Nov 28 18:14:15 2020 +0100
@@ -32,21 +32,47 @@
 	std::istream& input;
 	std::vector<INIContentHandler*> handlers;
 
+	/** Pairs an unescaping processor with the option URI that toggles it and its on/off flag („enbaled“ [sic] is referenced elsewhere in the reader, so the spelling is kept). */
+	struct ConfiguredUnescapingProcessor {
+		std::shared_ptr<UnescapingProcessor> processor;
+		const std::string uri;
+		bool enbaled;
+
+		ConfiguredUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enbaled)
+		: processor(processor), uri(uri), enbaled(enbaled) {
+		}
+	};
+
+	std::vector<ConfiguredUnescapingProcessor> unescapingProcessors;
+
 	/** 
-	 * This might be configurable.
-	 * 
 	 * By default, we ignore all leading whitespace on continuing lines.
 	 * If there should be some spaces or tabs, they should be placed on the previous line before the „\“.
 	 * If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '.
 	 * 
+	 * TODO: several options:
+	 *  - enabled, disabled
+	 *  - if disabled, then: keep backslash, trim backslash, escape backslash
+	 *    (keep requires support in some further unescaping phase, or it will cause an error)
+	 *  - keep or trim the line end
+	 *  - keep or trim the leading spaces
+	 *  - allow comments interleaved with continuing lines (the freaky systemd syntax)
+	 * 
 	 * Related specifications:
 	 *  - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html
+	 *  - https://www.freedesktop.org/software/systemd/man/systemd.syntax.html
 	 */
-	bool consumeLeadingSpacesOnContinuingLines = true;
+	bool trimLeadingSpacesOnContinuingLines = true;
+
 
 	/**
-	 * This might be configurable.
-	 * 
+	 * Some dialects, or configuration files in general, do not support sections.
+	 * Then a line that looks like an INI section should be interpreted as a key
+	 * (or as an error, if it does not have a proper key-value separator).
+	 */
+	bool allowSections = true;
+
+	/**
 	 * KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
 	 * Line „[section_1][$i]“ means that the „section_1“ is „locked“.
 	 * We may emit this information somehow later, but for now, it is just ignored.
@@ -59,8 +85,6 @@
 	bool allowSectionTags = true;
 
 	/**
-	 * This might be configurable.
-	 * 
 	 * If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key.
 	 * No \[ escaping is currently supported, so the key might not contain the bracket character.
 	 * 
@@ -70,6 +94,35 @@
 	 */
 	bool allowSubKeys = true;
 
+	/**
+	 * Classic INI uses „key=value“ syntax.
+	 * But some other formats/dialects might use key:value.
+	 * 
+	 * Only single character separators are supported.
+	 * If multiple separators should be recognized (e.g. both „=“ and „:“), this string will contain all of them,
+	 * i.e. „:=“ does not mean the „key:=value“ syntax, but either „key=value“ or „key:value“.
+	 */
+	std::string keyValueSeparators = "=";
+
+	/**
+	 * Classic INI uses „; comment“ syntax.
+	 * But many existing files contain „# comment“ lines.
+	 * 
+	 * Only single character separators are supported (works same as keyValueSeparators).
+	 */
+	std::string commentSeparators = ";#";
+
+	/**
+	 * INI files often support both the "quotes" and the 'apostrophes' style.
+	 * But some dialects may support only one of them or not support quoting at all.
+	 * 
+	 * In such a case, e.g. „key="some value"“ would mean that the value is „"some value"“ (including the quotes).
+	 * Thus it is important to allow disabling quote recognition (which is done by setting this parameter to an empty string).
+	 * 
+	 * Only single character quotes are supported (works same as keyValueSeparators).
+	 */
+	std::string quotes = "\"'";
+
 	int lineNumber = 1;
 	int eventNumber = 0;
 
@@ -111,30 +164,35 @@
 	}
 
 	void processContinuingLine(std::stringstream& result) {
-		if (consumeLeadingSpacesOnContinuingLines) readSpacesAndTabs();
+		if (trimLeadingSpacesOnContinuingLines) readSpacesAndTabs();
 		else result.put('\n');
 	}
 
-	std::string readUntil(char until, bool* found = nullptr) {
+	std::string readUntil(const char until, bool* found = nullptr) {
+		return readUntil(std::string(1, until), found);
+	}
+
+	std::string readUntil(const std::string& until, bool* found = nullptr) {
 		std::stringstream result;
 
-		for (char ch = peek(); input.good() && ch != until; ch = peek()) {
+		for (char ch = peek(); input.good() && !oneOf(ch, until); ch = peek()) {
 			if (ch == '\\') {
 				get();
 				ch = get();
-				if (ch == until && ch == '\n') processContinuingLine(result);
-				else if (ch == until) result.put(ch);
+				if (oneOf(ch, until) && ch == '\n') processContinuingLine(result);
+				else if (oneOf(ch, until)) result.put(ch);
 				else if (ch == std::istream::traits_type::eof()) break;
 				else result.put('\\').put(ch);
-				// TODO: two-stage and modular unescaping: here unescape only \+LF or more genereally: unescape only the until character and rest leave untouched
-				// second escaping stage move to separate class/wrapper (similar to hierarchical wrappers)
+				// unescaping is done in two phases:
+				// here we unescape only a backslash followed by one of the terminator characters (e.g. backslash + LF = continuing line)
+				// other escape sequences are left untouched and will be processed in later phases, see UnescapingINIHandler
 			} else {
 				ch = get();
 				result.put(ch);
 			}
 		}
 
-		if (peek() == until) {
+		if (oneOf(peek(), until)) {
 			get();
 			if (found) *found = true;
 		} else {
@@ -144,13 +202,17 @@
 		return result.str();
 	}
 
-	std::string readToken(char until, char* quote = nullptr, bool* found = nullptr) {
+	std::string readToken(const char until, char* quote = nullptr, bool* found = nullptr) {
+		return readToken(std::string(1, until), quote, found);
+	}
+
+	std::string readToken(const std::string& until, char* quote = nullptr, bool* found = nullptr) {
 		std::string result;
 
 		char ch = peek();
 		if (isQuote(ch)) {
 			if (quote) *quote = ch;
-			result = readUntil(get(), found);
+			result = readUntil(std::string(1, get()), found);
 		} else {
 			if (quote) *quote = 0;
 			result = readUntil(until, found);
@@ -160,35 +222,107 @@
 	}
 
 	std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) {
+		return readTokenAndEatTerminator(std::string(1, until), quote, found);
+	}
+
+	std::string readTokenAndEatTerminator(const std::string& until, char* quote = nullptr, bool* found = nullptr) {
 		std::string result = readToken(until, quote, found);
 		if (*quote) {
 			readAllWhitespace();
-			if (get() != until) throw std::logic_error(std::string("missing „") + std::string(1, until) + "“ after quoted section name");
+			if (!oneOf(get(), until)) throw std::logic_error(std::string("missing „") + until + "“ after quoted section name");
 		}
 		return result;
 	}
 
+	std::string unescape(const std::string& value, UnescapingProcessor::TextType type) { // passes value through every enabled processor, in registration order; type is forwarded to each processor
+		std::string result = value;
+		for (const ConfiguredUnescapingProcessor& p : unescapingProcessors) if (p.enbaled) result = p.processor->unescape(result, type); // by reference: no per-iteration copy of the shared_ptr and the URI string
+		return result;
+	}
+
 	bool isComment(char ch) {
-		return ch == '#' || ch == ';';
+		return oneOf(ch, commentSeparators);
 	}
 
 	bool isQuote(char ch) {
-		return ch == '"' || ch == '\'';
+		return oneOf(ch, quotes);
+	}
+
+	/**
+	 * @param ch character to be tested
+	 * @param options accepted characters; every character of this string is one option
+	 * @return true if ch occurs in options, false otherwise (always false for empty options)
+	 */
+	bool oneOf(char ch, const std::string& options) {
+		return std::string::npos != options.find_first_of(ch);
 	}
 
 	std::string trim(std::string s) {
 		return std::regex_replace(s, std::regex("^\\s+|\\s+$"), "");
 	}
 
+	/**
+	 * Strict boolean parser: accepts exactly „true“ or „false“, anything else throws std::invalid_argument. TODO: use a common method
+	 */
+	bool parseBoolean(const std::string& value) {
+		const bool isTrue = value == "true";
+		if (!isTrue && value != "false") throw std::invalid_argument(std::string("Unable to parse boolean value: ") + value + " (expecting true or false)");
+		return isTrue;
+	}
+
+	void setDialect(const std::string& name) { // applies a named preset of parser options; throws std::invalid_argument for an unknown name
+		// „default-ini“ changes nothing: the options are expected to still hold their initial values
+		if (name == "default-ini") return;
+		// the only other preset known here is „java-properties“
+		if (name != "java-properties") throw std::invalid_argument(std::string("Unsupported INI dialect: ") + name);
+
+		// java-properties preset: no sections, section tags, sub-keys or quoting; „#“ comments; „=“ or „:“ as separator
+		quotes = "";
+		keyValueSeparators = "=:";
+		commentSeparators = "#";
+		allowSubKeys = false;
+		allowSectionTags = false;
+		allowSections = false;
+		trimLeadingSpacesOnContinuingLines = true;
+		// TODO: enable unicode unescaping
+	}
+
+	bool setUnescaping(const std::string& uri, const std::string& value) { // enables/disables the first processor registered under uri
+		for (auto it = unescapingProcessors.begin(); it != unescapingProcessors.end(); ++it) {
+			if (it->uri != uri) continue;
+			// the value is parsed only on a match, so parseBoolean() can throw only for a known URI
+			it->enbaled = parseBoolean(value);
+			return true;
+		}
+		return false; // no processor is registered under this URI
+	}
+
 public:
 
 	INIReaderImpl(std::istream& input) : input(input) {
 	}
 
+	void setOption(const std::string& uri, const std::string& value) override {
+		// built-in options first; any other URI is offered to the registered unescaping processors and rejected if none of them matches
+		if (uri == "trim-continuing-lines") { trimLeadingSpacesOnContinuingLines = parseBoolean(value); return; } // TODO: continuing lines modes (enum), not just boolean
+		if (uri == "allow-sections") { allowSections = parseBoolean(value); return; }
+		if (uri == "allow-section-tags") { allowSectionTags = parseBoolean(value); return; }
+		if (uri == "allow-sub-keys") { allowSubKeys = parseBoolean(value); return; }
+		if (uri == "comment-separators") { commentSeparators = value; return; }
+		if (uri == "key-value-separators") { keyValueSeparators = value; return; }
+		if (uri == "quotes") { quotes = value; return; }
+		if (uri == "dialect") { setDialect(value); return; }
+		if (!setUnescaping(uri, value)) throw std::invalid_argument(std::string("Invalid parser option: „") + uri + "“ with value: „" + value + "“");
+	}
+
 	void addHandler(INIContentHandler* handler) override {
 		handlers.push_back(handler);
 	}
 
+	void addUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enabledByDefault) override { // registers a processor that setOption() can later toggle through uri
+		unescapingProcessors.push_back(ConfiguredUnescapingProcessor(processor, uri, enabledByDefault));
+	}
+
 	void process() override {
 		for (INIContentHandler* handler : handlers) handler->startDocument();
 
@@ -196,10 +330,10 @@
 
 		while (input.good()) { // TODO: condition
 			{
+				INIContentHandler::WhitespaceEvent event;
+				event.lineNumber = lineNumber;
 				std::string whitespace = readAllWhitespace();
 				if (whitespace.size()) {
-					INIContentHandler::WhitespaceEvent event;
-					event.lineNumber = lineNumber;
 					event.eventNumber = ++eventNumber;
 					event.whitespace = whitespace;
 					for (INIContentHandler* handler : handlers) handler->whitespace(event);
@@ -213,20 +347,23 @@
 
 			if (ch == std::istream::traits_type::eof()) {
 				break;
-			} else if (ch == '[') {
+			} else if (ch == '[' && allowSections) {
 				if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
 				inSection = true;
-				get();
-				readAllWhitespace();
 				INIContentHandler::SectionStartEvent event;
 				event.lineNumber = lineNumber;
 				event.eventNumber = ++eventNumber;
+				get();
+				readAllWhitespace();
 				event.name = readTokenAndEatTerminator(']', &quote, &found);
+				if (!quote) event.name = trim(event.name);
+				event.name = unescape(event.name, UnescapingProcessor::TextType::SectionName);
 
 				readSpacesAndTabs();
 				if (allowSectionTags && peek() == '[') {
 					get();
 					event.tag = readTokenAndEatTerminator(']', &quote, &found);
+					event.tag = unescape(event.tag, UnescapingProcessor::TextType::SectionTag);
 				}
 
 				readSpacesAndTabs();
@@ -235,6 +372,7 @@
 					get();
 					readSpacesAndTabs();
 					event.comment = readUntil('\n', &found);
+					event.comment = unescape(event.comment, UnescapingProcessor::TextType::SectionComment);
 				} else if (ch == '\n') {
 					get();
 				} else {
@@ -243,31 +381,33 @@
 
 				for (INIContentHandler* handler : handlers) handler->startSection(event);
 			} else if (isComment(ch)) {
-				get();
-				readSpacesAndTabs();
 				INIContentHandler::CommentEvent event;
 				event.lineNumber = lineNumber;
 				event.eventNumber = ++eventNumber;
+				get();
+				readSpacesAndTabs();
 				event.comment = readUntil('\n', &found);
+				event.comment = unescape(event.comment, UnescapingProcessor::TextType::Comment);
 				for (INIContentHandler* handler : handlers) handler->comment(event);
 			} else {
-				std::string fullKey = readToken('=', &quote, &found);
+				INIContentHandler::EntryEvent event;
+				event.lineNumber = lineNumber;
+				event.eventNumber = ++eventNumber;
+
+				std::string fullKey = readToken(keyValueSeparators, &quote, &found);
 				if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'");
 				if (!quote) fullKey = trim(fullKey);
 				readSpacesAndTabs();
 
 				if (quote) {
 					ch = get();
-					if (ch == '=') readSpacesAndTabs();
+					if (oneOf(ch, keyValueSeparators)) readSpacesAndTabs();
 					else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'");
 				}
 
 				std::string value = readToken('\n', &quote, &found);
 				if (!quote) value = trim(value);
 
-				INIContentHandler::EntryEvent event;
-				event.lineNumber = lineNumber;
-				event.eventNumber = ++eventNumber;
 				event.key = fullKey;
 				event.fullKey = fullKey;
 				event.value = value;
@@ -278,9 +418,14 @@
 						event.key = match[1];
 						event.subKey = match[2];
 						event.fullKey = fullKey;
+						event.subKey = unescape(event.subKey, UnescapingProcessor::TextType::EntryKey);
 					}
 				}
 
+				event.key = unescape(event.key, UnescapingProcessor::TextType::EntryKey);
+				event.fullKey = unescape(event.fullKey, UnescapingProcessor::TextType::EntryKey);
+				event.value = unescape(event.value, UnescapingProcessor::TextType::EntryValue);
+
 				if (quote) {
 					readSpacesAndTabs();
 					ch = peek();
@@ -288,9 +433,15 @@
 						get();
 						readSpacesAndTabs();
 						event.comment = readUntil('\n', &found);
+						event.comment = unescape(event.comment, UnescapingProcessor::TextType::EntryComment);
 					} else if (ch == '\n') {
 						get();
 					} else {
+						// TODO: optional support for multiple tokens in a single entry?
+						// modes: array, concatenate
+						// some-array-1 = "item 1" "item 2" 'item 3' item 4
+						// some-array-2 = "item 1" "item 2" 'item 3' item_4 item_5
+						// some-bash-style-string-value = "this "will' be' concatenated → this will be concatenated
 						throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'");
 					}
 				}