src/lib/INIReader.cpp
branchv_0
changeset 26 80e129ec3408
parent 25 ee70b17950bd
child 28 0e7c57d48d1e
--- a/src/lib/INIReader.cpp	Mon Nov 23 21:09:46 2020 +0100
+++ b/src/lib/INIReader.cpp	Wed Nov 25 21:50:26 2020 +0100
@@ -17,13 +17,169 @@
 
 #include <vector>
 #include <regex>
+#include <sstream>
+#include <stdexcept>
 
 #include "INIReader.h"
 
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
 class INIReaderImpl : public INIReader {
 private:
 	std::istream& input;
 	std::vector<INIContentHandler*> handlers;
+
+	/** 
+	 * This might be configurable.
+	 * 
+	 * By default, we ignore all leading whitespace on continuing lines.
+	 * If there should be some spaces or tabs, they should be placed on the previous line before the „\“.
+	 * If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '.
+	 * 
+	 * Related specifications:
+	 *  - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html
+	 */
+	bool consumeLeadingSpacesOnContinuingLines = true;
+
+	/**
+	 * This might be configurable.
+	 * 
+	 * KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
+	 * Line „[section_1][$i]“ means that the „section_1“ is „locked“.
+	 * We may emit this information somehow later, but for now, it is just ignored.
+	 * 
+	 * TODO: Is „section tag“ right name?
+	 * 
+	 * Related specifications:
+	 *  - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Lock_Down
+	 */
+	bool allowSectionTags = true;
+
+	/**
+	 * This might be configurable.
+	 * 
+	 * If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key.
+	 * No \[ escaping is currently supported, so the key might not contain the bracket character.
+	 * 
+	 * Related specifications:
+	 *  - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Shell_Expansion
+	 *  - https://specifications.freedesktop.org/desktop-entry-spec/latest/ar01s05.html
+	 */
+	bool allowSubKeys = true;
+
+	int lineNumber = 1; // current line of the input, starting at 1; get() advances it on every \n it returns
+	int eventNumber = 0; // number of the last emitted event; process() increments it before filling each event
+
+	/**
+	 * Should be always used instead of input.peek().
+	 * Returns the next character without consuming it; carriage returns in front of it are consumed and dropped.
+	 * NOTE(review): the result is narrowed to char, so the end of the input looks the same as the byte 0xFF.
+	 */
+	char peek() {
+		// In 2020 there is no need to manually return the carriage. However some legacy systems still do it.
+		// Drop all of them, not only the first one: get() skips any number of \r, so peek() has to agree with it.
+		while (input.peek() == '\r') {
+			input.get();
+		}
+		return static_cast<char>(input.peek());
+	}
+
+	/**
+	 * Should be always used instead of input.get().
+	 * Consumes and returns one character; any \r in front of it is skipped and every returned \n advances lineNumber.
+	 */
+	char get() {
+		char current = input.get();
+		while (current == '\r') current = input.get();
+		if (current == '\n') ++lineNumber;
+		return current;
+	}
+
+	std::string readSpacesAndTabs() { // consumes the run of spaces and tabs at the current position and returns it
+		std::string blanks;
+		for (char next = peek(); input.good() && (next == ' ' || next == '\t'); next = peek()) blanks.push_back(get());
+		return blanks;
+	}
+
+	std::string readAllWhitespace() { // consumes the run of spaces, tabs, \n and \r at the current position and returns what get() delivered
+		std::string blanks;
+		for (char next = peek(); input.good() && (next == ' ' || next == '\t' || next == '\n' || next == '\r'); next = peek()) blanks.push_back(get());
+		return blanks;
+	}
+
+	void processContinuingLine(std::stringstream& result) { // called by readUntil() right after an escaped line end (backslash + \n)
+		if (!consumeLeadingSpacesOnContinuingLines) result.put('\n'); // keep the line break in the value
+		else readSpacesAndTabs(); // default: join the lines and drop the indentation of the continuing one
+	}
+
+	/**
+	 * Reads the text up to the first unescaped terminator; the terminator itself is consumed, but not returned.
+	 * A backslash before the terminator yields the bare terminator (or a continuing line, if the terminator is \n); other escapes are kept untouched.
+	 * @param found optional output: true if the terminator was reached, false if the input ended first
+	 */
+	std::string readUntil(char until, bool* found = nullptr) {
+		std::stringstream result;
+		for (char ch = peek(); input.good() && ch != until; ch = peek()) {
+			if (ch == '\\') {
+				get();
+				ch = get();
+				// Ask the stream whether the input ended: a char cannot be reliably compared with traits_type::eof() (an int).
+				if (!input.good()) break;
+				else if (ch == until && ch == '\n') processContinuingLine(result);
+				else if (ch == until) result.put(ch);
+				else result.put('\\').put(ch);
+				// TODO: two-stage and modular unescaping: here unescape only \+LF or more generally: unescape only the until character and rest leave untouched
+				// second escaping stage move to separate class/wrapper (similar to hierarchical wrappers)
+			} else {
+				result.put(get());
+			}
+		}
+
+		const bool terminated = peek() == until;
+		if (terminated) get();
+		if (found) *found = terminated;
+		return result.str();
+	}
+
+	/**
+	 * Reads one token: a quoted one ends at the quote character that opened it, a plain one at the until character.
+	 * The terminating character is consumed; escaping is handled by readUntil().
+	 * @param quote optional output: the quote character of a quoted token, 0 for a plain one
+	 * @param found optional output: whether the terminating character was reached
+	 */
+	std::string readToken(char until, char* quote = nullptr, bool* found = nullptr) {
+		const char first = peek();
+		const bool quoted = isQuote(first);
+		if (quote) *quote = quoted ? first : '\0';
+		// get() consumes the opening quote and returns it, so the same character becomes the terminator.
+		const char terminator = quoted ? get() : until;
+		return readUntil(terminator, found);
+	}
+
+	std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) { // reads a token; after a quoted one, the until character is required and consumed
+		char usedQuote = 0; // quote is optional (defaults to nullptr), so the decisions below must not dereference it
+		std::string result = readToken(until, &usedQuote, found);
+		if (quote) *quote = usedQuote;
+		if (usedQuote) readAllWhitespace(); // a quoted token ends at its closing quote; the terminator follows, possibly after whitespace
+		if (usedQuote && get() != until) throw std::logic_error(std::string("missing „") + std::string(1, until) + "“ after quoted section name");
+		return result;
+	}
+
+	bool isComment(char ch) { // true for the characters that start a comment: # and ;
+		switch (ch) { case '#': case ';': return true; default: return false; }
+	}
+
+	bool isQuote(char ch) { // true for the characters that may enclose a token: " and '
+		switch (ch) { case '"': case '\'': return true; default: return false; }
+	}
+
+	std::string trim(std::string s) { // returns s without its leading and trailing whitespace
+		static const std::regex edgeWhitespace("^\\s+|\\s+$"); // compiled once instead of on every call (trim runs for each plain key and value)
+		return std::regex_replace(s, edgeWhitespace, "");
+	}
+
 public:
 
 	INIReaderImpl(std::istream& input) : input(input) {
@@ -34,124 +190,144 @@
 	}
 
 	void process() override {
-
 		for (INIContentHandler* handler : handlers) handler->startDocument();
 
-		std::regex whitespacePattrern("\\s*");
-		std::regex commentPattrern("\\s*(;|#)\\s*(.*)");
-		std::regex sectionPattrern("\\s*\\[\\s*([^\\]]+)\\s*\\]\\s*(\\[\\s*([^\\]]+)\\s*\\])?\\s*((;|#)\\s*(.*))?");
-		std::regex entryQuotedPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(\"|')((?:(?!\\5).)*)(\\5)?\\s*((;|#)\\s*(.*))?");
-		std::regex entryPlainPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(.*?)\\s*");
+		bool inSection = false;
+		// Streams the document to the handlers: each iteration emits at most one whitespace event and then at most one section, comment or entry event.
+		while (input.good()) { // TODO: condition
+			const int whitespaceLineNumber = lineNumber; // report the line where the whitespace starts, not the line where the reader stopped
+			std::string whitespace = readAllWhitespace();
+			if (whitespace.size()) {
+				INIContentHandler::WhitespaceEvent event;
+				event.lineNumber = whitespaceLineNumber;
+				event.eventNumber = ++eventNumber;
+				event.whitespace = whitespace;
+				for (INIContentHandler* handler : handlers) handler->whitespace(event);
+			}
 
-		std::smatch match;
-		bool inSection = false;
-		std::string line;
-		int lineNumber = 0;
-		int eventNumber = 0;
+			bool found = false;
+			char quote = 0;
+			const int startLineNumber = lineNumber; // the line on which the section, comment or entry read below starts
+			char ch = peek();
 
-
-		while (std::getline(input, line)) {
-			lineNumber++;
-
-			if (std::regex_match(line, match, whitespacePattrern)) {
-				INIContentHandler::WhitespaceEvent event;
+			if (!input.good()) { // ask the stream: a char cannot be reliably compared with traits_type::eof() (an int)
+				break;
+			} else if (ch == '[') {
+				if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
+				inSection = true;
+				get();
+				readAllWhitespace();
+				INIContentHandler::SectionStartEvent event;
 				event.lineNumber = lineNumber;
 				event.eventNumber = ++eventNumber;
-				event.whitespace = match[0];
-				for (INIContentHandler* handler : handlers) handler->whitespace(event);
-			} else if (std::regex_match(line, match, commentPattrern)) {
+				event.name = readTokenAndEatTerminator(']', &quote, &found);
+				if (!found) throw std::logic_error(std::string("missing ] after the section name: '") + event.name + "'");
+				readSpacesAndTabs();
+				if (allowSectionTags && peek() == '[') {
+					get();
+					event.tag = readTokenAndEatTerminator(']', &quote, &found);
+					if (!found) throw std::logic_error(std::string("missing ] after the tag of the section: '") + event.name + "'");
+				}
+				// The rest of the line: an optional comment, then the line end or the end of the input.
+				readSpacesAndTabs();
+				ch = peek();
+				if (isComment(ch)) {
+					get();
+					readSpacesAndTabs();
+					event.comment = readUntil('\n', &found);
+				} else if (ch == '\n') {
+					get();
+				} else if (input.good()) { // the end of the input is fine here: the last line does not have to be terminated
+					throw std::logic_error(std::string("unexpected content after the section: '") + event.name + "'");
+				}
+
+				for (INIContentHandler* handler : handlers) handler->startSection(event);
+			} else if (isComment(ch)) {
+				get();
+				readSpacesAndTabs();
 				INIContentHandler::CommentEvent event;
 				event.lineNumber = lineNumber;
 				event.eventNumber = ++eventNumber;
-				event.comment = match[2];
+				event.comment = readUntil('\n', &found);
 				for (INIContentHandler* handler : handlers) handler->comment(event);
-			} else if (std::regex_match(line, match, sectionPattrern)) {
-				if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
-				inSection = true;
-				INIContentHandler::SectionStartEvent event;
-				event.lineNumber = lineNumber;
-				event.eventNumber = ++eventNumber;
-				event.name = match[1];
-				event.comment = match[6];
-				// event.tag = match[3];
-				// KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
-				// see <https://userbase.kde.org/KDE_System_Administration/Configuration_Files>, „[$i]“ means that the section is „locked“
-				// We may emit this information somehow later, but for now, it is just ignored.
-				for (INIContentHandler* handler : handlers) handler->startSection(event);
-			} else if (std::regex_match(line, match, entryQuotedPattrern)) {
+			} else {
+				std::string fullKey = readToken('=', &quote, &found);
+				if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'");
+				if (!quote) fullKey = trim(fullKey);
+				readSpacesAndTabs();
+				// A plain key ends at the = (already consumed); a quoted key ends at its closing quote, so the = has to follow.
+				if (quote) {
+					ch = get();
+					if (ch == '=') readSpacesAndTabs();
+					else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'");
+				}
+
+				std::string value = readToken('\n', &quote, &found);
+				if (quote && !found) throw std::logic_error(std::string("missing closing quote of the value: key='") + fullKey + "'");
+				if (!quote) value = trim(value);
 				INIContentHandler::EntryEvent event;
-				event.lineNumber = lineNumber;
+				event.lineNumber = startLineNumber; // not lineNumber: the line end(s) of the value have already been consumed and counted
 				event.eventNumber = ++eventNumber;
-				event.key = match[2];
-				event.subKey = match[4];
-				event.fullKey = match[1];
-				event.value = match[6];
-				event.comment = match[10];
+				event.key = fullKey;
+				event.fullKey = fullKey;
+				event.value = value;
 
-				// the "/' at the end is missing → line continues
-				if (match.length(7) == 0) {
-					std::regex endPattern(std::string("(.*?)") + (match[5] == "'" ? "'" : "\"") + "\\s*((;|#)\\s*(.*))?");
-					while (std::getline(input, line)) {
-						lineNumber++;
-						event.value += "\n";
-						if (std::regex_match(line, match, endPattern)) {
-							event.value += std::string(match[1]);
-							event.comment = match[4];
-							break;
-						} else {
-							event.value += line;
-						}
+				if (allowSubKeys) {
+					std::smatch match;
+					if (std::regex_match(fullKey, match, std::regex("([^\\[]+)\\[([^\\[]+)\\]"))) { // key[subKey]; fullKey is already set above
+						event.key = match[1];
+						event.subKey = match[2];
+					}
+				}
+				// A plain value has consumed the rest of its line; only a quoted value may be followed by a comment.
+				// As after a section, the end of the input is accepted instead of the line end.
+				if (quote) {
+					readSpacesAndTabs();
+					ch = peek();
+					if (isComment(ch)) {
+						get();
+						readSpacesAndTabs();
+						event.comment = readUntil('\n', &found);
+					} else if (ch == '\n') {
+						get();
+					} else if (input.good()) {
+						throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'");
 					}
 				}
 
 				for (INIContentHandler* handler : handlers) handler->entry(event);
-			} else if (std::regex_match(line, match, entryPlainPattrern)) {
-				INIContentHandler::EntryEvent event;
-				event.lineNumber = lineNumber;
-				event.eventNumber = ++eventNumber;
-				event.key = match[2];
-				event.subKey = match[4];
-				event.fullKey = match[1];
-				event.value = match[5];
-
-				// the \ at the end → line continues
-				while (line.back() == '\\' && std::getline(input, line)) {
-					lineNumber++;
-					line = std::regex_replace(line, std::regex("^\\s+|\\s+$"), ""); // trim the spaces: continuing lines might be aligned to the first line (desired spaces – if any – should be at the line end before the \ character)
-					event.value = event.value.substr(0, event.value.size() - 1); // cut the trailing \ backslash
-					event.value = event.value + line;
-				}
-
-				for (INIContentHandler* handler : handlers) handler->entry(event);
-			} else {
-				// TODO: warning, error, or support unknown content
 			}
-
-			// General feautres:
-			// TODO: probably switch to state-machine approach instead of regular expressions or use an existing library
-			// TODO: warning/error handler
-			// TODO: support also quoted or multiline keys?
-			// TODO: support also escaped characters
-			// TODO: support also Java .properties and manifest.mf formats?
-			// TODO: support also quoted sections ["qoted section"] – useful for hierarchy (the path element may contain the separator character)
-			// TODO: support also nested sections – hierarchy
-			// TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections
-			// TODO: support also option for alternative key-value separator (: instead of =)
-			// TODO: support also other encodings (currently only UTF-8 is supported)
-
-			// Lossless conversions:
-			// TODO: emit also the quote style ('/"/)
-			// TODO: emit also the comment style (;/#) ?
-			// TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ?
-			// TODO: emit also the line-end type (LF/CRLF) ?
 		}
-
+		// TODO: error at the end, catch premature/unexpected EOF
+		// TODO: unescape + trim values + ignore \r
+		// TODO: count lines
 		if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
-
 		for (INIContentHandler* handler : handlers) handler->endDocument();
 	}
+
+	// General features:
+	// TODO: warning/error handler
+	// TODO: support also escaped characters
+	// TODO: support also Java .properties and manifest.mf formats?
+	// TODO: support also nested sections – hierarchy
+	// TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections
+	// TODO: support also option for alternative key-value separator (: instead of =)
+	// TODO: support also other encodings (currently only UTF-8 is supported)
+	// TODO: better exceptions
+
+	// Lossless conversions:
+	// TODO: emit also the quote style ('/"/)
+	// TODO: emit also the comment style (;/#) ?
+	// TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ?
+	// TODO: emit also the line-end type (LF/CRLF) ?
+
 };
 
 INIReader* INIReader::create(std::istream& input) {
 	return new INIReaderImpl(input);
 }
+
+}
+}
+}
+}