src/lib/INIReader.cpp
author František Kučera <franta-hg@frantovo.cz>
Fri, 26 Aug 2022 22:41:46 +0200
branchv_0
changeset 39 509e4b02f3c2
parent 35 930f17f16fd7
permissions -rw-r--r--
fix typo: enbaled → enabled (thanks Jiří Wolker for reporting)

/**
 * Relational pipes
 * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <vector>
#include <regex>
#include <sstream>
#include <stdexcept>

#include "INIReader.h"
#include "uri.h"

namespace relpipe {
namespace in {
namespace ini {
namespace lib {

class INIReaderImpl : public INIReader {
private:
	/** Source of the INI text. The parsing code reads it through the peek() and get() wrappers of this class. */
	std::istream& input;
	/** Handlers that receive the parsing events, in the order in which they were added. Stored as raw pointers; nothing in this class deletes them. */
	std::vector<INIContentHandler*> handlers;

	/**
	 * An unescaping processor together with the URI under which it can be switched on/off
	 * and its current on/off state.
	 */
	struct ConfiguredUnescapingProcessor {
		/** the processor that does the actual unescaping */
		std::shared_ptr<UnescapingProcessor> processor;
		/** option URI that identifies this processor */
		const std::string uri;
		/** whether the processor is currently applied */
		bool enabled;

		ConfiguredUnescapingProcessor(std::shared_ptr<UnescapingProcessor> aProcessor, const std::string aUri, bool isEnabled)
		: processor(aProcessor), uri(aUri), enabled(isEnabled) {
		}
	};

	/** Registered unescaping processors; unescape() applies the enabled ones in the order stored here. */
	std::vector<ConfiguredUnescapingProcessor> unescapingProcessors;

	/**
	 * A dialect together with the URI under which it can be selected.
	 */
	struct ConfiguredDialect {
		/** the dialect that configures the reader when applied */
		std::shared_ptr<Dialect> dialect;
		/** URI that identifies this dialect */
		const std::string uri;

		ConfiguredDialect(std::shared_ptr<Dialect> aDialect, const std::string aUri)
		: dialect(aDialect), uri(aUri) {
		}
	};

	/** Registered dialects; setDialect() searches this list by URI. */
	std::vector<ConfiguredDialect> dialects;

	/**
	 * If there is a „\“ backslash at the end of a physical line, the logical line continues on the next physical line.
	 *
	 * Disabling this option makes sense only if we also disable the unescaping processors (unescape-basic, unescape-backspace).
	 * Otherwise they will complain about „Missing escape sequence“ because they got „\“ at the end of the value.
	 */
	bool allowLineContinuationsWithEscaping = true;

	/**
	 * If a line starts with a space, it is continuation of the previous line.
	 * This rule conflicts with default ignorance of such insignificant whitespace and is quite specific to the Java MANIFEST.MF dialect.
	 */
	bool allowLineContinuationsWithSpace = false;

	/** 
	 * By default, we ignore all leading whitespace on continuing lines.
	 * If there should be some spaces or tabs, they should be placed on the previous line before the „\“.
	 * If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '.
	 * 
	 * TODO: several options:
	 *  - enabled, disabled
	 *  - if disabled, then: keep backslash, trim backslash, escape backslash
	 *    (keep requires support in some further unescaping phase, or it will cause an error)
	 *  - keep or trim the line end
	 *  - keep or trim the leading spaces
	 *  - allow comments interleaved with continuing lines (the freaky systemd syntax)
	 * 
	 * Related specifications:
	 *  - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html
	 *  - https://www.freedesktop.org/software/systemd/man/systemd.syntax.html
	 */
	bool trimLeadingSpacesOnContinuingLines = true;


	/**
	 * Some dialects or configuration files in general do not support sections.
	 * Then a line, that looks like an INI section, should be interpreted as a key
	 * (or error, if does not have a proper key-value separator).
	 */
	bool allowSections = true;

	/**
	 * KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
	 * Line „[section_1][$i]“ means that the „section_1“ is „locked“.
	 * We may emit this information somehow later, but for now, it is just ignored.
	 * 
	 * TODO: Is „section tag“ right name?
	 * 
	 * Related specifications:
	 *  - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Lock_Down
	 */
	bool allowSectionTags = true;

	/**
	 * If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key.
	 * No \[ escaping is currently supported, so the key might not contain the bracket character.
	 * 
	 * Related specifications:
	 *  - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Shell_Expansion
	 *  - https://specifications.freedesktop.org/desktop-entry-spec/latest/ar01s05.html
	 */
	bool allowSubKeys = true;

	/**
	 * Classic INI uses „key=value“ syntax.
	 * But some other formats/dialects might use key:value.
	 * 
	 * Only single character separators are supported.
	 * If multiple separators should be recognized (e.g. both „=“ and „:“), this string will contain all of them,
	 * i.e. „:=“ does not mean the „key:=value“ syntax, but „key=value“ or „key:value“.
	 */
	std::string keyValueSeparators = "=";

	/**
	 * Classic INI uses „; comment“ syntax.
	 * But many existing files contain „# comment“ lines.
	 * 
	 * Only single character separators are supported (works same as keyValueSeparators).
	 */
	std::string commentSeparators = ";#";

	/**
	 * INI often support both "quotes" and 'apostrophes' styles.
	 * But some dialects may support only one of them or not support quoting at all.
	 * 
	 * In such case e.g. „key="some value"“ would mean that the value is „"some value"“ (including the quotes).
	 * Thus it is important to allow disabling quote recognizing (which is done by setting this parameter to empty string).
	 * 
	 * Only single character quotes are supported (works same as keyValueSeparators).
	 */
	std::string quotes = "\"'";

	/** Number of the line that is currently being read; starts at 1 and is incremented by get() for each consumed \n. */
	int lineNumber = 1;
	/** Number of the last emitted event; it is pre-incremented before each numbered event, so the first one gets number 1. */
	int eventNumber = 0;

	/**
	 * Should be always used instead of input.peek().
	 * Skips (consumes) any run of \r characters, so the returned character is never \r.
	 * This matches get(), which also drops all consecutive \r characters.
	 *
	 * @return the next significant character without consuming it;
	 *  at the end of the input, the EOF value of the stream converted to char
	 */
	char peek() {
		// In 2020 there is no need to manually return the carriage. However some legacy systems still do it.
		char ch = input.peek();
		// a loop (not a single check): after „\r\r“ we must not report \r to the caller,
		// because a subsequent get() would skip all of them and return the character that follows
		while (ch == '\r') {
			input.get();
			ch = input.peek();
		}
		return ch;
	}

	/**
	 * Should be always used instead of input.get().
	 * Consumes one character, dropping every \r that precedes it,
	 * and increments lineNumber if the returned character is \n.
	 *
	 * @return the consumed character (never \r)
	 */
	char get() {
		char consumed = input.get();
		while (consumed == '\r') consumed = input.get();
		if (consumed == '\n') ++lineNumber;
		return consumed;
	}

	/**
	 * Consumes a run of spaces and tabs (stops at anything else, including line ends).
	 *
	 * @return the consumed characters
	 */
	std::string readSpacesAndTabs() {
		std::string consumed;
		char next = peek();
		while (input.good() && (next == ' ' || next == '\t')) {
			consumed.push_back(get());
			next = peek();
		}
		return consumed;
	}

	/**
	 * Consumes a run of whitespace: spaces, tabs and line ends.
	 *
	 * @return the consumed characters as returned by get()
	 */
	std::string readAllWhitespace() {
		std::string consumed;
		char next = peek();
		while (input.good() && (next == ' ' || next == '\t' || next == '\n' || next == '\r')) {
			consumed.push_back(get());
			next = peek();
		}
		return consumed;
	}

	/**
	 * Called when an escaped line end was read, i.e. the logical line continues on the next physical line.
	 * Depending on trimLeadingSpacesOnContinuingLines, either the leading spaces and tabs
	 * of the next line are skipped, or a line break is appended to the result.
	 *
	 * @param result the text collected so far
	 */
	void processContinuingLine(std::stringstream& result) {
		if (!trimLeadingSpacesOnContinuingLines) {
			result.put('\n');
			return;
		}
		readSpacesAndTabs();
	}

	std::string readUntil(const char until, bool* found = nullptr) {
		return readUntil(std::string(1, until), found);
	}

	/**
	 * Reads the text up to one of the terminator characters.
	 * The terminator is consumed, but it is not part of the returned text.
	 *
	 * @param until set of single-character terminators; any of them ends the text
	 * @param found optional output; set to true if a terminator was found and consumed
	 *  (or if a line ended in the line-continuation-with-space mode), false if the input ended first
	 * @return the text that was read
	 */
	std::string readUntil(const std::string& until, bool* found = nullptr) {
		std::stringstream result;

		for (char ch = peek(); input.good(); ch = peek()) {
			if (allowLineContinuationsWithSpace && ch == '\n') {
				// the line end is consumed; what follows decides whether the logical line continues
				get();
				ch = peek();
				if (ch == ' ') get(); // continuation: the line end and this single space are dropped
				else if (ch == std::istream::traits_type::eof()) break;
				else {
					// the next line does not start with a space, so the logical line ends here;
					// this is reported as „found“ regardless of the requested terminators
					if (found) *found = true;
					return result.str();
				}
			} else if (oneOf(ch, until)) {
				// unescaped terminator; it is consumed after the loop
				break;
			} else if (allowLineContinuationsWithEscaping && ch == '\\') {
				get(); // the backslash
				ch = get(); // the escaped character
				if (oneOf(ch, until) && ch == '\n') processContinuingLine(result); // escaped line end while reading until the line end
				else if (oneOf(ch, until)) result.put(ch); // escaped terminator: kept as a plain character, the backslash is dropped
				else if (ch == std::istream::traits_type::eof()) break; // input ended right after the backslash, which is dropped
				else result.put('\\').put(ch); // any other escape sequence is passed through unchanged
				// unescaping is done in two phases:
				// here we unescape just the \n (LF)
				// other escape sequences are left untouched and will be processed in later phases, see UnescapingINIHandler
			} else {
				ch = get();
				result.put(ch);
			}
		}

		// consume the terminator (if any) and report whether we have seen it
		if (oneOf(peek(), until)) {
			get();
			if (found) *found = true;
		} else {
			if (found) *found = false;
		}

		return result.str();
	}

	std::string readToken(const char until, char* quote = nullptr, bool* found = nullptr) {
		return readToken(std::string(1, until), quote, found);
	}

	/**
	 * Reads a token that might be quoted.
	 * If the token starts with one of the configured quote characters, it is read up to the matching
	 * closing quote; otherwise it is read up to one of the given terminators.
	 *
	 * @param until set of terminators used for an unquoted token
	 * @param quote optional output; the quote character that was used, or 0 if the token was not quoted
	 * @param found optional output, passed to readUntil(): whether the closing quote or the terminator was found
	 * @return the token without the quotes or the terminator
	 */
	std::string readToken(const std::string& until, char* quote = nullptr, bool* found = nullptr) {
		const char first = peek();

		if (!isQuote(first)) {
			if (quote) *quote = 0;
			return readUntil(until, found);
		}

		if (quote) *quote = first;
		const std::string closingQuote(1, get());
		return readUntil(closingQuote, found);
	}

	std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) {
		return readTokenAndEatTerminator(std::string(1, until), quote, found);
	}

	/**
	 * Reads a token like readToken() and makes sure that the terminator is consumed in both cases:
	 * for an unquoted token, the terminator is consumed by readToken() itself;
	 * for a quoted token, readToken() stops after the closing quote, so the whitespace
	 * and the terminator that follow are consumed here.
	 *
	 * @param until set of single-character terminators
	 * @param quote optional output; the quote character that was used, or 0 if the token was not quoted (may be nullptr)
	 * @param found optional output, passed to readToken()
	 * @return the token without the quotes or the terminator
	 * @throws std::logic_error if a quoted token is not followed by a terminator
	 */
	std::string readTokenAndEatTerminator(const std::string& until, char* quote = nullptr, bool* found = nullptr) {
		// the caller may omit the quote output (default nullptr), but we need the value here
		char localQuote = 0;
		char* effectiveQuote = quote ? quote : &localQuote;

		std::string result = readToken(until, effectiveQuote, found);
		if (*effectiveQuote) {
			readAllWhitespace();
			if (!oneOf(get(), until)) throw std::logic_error(std::string("missing „") + until + "“ after quoted section name");
		}
		return result;
	}

	std::string unescape(const std::string& value, UnescapingProcessor::TextType type) {
		std::string result = value;
		for (ConfiguredUnescapingProcessor p : unescapingProcessors) if (p.enabled) result = p.processor->unescape(result, type);
		return result;
	}

	/**
	 * @return whether ch is one of the configured comment separators
	 */
	bool isComment(char ch) {
		return commentSeparators.find(ch) != std::string::npos;
	}

	/**
	 * @return whether ch is one of the configured quote characters
	 */
	bool isQuote(char ch) {
		return quotes.find(ch) != std::string::npos;
	}

	/**
	 * Tests whether a character belongs to a set of characters.
	 *
	 * @param ch character to be evaluated
	 * @param options set of accepted characters, written as a string
	 * @return true if ch occurs in options, false otherwise (always false for empty options)
	 */
	bool oneOf(char ch, const std::string& options) {
		for (const char option : options) {
			if (option == ch) return true;
		}
		return false;
	}

	/**
	 * Removes the leading and trailing whitespace.
	 *
	 * @param s text to be trimmed
	 * @return the text without the whitespace at its beginning and end; whitespace inside the text is kept
	 */
	std::string trim(std::string s) {
		// compiled only once: construction of std::regex is expensive and this method is called for each key and value
		static const std::regex surroundingWhitespace("^\\s+|\\s+$");
		return std::regex_replace(s, surroundingWhitespace, "");
	}

	/**
	 * Converts the textual form of a boolean to bool.
	 * Only the exact strings „true“ and „false“ are accepted.
	 *
	 * TODO: use a common method
	 *
	 * @param value text to be parsed
	 * @return the parsed value
	 * @throws std::invalid_argument if the value is neither „true“ nor „false“
	 */
	bool parseBoolean(const std::string& value) {
		if (value == "true") return true;
		if (value == "false") return false;
		throw std::invalid_argument("Unable to parse boolean value: " + value + " (expecting true or false)");
	}

	void setDialect(const std::string& uri) {
		for (ConfiguredDialect& d : dialects) {
			if (d.uri == uri) {
				d.dialect->apply(*this);
				return;
			}
		}
		throw std::invalid_argument(std::string("Unsupported INI dialect: ") + uri);
	}

	bool setUnescaping(const std::string& uri, const std::string& value) {
		for (ConfiguredUnescapingProcessor& p : unescapingProcessors) {
			if (p.uri == uri) {
				p.enabled = parseBoolean(value);
				return true;
			}
		}
		return false;
	}

public:

	INIReaderImpl(std::istream& input) : input(input) {
	}

	/**
	 * Configures the parser.
	 * The URI is matched against the known parser options first, then against the dialect option
	 * and finally against the URIs of the registered unescaping processors.
	 *
	 * @param uri identifier of the option
	 * @param value new value; boolean options accept only „true“ and „false“
	 * @throws std::invalid_argument if the option is unknown, the boolean value is invalid or the dialect is not supported
	 */
	void setOption(const std::string& uri, const std::string& value) override {
		if (uri == option::AllowLineContinuationWithEscaping) {
			allowLineContinuationsWithEscaping = parseBoolean(value);
			return;
		}
		if (uri == option::AllowLineContinuationWithSpace) {
			allowLineContinuationsWithSpace = parseBoolean(value);
			return;
		}
		if (uri == option::TrimContinuingLines) {
			trimLeadingSpacesOnContinuingLines = parseBoolean(value);
			return;
		}
		if (uri == option::AllowSections) {
			allowSections = parseBoolean(value);
			return;
		}
		if (uri == option::AllowSectionTags) {
			allowSectionTags = parseBoolean(value);
			return;
		}
		if (uri == option::AllowSubKeys) {
			allowSubKeys = parseBoolean(value);
			return;
		}
		if (uri == option::CommentSeparators) {
			commentSeparators = value;
			return;
		}
		if (uri == option::KeyValueSeparators) {
			keyValueSeparators = value;
			return;
		}
		if (uri == option::Quotes) {
			quotes = value;
			return;
		}
		if (uri == option::Dialect) {
			setDialect(value);
			return;
		}
		// not a built-in option: it might be the switch of one of the unescaping processors
		if (setUnescaping(uri, value)) return;

		throw std::invalid_argument(std::string("Invalid parser option: „") + uri + "“ with value: „" + value + "“");
	}

	/**
	 * Registers a handler that will receive the events emitted by process().
	 * Handlers are notified in the order in which they were added.
	 *
	 * @param handler the handler; only the pointer is stored
	 */
	void addHandler(INIContentHandler* handler) override {
		handlers.emplace_back(handler);
	}

	/**
	 * Registers an unescaping processor at the end of the list of processors.
	 *
	 * @param processor the processor
	 * @param uri option URI that can be later used in setOption() to enable or disable this processor
	 * @param enabledByDefault initial state of the processor
	 */
	void addUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enabledByDefault) override {
		unescapingProcessors.emplace_back(processor, uri, enabledByDefault);
	}

	/**
	 * Registers a dialect, so it can be later selected by its URI.
	 *
	 * @param dialect the dialect
	 * @param uri identifier of the dialect
	 * @param enabledByDefault if true, the dialect is applied to this reader immediately after the registration
	 */
	void addDialect(std::shared_ptr<Dialect> dialect, const std::string uri, bool enabledByDefault) override {
		dialects.emplace_back(dialect, uri);
		if (!enabledByDefault) return;
		dialect->apply(*this);
	}

	void process() override {
		for (INIContentHandler* handler : handlers) handler->startDocument();

		bool inSection = false;

		while (input.good()) { // TODO: condition
			{
				INIContentHandler::WhitespaceEvent event;
				event.lineNumber = lineNumber;
				std::string whitespace = readAllWhitespace();
				if (whitespace.size()) {
					event.eventNumber = ++eventNumber;
					event.whitespace = whitespace;
					for (INIContentHandler* handler : handlers) handler->whitespace(event);
				}
			}

			bool found;
			char quote;

			char ch = peek();

			if (ch == std::istream::traits_type::eof()) {
				break;
			} else if (ch == '[' && allowSections) {
				if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
				inSection = true;
				INIContentHandler::SectionStartEvent event;
				event.lineNumber = lineNumber;
				event.eventNumber = ++eventNumber;
				get();
				readAllWhitespace();
				event.name = readTokenAndEatTerminator(']', &quote, &found);
				if (!quote) event.name = trim(event.name);
				event.name = unescape(event.name, UnescapingProcessor::TextType::SectionName);

				readSpacesAndTabs();
				if (allowSectionTags && peek() == '[') {
					get();
					event.tag = readTokenAndEatTerminator(']', &quote, &found);
					event.tag = unescape(event.tag, UnescapingProcessor::TextType::SectionTag);
				}

				readSpacesAndTabs();
				ch = peek();
				if (isComment(ch)) {
					get();
					readSpacesAndTabs();
					event.comment = readUntil('\n', &found);
					event.comment = unescape(event.comment, UnescapingProcessor::TextType::SectionComment);
				} else if (ch == '\n') {
					get();
				} else {
					throw std::logic_error(std::string("unexpected content after the section: '") + event.name + "'");
				}

				for (INIContentHandler* handler : handlers) handler->startSection(event);
			} else if (isComment(ch)) {
				INIContentHandler::CommentEvent event;
				event.lineNumber = lineNumber;
				event.eventNumber = ++eventNumber;
				get();
				readSpacesAndTabs();
				event.comment = readUntil('\n', &found);
				event.comment = unescape(event.comment, UnescapingProcessor::TextType::Comment);
				for (INIContentHandler* handler : handlers) handler->comment(event);
			} else {
				INIContentHandler::EntryEvent event;
				event.lineNumber = lineNumber;
				event.eventNumber = ++eventNumber;

				std::string fullKey = readToken(keyValueSeparators, &quote, &found);
				if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'");
				if (!quote) fullKey = trim(fullKey);
				readSpacesAndTabs();

				if (quote) {
					ch = get();
					if (oneOf(ch, keyValueSeparators)) readSpacesAndTabs();
					else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'");
				}

				std::string value = readToken('\n', &quote, &found);
				if (!quote) value = trim(value);

				event.key = fullKey;
				event.fullKey = fullKey;
				event.value = value;

				if (allowSubKeys) {
					std::smatch match;
					if (std::regex_match(fullKey, match, std::regex("([^\\[]+)\\[([^\\[]+)\\]"))) {
						event.key = match[1];
						event.subKey = match[2];
						event.fullKey = fullKey;
						event.subKey = unescape(event.subKey, UnescapingProcessor::TextType::EntryKey);
					}
				}

				event.key = unescape(event.key, UnescapingProcessor::TextType::EntryKey);
				event.fullKey = unescape(event.fullKey, UnescapingProcessor::TextType::EntryKey);
				event.value = unescape(event.value, UnescapingProcessor::TextType::EntryValue);

				if (quote) {
					readSpacesAndTabs();
					ch = peek();
					if (isComment(ch)) {
						get();
						readSpacesAndTabs();
						event.comment = readUntil('\n', &found);
						event.comment = unescape(event.comment, UnescapingProcessor::TextType::EntryComment);
					} else if (ch == '\n') {
						get();
					} else {
						// TODO: optional support for multiple tokens in a single entry?
						// modes: array, concatenate
						// some-array-1 = "item 1" "item 2" 'item 3' item 4
						// some-array-2 = "item 1" "item 2" 'item 3' item_4 item_5
						// some-bash-style-string-value = "this "will' be' concatenated → this will be concatenated
						throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'");
					}
				}

				for (INIContentHandler* handler : handlers) handler->entry(event);
			}
		}
		// TODO: error at the end, catch premature/unexpected EOF
		// TODO: unescape + trim values + ignore \r
		// TODO: count lines
		if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
		for (INIContentHandler* handler : handlers) handler->endDocument();
	}

	// General features:
	// TODO: warning/error handler
	// TODO: support also escaped characters
	// TODO: support also Java .properties and manifest.mf formats?
	// TODO: support also nested sections – hierarchy
	// TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections
	// TODO: support also option for alternative key-value separator (: instead of =)
	// TODO: support also other encodings (currently only UTF-8 is supported)
	// TODO: better exceptions

	// Lossless conversions:
	// TODO: emit also the quote style ('/"/)
	// TODO: emit also the comment style (;/#) ?
	// TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ?
	// TODO: emit also the line-end type (LF/CRLF) ?

};

/**
 * Factory method: creates the default implementation of the INI reader.
 *
 * @param input stream with the INI text
 * @return a reader allocated with new; nothing in this file deletes it
 */
INIReader* INIReader::create(std::istream& input) {
	INIReader* reader = new INIReaderImpl(input);
	return reader;
}

}
}
}
}