diff -r fd669e73d39a -r 0e7c57d48d1e src/lib/INIReader.cpp --- a/src/lib/INIReader.cpp Thu Nov 26 11:42:26 2020 +0100 +++ b/src/lib/INIReader.cpp Sat Nov 28 18:14:15 2020 +0100 @@ -32,21 +32,47 @@ std::istream& input; std::vector handlers; + class ConfiguredUnescapingProcessor { + public: + std::shared_ptr processor; + const std::string uri; + bool enbaled; + + ConfiguredUnescapingProcessor(std::shared_ptr processor, const std::string uri, bool enbaled) : processor(processor), uri(uri), enbaled(enbaled) { + } + + }; + + std::vector unescapingProcessors; + /** - * This might be configurable. - * * By default, we ignore all leading whitespace on continuing lines. * If there should be some spaces or tabs, they should be placed on the previous line before the „\“. * If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '. * + * TODO: several options: + * - enabled, disabled + * - if disabled, then: keep backslash, trim backslash, escape backslash + * (keep requires support in some further unescaping phase, or it will cause an error) + * - keep or trim the line end + * - keep or trim the leading spaces + * - allow comments interleaved with continuing lines (the freaky systemd syntax) + * * Related specifications: * - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html + * - https://www.freedesktop.org/software/systemd/man/systemd.syntax.html */ - bool consumeLeadingSpacesOnContinuingLines = true; + bool trimLeadingSpacesOnContinuingLines = true; + /** - * This might be configurable. - * + * Some dialects or configuration files in general do not support sections. + * A line that looks like an INI section should then be interpreted as a key + * (or as an error, if it does not have a proper key-value separator). + */ + bool allowSections = true; + + /** * KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section. 
* Line „[section_1][$i]“ means that the „section_1“ is „locked“. * We may emit this information somehow later, but for now, it is just ignored. @@ -59,8 +85,6 @@ bool allowSectionTags = true; /** - * This might be configurable. - * * If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key. * No \[ escaping is currently supported, so the key might not contain the bracket character. * @@ -70,6 +94,35 @@ */ bool allowSubKeys = true; + /** + * Classic INI uses „key=value“ syntax. + * But some other formats/dialects might use key:value. + * + * Only single character separators are supported. + * If multiple separators should be recognized (e.g. both „=“ and „:“), this string will contain all of them, + * i.e. „:=“ does not mean the „key:=value“ syntax, but either „key=value“ or „key:value“. + */ + std::string keyValueSeparators = "="; + + /** + * Classic INI uses „; comment“ syntax. + * But many existing files contain „# comment“ lines. + * + * Only single character separators are supported (works the same as keyValueSeparators). + */ + std::string commentSeparators = ";#"; + + /** + * INI files often support both "quotes" and 'apostrophes' styles. + * But some dialects may support only one of them or not support quoting at all. + * + * In such a case, e.g. „key="some value"“ would mean that the value is „"some value"“ (including the quotes). + * Thus it is important to allow disabling quote recognizing (which is done by setting this parameter to an empty string). + * + * Only single character quotes are supported (works the same as keyValueSeparators). 
+ */ + std::string quotes = "\"'"; + int lineNumber = 1; int eventNumber = 0; @@ -111,30 +164,35 @@ } void processContinuingLine(std::stringstream& result) { - if (consumeLeadingSpacesOnContinuingLines) readSpacesAndTabs(); + if (trimLeadingSpacesOnContinuingLines) readSpacesAndTabs(); else result.put('\n'); } - std::string readUntil(char until, bool* found = nullptr) { + std::string readUntil(const char until, bool* found = nullptr) { + return readUntil(std::string(1, until), found); + } + + std::string readUntil(const std::string& until, bool* found = nullptr) { std::stringstream result; - for (char ch = peek(); input.good() && ch != until; ch = peek()) { + for (char ch = peek(); input.good() && !oneOf(ch, until); ch = peek()) { if (ch == '\\') { get(); ch = get(); - if (ch == until && ch == '\n') processContinuingLine(result); - else if (ch == until) result.put(ch); + if (oneOf(ch, until) && ch == '\n') processContinuingLine(result); + else if (oneOf(ch, until)) result.put(ch); else if (ch == std::istream::traits_type::eof()) break; else result.put('\\').put(ch); - // TODO: two-stage and modular unescaping: here unescape only \+LF or more genereally: unescape only the until character and rest leave untouched - // second escaping stage move to separate class/wrapper (similar to hierarchical wrappers) + // unescaping is done in two phases: + // here we unescape just a backslash followed by a terminator character (e.g. backslash + LF, a continuing line) + // other escape sequences are left untouched and will be processed in later phases, see UnescapingINIHandler } else { ch = get(); result.put(ch); } } - if (peek() == until) { + if (oneOf(peek(), until)) { get(); if (found) *found = true; } else { @@ -144,13 +202,17 @@ return result.str(); } - std::string readToken(char until, char* quote = nullptr, bool* found = nullptr) { + std::string readToken(const char until, char* quote = nullptr, bool* found = nullptr) { + return readToken(std::string(1, until), quote, found); + } + + std::string readToken(const std::string& until, char* 
quote = nullptr, bool* found = nullptr) { std::string result; char ch = peek(); if (isQuote(ch)) { if (quote) *quote = ch; - result = readUntil(get(), found); + result = readUntil(std::string(1, get()), found); } else { if (quote) *quote = 0; result = readUntil(until, found); @@ -160,35 +222,107 @@ } std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) { + return readTokenAndEatTerminator(std::string(1, until), quote, found); + } + + std::string readTokenAndEatTerminator(const std::string& until, char* quote = nullptr, bool* found = nullptr) { std::string result = readToken(until, quote, found); if (*quote) { readAllWhitespace(); - if (get() != until) throw std::logic_error(std::string("missing „") + std::string(1, until) + "“ after quoted section name"); + if (!oneOf(get(), until)) throw std::logic_error(std::string("missing „") + until + "“ after quoted section name"); } return result; } + std::string unescape(const std::string& value, UnescapingProcessor::TextType type) { + std::string result = value; + for (ConfiguredUnescapingProcessor p : unescapingProcessors) if (p.enbaled) result = p.processor->unescape(result, type); + return result; + } + bool isComment(char ch) { - return ch == '#' || ch == ';'; + return oneOf(ch, commentSeparators); } bool isQuote(char ch) { - return ch == '"' || ch == '\''; + return oneOf(ch, quotes); + } + + /** + * @param ch character to be evaluated + * @param options list of options (characters) + * @return whether ch is one of options + */ + bool oneOf(char ch, const std::string& options) { + return options.find(ch) != std::string::npos; } std::string trim(std::string s) { return std::regex_replace(s, std::regex("^\\s+|\\s+$"), ""); } + /** + * TODO: use a common method + */ + bool parseBoolean(const std::string& value) { + if (value == "true") return true; + else if (value == "false") return false; + else throw std::invalid_argument(std::string("Unable to parse boolean value: ") + value 
+ " (expecting true or false)"); + } + + void setDialect(const std::string& name) { + if (name == "default-ini") { + // already set + } else if (name == "java-properties") { + trimLeadingSpacesOnContinuingLines = true; + allowSections = false; + allowSectionTags = false; + allowSubKeys = false; + commentSeparators = "#"; + keyValueSeparators = "=:"; + quotes = ""; + // TODO: enable unicode unescaping + } else { + throw std::invalid_argument(std::string("Unsupported INI dialect: ") + name); + } + } + + bool setUnescaping(const std::string& uri, const std::string& value) { + for (ConfiguredUnescapingProcessor& p : unescapingProcessors) { + if (p.uri == uri) { + p.enbaled = parseBoolean(value); + return true; + } + } + return false; + } + public: INIReaderImpl(std::istream& input) : input(input) { } + void setOption(const std::string& uri, const std::string& value) override { + if (uri == "trim-continuing-lines") trimLeadingSpacesOnContinuingLines = parseBoolean(value); // TODO: continuing lines modes (enum), not just boolean + else if (uri == "allow-sections") allowSections = parseBoolean(value); + else if (uri == "allow-section-tags") allowSectionTags = parseBoolean(value); + else if (uri == "allow-sub-keys") allowSubKeys = parseBoolean(value); + else if (uri == "comment-separators") commentSeparators = value; + else if (uri == "key-value-separators") keyValueSeparators = value; + else if (uri == "quotes") quotes = value; + else if (uri == "dialect") setDialect(value); + else if (setUnescaping(uri, value)); + else throw std::invalid_argument(std::string("Invalid parser option: „") + uri + "“ with value: „" + value + "“"); + } + void addHandler(INIContentHandler* handler) override { handlers.push_back(handler); } + void addUnescapingProcessor(std::shared_ptr processor, const std::string uri, bool enabledByDefault) override { + unescapingProcessors.push_back({processor, uri, enabledByDefault}); + } + void process() override { for (INIContentHandler* handler : 
handlers) handler->startDocument(); @@ -196,10 +330,10 @@ while (input.good()) { // TODO: condition { + INIContentHandler::WhitespaceEvent event; + event.lineNumber = lineNumber; std::string whitespace = readAllWhitespace(); if (whitespace.size()) { - INIContentHandler::WhitespaceEvent event; - event.lineNumber = lineNumber; event.eventNumber = ++eventNumber; event.whitespace = whitespace; for (INIContentHandler* handler : handlers) handler->whitespace(event); @@ -213,20 +347,23 @@ if (ch == std::istream::traits_type::eof()) { break; - } else if (ch == '[') { + } else if (ch == '[' && allowSections) { if (inSection) for (INIContentHandler* handler : handlers) handler->endSection(); inSection = true; - get(); - readAllWhitespace(); INIContentHandler::SectionStartEvent event; event.lineNumber = lineNumber; event.eventNumber = ++eventNumber; + get(); + readAllWhitespace(); event.name = readTokenAndEatTerminator(']', &quote, &found); + if (!quote) event.name = trim(event.name); + event.name = unescape(event.name, UnescapingProcessor::TextType::SectionName); readSpacesAndTabs(); if (allowSectionTags && peek() == '[') { get(); event.tag = readTokenAndEatTerminator(']', &quote, &found); + event.tag = unescape(event.tag, UnescapingProcessor::TextType::SectionTag); } readSpacesAndTabs(); @@ -235,6 +372,7 @@ get(); readSpacesAndTabs(); event.comment = readUntil('\n', &found); + event.comment = unescape(event.comment, UnescapingProcessor::TextType::SectionComment); } else if (ch == '\n') { get(); } else { @@ -243,31 +381,33 @@ for (INIContentHandler* handler : handlers) handler->startSection(event); } else if (isComment(ch)) { - get(); - readSpacesAndTabs(); INIContentHandler::CommentEvent event; event.lineNumber = lineNumber; event.eventNumber = ++eventNumber; + get(); + readSpacesAndTabs(); event.comment = readUntil('\n', &found); + event.comment = unescape(event.comment, UnescapingProcessor::TextType::Comment); for (INIContentHandler* handler : handlers) handler->comment(event); } 
else { - std::string fullKey = readToken('=', &quote, &found); + INIContentHandler::EntryEvent event; + event.lineNumber = lineNumber; + event.eventNumber = ++eventNumber; + + std::string fullKey = readToken(keyValueSeparators, &quote, &found); if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'"); if (!quote) fullKey = trim(fullKey); readSpacesAndTabs(); if (quote) { ch = get(); - if (ch == '=') readSpacesAndTabs(); + if (oneOf(ch, keyValueSeparators)) readSpacesAndTabs(); else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'"); } std::string value = readToken('\n', &quote, &found); if (!quote) value = trim(value); - INIContentHandler::EntryEvent event; - event.lineNumber = lineNumber; - event.eventNumber = ++eventNumber; event.key = fullKey; event.fullKey = fullKey; event.value = value; @@ -278,9 +418,14 @@ event.key = match[1]; event.subKey = match[2]; event.fullKey = fullKey; + event.subKey = unescape(event.subKey, UnescapingProcessor::TextType::EntryKey); } } + event.key = unescape(event.key, UnescapingProcessor::TextType::EntryKey); + event.fullKey = unescape(event.fullKey, UnescapingProcessor::TextType::EntryKey); + event.value = unescape(event.value, UnescapingProcessor::TextType::EntryValue); + if (quote) { readSpacesAndTabs(); ch = peek(); @@ -288,9 +433,15 @@ get(); readSpacesAndTabs(); event.comment = readUntil('\n', &found); + event.comment = unescape(event.comment, UnescapingProcessor::TextType::EntryComment); } else if (ch == '\n') { get(); } else { + // TODO: optional support for multiple tokens in a single entry? + // modes: array, concatenate + // some-array-1 = "item 1" "item 2" 'item 3' item 4 + // some-array-2 = "item 1" "item 2" 'item 3' item_4 item_5 + // some-bash-style-string-value = "this "will' be' concatenated → this will be concatenated throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'"); } }