# HG changeset patch # User František Kučera # Date 1606337426 -3600 # Node ID 80e129ec340889785c32cb062cbd4b42fb3d09f1 # Parent ee70b17950bdad7b845f26c07becad0d99a6954e new INI parser diff -r ee70b17950bd -r 80e129ec3408 src/XMLDocumentConstructor.h --- a/src/XMLDocumentConstructor.h Mon Nov 23 21:09:46 2020 +0100 +++ b/src/XMLDocumentConstructor.h Wed Nov 25 21:50:26 2020 +0100 @@ -20,8 +20,11 @@ #include #include "lib/INIReader.h" +#include "lib/BasicUnescapingINIHandler.h" #include "lib/XMLNameCodec.h" +using namespace relpipe::in::ini::lib; + namespace relpipe { namespace in { namespace xmltable { @@ -107,7 +110,8 @@ void process() { HierarchicalINIContentHandler handler(parser); std::shared_ptr reader(INIReader::create(*input)); - reader->addHandler(&handler); + BasicUnescapingINIContentHandler unescapingHandler(handler, true); /* true = last unescaping phase: every remaining escape sequence must be recognized, an unsupported one makes unescape() throw std::logic_error */ + reader->addHandler(&unescapingHandler); /* the reader feeds the unescaping wrapper, which forwards the unescaped events to the hierarchical handler */ reader->process(); } }; diff -r ee70b17950bd -r 80e129ec3408 src/lib/BasicUnescapingINIHandler.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib/BasicUnescapingINIHandler.h Wed Nov 25 21:50:26 2020 +0100 @@ -0,0 +1,74 @@ +/** + * Relational pipes + * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#pragma once + +#include + +#include "UnescapingINIHandler.h" + +using namespace std; +using namespace relpipe::writer; + +namespace relpipe { +namespace in { +namespace ini { +namespace lib { +/** Unescaping handler for the basic backslash sequences (\n, \r, \t, \s, escaped delimiters and \\); it only implements unescape(). */ +class BasicUnescapingINIContentHandler : public UnescapingINIContentHandler { +protected: + /** Returns s with the supported escape sequences replaced. In the last phase an unsupported sequence throws std::logic_error, otherwise it is kept for a later phase; a lone ESC at the very end of s always throws std::logic_error. */ + virtual std::string unescape(const std::string& s) { + std::stringstream result; + for (int i = 0, length = s.size(); i < length; i++) { + char ch = s[i]; + if (i + 1 < length && ch == ESC) { + ch = s[i + 1]; + if (ch == 'n') put(result, '\n', i); + else if (ch == 'r') put(result, '\r', i); + else if (ch == 't') put(result, '\t', i); + else if (ch == 's') put(result, ' ', i); // TODO: Reconsider what is „basic“ escaping and should be supported. + else if (ch == '"') put(result, ch, i); // The delimiters (\n,]",') are already unescaped during the first stage in the INIReader while parsing (the delimiter relevant to the given context is unescaped, e.g. \" in a "quoted" value). + else if (ch == '\'') put(result, ch, i); // So it is not necessary to do it here. But someone might write a="xxx\'zzz", although that is superfluous because a="xxx'zzz" also works. + else if (ch == ']') put(result, ch, i); + else if (ch == ':') put(result, ch, i); + else if (ch == ';') put(result, ch, i); + else if (ch == '#') put(result, ch, i); + else if (ch == '=') put(result, ch, i); + else if (ch == ESC && !lastEscaphingPhase) put(result, ESC, i).put(ESC); // copy both backslashes and skip the second one so that it is not misinterpreted in the next iteration + else if (ch == ESC && lastEscaphingPhase) put(result, ESC, i); // unescape \\ to \. 
+ else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch); + else result.put(ESC); // keep the escape sequence for a later unescaping phase + } else if (ch == ESC) { + throw std::logic_error(std::string("Missing escape sequence")); // a lone ESC as the very last character; this should not happen + } else { + result.put(ch); + } + } + return result.str(); + } + +public: + + BasicUnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : UnescapingINIContentHandler(output, lastEscaphingPhase) { + } + +}; + +} +} +} +} diff -r ee70b17950bd -r 80e129ec3408 src/lib/INIContentHandler.h --- a/src/lib/INIContentHandler.h Mon Nov 23 21:09:46 2020 +0100 +++ b/src/lib/INIContentHandler.h Wed Nov 25 21:50:26 2020 +0100 @@ -18,6 +18,11 @@ #include +namespace relpipe { +namespace in { +namespace ini { +namespace lib { + class INIContentHandler { public: @@ -31,6 +36,7 @@ public: std::string comment; std::string name; + std::string tag; }; class EntryEvent : public Event { @@ -61,3 +67,8 @@ virtual void comment(const CommentEvent& event) = 0; virtual void whitespace(const WhitespaceEvent& event) = 0; }; + +} +} +} +} diff -r ee70b17950bd -r 80e129ec3408 src/lib/INIReader.cpp --- a/src/lib/INIReader.cpp Mon Nov 23 21:09:46 2020 +0100 +++ b/src/lib/INIReader.cpp Wed Nov 25 21:50:26 2020 +0100 @@ -17,13 +17,169 @@ #include #include +#include +#include #include "INIReader.h" +namespace relpipe { +namespace in { +namespace ini { +namespace lib { + class INIReaderImpl : public INIReader { private: std::istream& input; std::vector handlers; + + /** + * This might be configurable. + * + * By default, we ignore all leading whitespace on continuing lines. + * If there should be some spaces or tabs, they should be placed on the previous line before the „\“. + * If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '. 
+ * + * Related specifications: + * - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html + */ + bool consumeLeadingSpacesOnContinuingLines = true; + + /** + * This might be configurable. + * + * KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signals some properties of the given section. + * Line „[section_1][$i]“ means that the „section_1“ is „locked“. + * The tag is reported to the handlers in SectionStartEvent::tag. + * + * TODO: Is „section tag“ right name? + * + * Related specifications: + * - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Lock_Down + */ + bool allowSectionTags = true; + + /** + * This might be configurable. + * + * If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key. + * No \[ escaping is currently supported, so the key might not contain the bracket character. + * + * Related specifications: + * - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Shell_Expansion + * - https://specifications.freedesktop.org/desktop-entry-spec/latest/ar01s05.html + */ + bool allowSubKeys = true; + + int lineNumber = 1; + int eventNumber = 0; + + /** + * Should be always used instead of input.peek(). + * Skips \r. At the end of input the EOF marker is returned narrowed to char. + */ + char peek() { + // In 2020 there is no need to manually return the carriage. However some legacy systems still do it. + char ch = input.peek(); + if (ch == '\r') { + input.get(); + ch = input.peek(); + } + return ch; + } + + /** + * Should be always used instead of input.get(). + * Counts the lines and skips \r. 
+ */ + char get() { + char ch = input.get(); + if (ch == '\n') lineNumber++; + else if (ch == '\r') ch = get(); + return ch; + } + /** Consumes and returns a run of spaces and tabs; stops at any other character, including a line end. */ + std::string readSpacesAndTabs() { + std::stringstream result; + for (char ch = peek(); input.good() && (ch == ' ' || ch == '\t'); ch = peek()) result.put(get()); + return result.str(); + } + /** Consumes and returns a run of spaces, tabs and line ends. */ + std::string readAllWhitespace() { + std::stringstream result; + for (char ch = peek(); input.good() && (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'); ch = peek()) result.put(get()); + return result.str(); + } + /** Called after a backslash followed by a line end: drops the leading spaces and tabs of the continuing line, or (if that option is off) puts a line break into result. */ + void processContinuingLine(std::stringstream& result) { + if (consumeLeadingSpacesOnContinuingLines) readSpacesAndTabs(); + else result.put('\n'); + } + /** Reads up to the first unescaped until character, which is consumed but not returned. A backslash followed by until yields until itself (or a line continuation when until is the line end); any other backslash sequence is copied unchanged for a later unescaping phase. @param found if not null, tells whether the terminator was found before the input ended. */ + std::string readUntil(char until, bool* found = nullptr) { + std::stringstream result; + + for (char ch = peek(); input.good() && ch != until; ch = peek()) { + if (ch == '\\') { + get(); + ch = get(); + if (ch == until && ch == '\n') processContinuingLine(result); + else if (ch == until) result.put(ch); + else if (!input.good()) break; /* the backslash was the last character of the input; tested on the stream state because the EOF marker cannot be reliably recognized once narrowed to char */ + else result.put('\\').put(ch); + // TODO: two-stage and modular unescaping: here unescape only \+LF or more generally: unescape only the until character and leave the rest untouched + // second escaping stage move to separate class/wrapper (similar to hierarchical wrappers) + } else { + ch = get(); + result.put(ch); + } + } + + if (peek() == until) { + get(); + if (found) *found = true; + } else { + if (found) *found = false; + } + + return result.str(); + } + /** Reads one token: if it starts with a quote character, reads up to the matching closing quote, otherwise up to until. @param quote if not null, receives the quote character, or 0 for an unquoted token. @param found see readUntil(). */ + std::string readToken(char until, char* quote = nullptr, bool* found = nullptr) { + std::string result; + + char ch = peek(); + if (isQuote(ch)) { + if (quote) *quote = ch; + result = readUntil(get(), found); + } else { + if (quote) *quote = 0; + result = readUntil(until, found); + } + + return result; + } + /** Like readToken(), but after a quoted token it also skips whitespace and requires the until terminator; throws std::logic_error if the terminator is missing. */ + std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) { + std::string result = readToken(until, quote, found); + 
if (quote && *quote) { /* quote is optional (defaults to nullptr), so it must not be dereferenced unconditionally */ + readAllWhitespace(); + if (get() != until) throw std::logic_error(std::string("missing „") + std::string(1, until) + "“ after quoted section name"); + } + return result; + } + /** Tells whether ch starts a comment (# or ;). */ + bool isComment(char ch) { + return ch == '#' || ch == ';'; + } + /** Tells whether ch is a quote character (double or single quote). */ + bool isQuote(char ch) { + return ch == '"' || ch == '\''; + } + /** Returns s without leading and trailing whitespace. */ + std::string trim(std::string s) { + return std::regex_replace(s, std::regex("^\\s+|\\s+$"), ""); + } + public: INIReaderImpl(std::istream& input) : input(input) { @@ -34,124 +190,144 @@ } void process() override { - for (INIContentHandler* handler : handlers) handler->startDocument(); - std::regex whitespacePattrern("\\s*"); - std::regex commentPattrern("\\s*(;|#)\\s*(.*)"); - std::regex sectionPattrern("\\s*\\[\\s*([^\\]]+)\\s*\\]\\s*(\\[\\s*([^\\]]+)\\s*\\])?\\s*((;|#)\\s*(.*))?"); - std::regex entryQuotedPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(\"|')((?:(?!\\5).)*)(\\5)?\\s*((;|#)\\s*(.*))?"); - std::regex entryPlainPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(.*?)\\s*"); + bool inSection = false; /* true once the first section has started, so that endSection() is emitted before the next section and at the end of input */ + + while (input.good()) { // TODO: condition + { + std::string whitespace = readAllWhitespace(); + if (whitespace.size()) { + INIContentHandler::WhitespaceEvent event; + event.lineNumber = lineNumber; + event.eventNumber = ++eventNumber; + event.whitespace = whitespace; + for (INIContentHandler* handler : handlers) handler->whitespace(event); + } + } - std::smatch match; - bool inSection = false; - std::string line; - int lineNumber = 0; - int eventNumber = 0; + bool found; + char quote; + + char ch = peek(); - - while (std::getline(input, line)) { - lineNumber++; - - if (std::regex_match(line, match, whitespacePattrern)) { - INIContentHandler::WhitespaceEvent event; + if (!input.good()) { /* end of input: peek() has set eofbit; the stream state is tested because the EOF marker cannot be reliably recognized once narrowed to char */ + break; + } else if (ch == '[') { + if (inSection) for (INIContentHandler* handler : handlers) handler->endSection(); + inSection = true; + get(); + readAllWhitespace(); + 
INIContentHandler::SectionStartEvent event; event.lineNumber = lineNumber; event.eventNumber = ++eventNumber; - event.whitespace = match[0]; - for (INIContentHandler* handler : handlers) handler->whitespace(event); - } else if (std::regex_match(line, match, commentPattrern)) { + event.name = readTokenAndEatTerminator(']', &quote, &found); + + readSpacesAndTabs(); + if (allowSectionTags && peek() == '[') { + get(); + event.tag = readTokenAndEatTerminator(']', &quote, &found); + } + + readSpacesAndTabs(); + ch = peek(); + if (isComment(ch)) { + get(); + readSpacesAndTabs(); + event.comment = readUntil('\n', &found); + } else if (ch == '\n' || (found && !input.good())) { /* line end, or end of input right after a properly terminated section header (last line without a line end) */ + get(); + } else { + throw std::logic_error(std::string("unexpected content after the section: '") + event.name + "'"); + } + + for (INIContentHandler* handler : handlers) handler->startSection(event); + } else if (isComment(ch)) { + get(); + readSpacesAndTabs(); INIContentHandler::CommentEvent event; event.lineNumber = lineNumber; event.eventNumber = ++eventNumber; - event.comment = match[2]; + event.comment = readUntil('\n', &found); for (INIContentHandler* handler : handlers) handler->comment(event); - } else if (std::regex_match(line, match, sectionPattrern)) { - if (inSection) for (INIContentHandler* handler : handlers) handler->endSection(); - inSection = true; - INIContentHandler::SectionStartEvent event; - event.lineNumber = lineNumber; - event.eventNumber = ++eventNumber; - event.name = match[1]; - event.comment = match[6]; - // event.tag = match[3]; - // KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section. - // see , „[$i]“ means that the section is „locked“ - // We may emit this information somehow later, but for now, it is just ignored. 
- for (INIContentHandler* handler : handlers) handler->startSection(event); - } else if (std::regex_match(line, match, entryQuotedPattrern)) { + } else { + std::string fullKey = readToken('=', &quote, &found); + if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'"); + if (!quote) fullKey = trim(fullKey); + readSpacesAndTabs(); + + if (quote) { + ch = get(); + if (ch == '=') readSpacesAndTabs(); + else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'"); + } + + std::string value = readToken('\n', &quote, &found); + if (!quote) value = trim(value); /* NOTE(review): the value (and for unquoted values also its line end) has already been consumed here, so the lineNumber stored in the event below may point past the line where the entry started; confirm that this is intended */ + INIContentHandler::EntryEvent event; event.lineNumber = lineNumber; event.eventNumber = ++eventNumber; - event.key = match[2]; - event.subKey = match[4]; - event.fullKey = match[1]; - event.value = match[6]; - event.comment = match[10]; + event.key = fullKey; + event.fullKey = fullKey; + event.value = value; - // the "/' at the end is missing → line continues - if (match.length(7) == 0) { - std::regex endPattern(std::string("(.*?)") + (match[5] == "'" ? 
"'" : "\"") + "\\s*((;|#)\\s*(.*))?"); - while (std::getline(input, line)) { - lineNumber++; - event.value += "\n"; - if (std::regex_match(line, match, endPattern)) { - event.value += std::string(match[1]); - event.comment = match[4]; - break; - } else { - event.value += line; - } + if (allowSubKeys) { + std::smatch match; + if (std::regex_match(fullKey, match, std::regex("([^\\[]+)\\[([^\\[]+)\\]"))) { + event.key = match[1]; + event.subKey = match[2]; + event.fullKey = fullKey; + } + } + + if (quote) { + readSpacesAndTabs(); + ch = peek(); + if (isComment(ch)) { + get(); + readSpacesAndTabs(); + event.comment = readUntil('\n', &found); + } else if (ch == '\n' || (found && !input.good())) { /* line end, or end of input right after the closing quote (last line without a line end) */ + get(); + } else { + throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'"); } } for (INIContentHandler* handler : handlers) handler->entry(event); - } else if (std::regex_match(line, match, entryPlainPattrern)) { - INIContentHandler::EntryEvent event; - event.lineNumber = lineNumber; - event.eventNumber = ++eventNumber; - event.key = match[2]; - event.subKey = match[4]; - event.fullKey = match[1]; - event.value = match[5]; - - // the \ at the end → line continues - while (line.back() == '\\' && std::getline(input, line)) { - lineNumber++; - line = std::regex_replace(line, std::regex("^\\s+|\\s+$"), ""); // trim the spaces: continuing lines might be aligned to the first line (desired spaces – if any – should be at the line end before the \ character) - event.value = event.value.substr(0, event.value.size() - 1); // cut the trailing \ backslash - event.value = event.value + line; - } - - for (INIContentHandler* handler : handlers) handler->entry(event); - } else { - // TODO: warning, error, or support unknown content } - - // General feautres: - // TODO: probably switch to state-machine approach instead of regular expressions or use an existing library - // TODO: warning/error handler - // TODO: support also quoted or multiline 
keys? - // TODO: support also escaped characters - // TODO: support also Java .properties and manifest.mf formats? - // TODO: support also quoted sections ["qoted section"] – useful for hierarchy (the path element may contain the separator character) - // TODO: support also nested sections – hierarchy - // TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections - // TODO: support also option for alternative key-value separator (: instead of =) - // TODO: support also other encodings (currently only UTF-8 is supported) - - // Lossless conversions: - // TODO: emit also the quote style ('/"/) - // TODO: emit also the comment style (;/#) ? - // TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ? - // TODO: emit also the line-end type (LF/CRLF) ? } - + // TODO: error at the end, catch premature/unexpected EOF + // TODO: unescape + trim values + ignore \r + // TODO: count lines if (inSection) for (INIContentHandler* handler : handlers) handler->endSection(); - for (INIContentHandler* handler : handlers) handler->endDocument(); } + + // General features: + // TODO: warning/error handler + // TODO: support also escaped characters + // TODO: support also Java .properties and manifest.mf formats? + // TODO: support also nested sections – hierarchy + // TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections + // TODO: support also option for alternative key-value separator (: instead of =) + // TODO: support also other encodings (currently only UTF-8 is supported) + // TODO: better exceptions + + // Lossless conversions: + // TODO: emit also the quote style ('/"/) + // TODO: emit also the comment style (;/#) ? + // TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ? + // TODO: emit also the line-end type (LF/CRLF) ? 
+ }; INIReader* INIReader::create(std::istream& input) { return new INIReaderImpl(input); } + +} +} +} +} diff -r ee70b17950bd -r 80e129ec3408 src/lib/INIReader.h --- a/src/lib/INIReader.h Mon Nov 23 21:09:46 2020 +0100 +++ b/src/lib/INIReader.h Wed Nov 25 21:50:26 2020 +0100 @@ -21,6 +21,11 @@ #include "INIContentHandler.h" +namespace relpipe { +namespace in { +namespace ini { +namespace lib { + /** * TODO: Files in the src/lib directory will be moved to alt2xml and used as a shared library. */ @@ -31,3 +36,8 @@ virtual void process() = 0; static INIReader* create(std::istream& input); }; + +} +} +} +} diff -r ee70b17950bd -r 80e129ec3408 src/lib/UnescapingINIHandler.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib/UnescapingINIHandler.h Wed Nov 25 21:50:26 2020 +0100 @@ -0,0 +1,99 @@ +/** + * Relational pipes + * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#pragma once + +#include + +#include "INIReader.h" + +using namespace std; +using namespace relpipe::writer; + +namespace relpipe { +namespace in { +namespace ini { +namespace lib { +/** Base class of handlers that unescape section names and entry keys/values and forward all events to another INIContentHandler; subclasses implement unescape(). */ +class UnescapingINIContentHandler : public INIContentHandler { +private: + INIContentHandler& output; + +protected: + const char ESC = '\\'; + bool lastEscaphingPhase; + /** Appends ch to result and increments i; returns result so that calls can be chained (callers use it to skip the character that follows ESC). */ + std::stringstream& put(std::stringstream& result, const char& ch, int& i) { + result.put(ch); + i++; + return result; + } + /** Returns the unescaped form of s; see the constructor for the meaning of lastEscaphingPhase. */ + virtual std::string unescape(const std::string& s) = 0; + +public: + + /** + * @param output handler that receives the events with unescaped values + * @param lastEscaphingPhase instances of UnescapingINIContentHandler might be chained: + * unsupported escaping sequences are kept untouched to be processed in further phases; + * in the last phase, all remaining sequences (including \\) must be recognized and unescaped + * (otherwise the input is considered invalid and an exception is thrown) + */ + UnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : output(output), lastEscaphingPhase(lastEscaphingPhase) { + } + /** Forwarded to the output handler unchanged. */ + void startDocument() override { + output.startDocument(); + } + /** Forwarded to the output handler unchanged. */ + void endDocument() override { + output.endDocument(); + } + /** Forwards a copy of the event with the section name unescaped; the other fields (e.g. comment, tag) are copied unchanged. */ + void startSection(const SectionStartEvent& event) override { + SectionStartEvent e = event; + e.name = unescape(e.name); + output.startSection(e); + } + /** Forwarded to the output handler unchanged. */ + void endSection() override { + output.endSection(); + } + /** Forwards a copy of the event with key, fullKey, subKey and value unescaped. */ + void entry(const EntryEvent& event) override { + EntryEvent e = event; + e.key = unescape(e.key); + e.fullKey = unescape(e.fullKey); + e.subKey = unescape(e.subKey); + e.value = unescape(e.value); + output.entry(e); + } + /** Comments are forwarded unchanged (not unescaped). */ + void comment(const CommentEvent& event) override { + output.comment(event); + } + /** Whitespace is forwarded unchanged. */ + void whitespace(const WhitespaceEvent& event) override { + output.whitespace(event); + } + +}; + +} +} +} +}