--- a/src/XMLDocumentConstructor.h Mon Nov 23 21:09:46 2020 +0100
+++ b/src/XMLDocumentConstructor.h Wed Nov 25 21:50:26 2020 +0100
@@ -20,8 +20,11 @@
#include <libxml++-2.6/libxml++/libxml++.h>
#include "lib/INIReader.h"
+#include "lib/BasicUnescapingINIHandler.h"
#include "lib/XMLNameCodec.h"
+using namespace relpipe::in::ini::lib;
+
namespace relpipe {
namespace in {
namespace xmltable {
@@ -107,7 +110,8 @@
void process() {
HierarchicalINIContentHandler handler(parser);
std::shared_ptr<INIReader> reader(INIReader::create(*input));
- reader->addHandler(&handler);
+ BasicUnescapingINIContentHandler unescapingHandler(handler, true); // true = last (here the only) unescaping phase: unsupported escape sequences are rejected with an exception
+ reader->addHandler(&unescapingHandler); // the reader feeds the unescaping wrapper, which forwards the unescaped events to the hierarchical handler
reader->process();
}
};
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/BasicUnescapingINIHandler.h Wed Nov 25 21:50:26 2020 +0100
@@ -0,0 +1,92 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include "UnescapingINIHandler.h"
+
+using namespace std;
+using namespace relpipe::writer;
+
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
+/**
+ * Unescaping handler for the basic escape sequences: \n, \r, \t, \s (space), \\ and escaped delimiter characters (" ' ] : ; # =).
+ * The strings are unescaped by unescape(); the events are forwarded to the output handler by the UnescapingINIContentHandler base class.
+ */
+class BasicUnescapingINIContentHandler : public UnescapingINIContentHandler {
+protected:
+
+    /**
+     * Replaces the supported escape sequences in the given string.
+     *
+     * @param s string that may contain escape sequences introduced by the ESC character
+     * @return the string with the supported sequences unescaped; if this is not the last phase, unsupported sequences and \\ are kept as they are
+     * @throws std::logic_error in the last phase if a sequence is not supported, or in any phase if the string ends with a lone ESC character
+     */
+    std::string unescape(const std::string& s) override {
+        std::stringstream result;
+        for (int i = 0, length = static_cast<int>(s.size()); i < length; i++) {
+            char ch = s[i];
+            if (ch != ESC) {
+                result.put(ch);
+            } else if (i + 1 >= length) {
+                throw std::logic_error(std::string("Missing escape sequence")); // this should not happen
+            } else {
+                ch = s[i + 1]; // the character after ESC selects the sequence; put(…, i) also advances i, so this character is not processed again
+                switch (ch) {
+                    case 'n': put(result, '\n', i); break;
+                    case 'r': put(result, '\r', i); break;
+                    case 't': put(result, '\t', i); break;
+                    case 's': put(result, ' ', i); break; // TODO: Reconsider what is „basic“ escaping and should be supported.
+                    // The delimiters (\n,]",') are already unescaped during the first stage in the INIReader while parsing (the delimiter relevant to given environment is unescaped, e.g. \" in "quoted" value).
+                    // So it is not necessary to do it here. But someone might write a="xxx\'zzz"; it is superfluous because a="xxx'zzz" will also work.
+                    case '"': case '\'': case ']': case ':': case ';': case '#': case '=':
+                        put(result, ch, i);
+                        break;
+                    default:
+                        if (ch == ESC && !lastEscaphingPhase) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle
+                        else if (ch == ESC) put(result, ESC, i); // last phase: unescape \\ to a single backslash
+                        else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch);
+                        else result.put(ESC); // keep the escape sequence for a later unescaping phase (the following character is copied in the next cycle)
+                }
+            }
+        }
+        return result.str();
+    }
+
+public:
+
+    /**
+     * @param output here will be sent events with unescaped values
+     * @param lastEscaphingPhase whether all remaining escape sequences must be recognized in this phase (otherwise an exception is thrown)
+     */
+    BasicUnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : UnescapingINIContentHandler(output, lastEscaphingPhase) {
+    }
+
+};
+
+}
+}
+}
+}
--- a/src/lib/INIContentHandler.h Mon Nov 23 21:09:46 2020 +0100
+++ b/src/lib/INIContentHandler.h Wed Nov 25 21:50:26 2020 +0100
@@ -18,6 +18,11 @@
#include <string>
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
class INIContentHandler {
public:
@@ -31,6 +36,7 @@
public:
std::string comment;
std::string name;
+ std::string tag;
};
class EntryEvent : public Event {
@@ -61,3 +67,8 @@
virtual void comment(const CommentEvent& event) = 0;
virtual void whitespace(const WhitespaceEvent& event) = 0;
};
+
+}
+}
+}
+}
--- a/src/lib/INIReader.cpp Mon Nov 23 21:09:46 2020 +0100
+++ b/src/lib/INIReader.cpp Wed Nov 25 21:50:26 2020 +0100
@@ -17,13 +17,169 @@
#include <vector>
#include <regex>
+#include <sstream>
+#include <stdexcept>
#include "INIReader.h"
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
class INIReaderImpl : public INIReader {
private:
std::istream& input;
std::vector<INIContentHandler*> handlers;
+
+ /**
+ * This might be configurable.
+ *
+ * By default, we ignore all leading whitespace on continuing lines.
+ * If there should be some spaces or tabs, they should be placed on the previous line before the „\“.
+ * If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '.
+ *
+ * Related specifications:
+ * - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html
+ */
+ bool consumeLeadingSpacesOnContinuingLines = true;
+
+ /**
+ * This might be configurable.
+ *
+ * KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signals some properties of given section.
+ * Line „[section_1][$i]“ means that the „section_1“ is „locked“.
+ * The reader passes the tag to the handlers in the „tag“ field of the section start event.
+ *
+ * TODO: Is „section tag“ right name?
+ *
+ * Related specifications:
+ * - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Lock_Down
+ */
+ bool allowSectionTags = true;
+
+ /**
+ * This might be configurable.
+ *
+ * If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key.
+ * No \[ escaping is currently supported, so the key might not contain the bracket character.
+ *
+ * Related specifications:
+ * - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Shell_Expansion
+ * - https://specifications.freedesktop.org/desktop-entry-spec/latest/ar01s05.html
+ */
+ bool allowSubKeys = true;
+
+ int lineNumber = 1;
+ int eventNumber = 0;
+
+ /**
+ * Should be always used instead of input.peek().
+ * Returns the next character without consuming it; a \r at the current position is consumed and skipped, so the caller sees the character that follows it.
+ * NOTE(review): input.peek() returns an int (possibly EOF) that is narrowed to char here – callers that compare the result with traits_type::eof() depend on the signedness of char; confirm on all target platforms.
+ */
+ char peek() {
+ // In 2020 there is no need to manually return the carriage. However some legacy systems still do it.
+ char ch = input.peek();
+ if (ch == '\r') {
+ input.get(); // drop the \r …
+ ch = input.peek(); // … and look at the character behind it
+ }
+ return ch;
+ }
+
+ /**
+ * Should be always used instead of input.get().
+ * Consumes and returns the next character; increments lineNumber for each \n and skips \r (returns the character that follows it).
+ */
+ char get() {
+ char ch = input.get();
+ if (ch == '\n') lineNumber++;
+ else if (ch == '\r') ch = get(); // recursion: skips any number of consecutive \r characters
+ return ch;
+ }
+
+ /** Consumes the run of spaces and tabs at the current position (it never crosses a line break) and returns it. */
+ std::string readSpacesAndTabs() {
+     std::string blanks;
+     for (;;) {
+         const char next = peek();
+         if (!input.good() || (next != ' ' && next != '\t')) break;
+         blanks.push_back(get());
+     }
+     return blanks;
+ }
+
+ /** Consumes the run of spaces, tabs and line breaks at the current position and returns it. */
+ std::string readAllWhitespace() {
+     std::string collected;
+     for (;;) {
+         const char next = peek();
+         const bool isWhitespace = next == ' ' || next == '\t' || next == '\n' || next == '\r';
+         if (!input.good() || !isWhitespace) break;
+         collected.push_back(get());
+     }
+     return collected;
+ }
+
+ /**
+  * Called after an escaped line break (line continuation) was consumed.
+  * Depending on consumeLeadingSpacesOnContinuingLines either drops the leading spaces and tabs of the next line or appends a line break to the result.
+  */
+ void processContinuingLine(std::stringstream& result) {
+     if (!consumeLeadingSpacesOnContinuingLines) {
+         result.put('\n');
+         return;
+     }
+     readSpacesAndTabs();
+ }
+
+ /**
+  * Reads the text up to the first unescaped occurrence of the delimiter and consumes that delimiter (it is not part of the result).
+  *
+  * First stage of unescaping: only the escaped delimiter is unescaped here
+  * (an escaped line break, if it is the delimiter, is handled as a line continuation);
+  * any other escape sequence is copied untouched (backslash included) for a later unescaping phase.
+  *
+  * @param until the delimiter character
+  * @param found optional output: true if the delimiter was found, false if the input ended first
+  * @return the text before the delimiter
+  */
+ std::string readUntil(char until, bool* found = nullptr) {
+     std::stringstream result;
+
+     for (char ch = peek(); input.good() && ch != until; ch = peek()) {
+         if (ch == '\\') {
+             get();
+             ch = get();
+             // Test the stream state instead of comparing a char with EOF: the comparison never matches where char is unsigned
+             // and falsely matches the byte 0xFF where char is signed.
+             if (!input.good()) break; // the input ended right after the backslash
+             else if (ch == until && ch == '\n') processContinuingLine(result);
+             else if (ch == until) result.put(ch);
+             else result.put('\\').put(ch);
+             // TODO: two-stage and modular unescaping: here unescape only \+LF or more generally: unescape only the until character and rest leave untouched
+             // second escaping stage move to separate class/wrapper (similar to hierarchical wrappers)
+         } else {
+             ch = get();
+             result.put(ch);
+         }
+     }
+
+     if (peek() == until) {
+         get();
+         if (found) *found = true;
+     } else {
+         if (found) *found = false;
+     }
+
+     return result.str();
+ }
+
+ /**
+  * Reads one token: if it starts with a quote character, the token ends at the matching quote, otherwise at the given delimiter.
+  *
+  * @param until delimiter of an unquoted token
+  * @param quote optional output: the quote character that was used, or 0 for an unquoted token
+  * @param found optional output: whether the terminating character (quote or delimiter) was found
+  */
+ std::string readToken(char until, char* quote = nullptr, bool* found = nullptr) {
+     const char first = peek();
+     const bool quoted = isQuote(first);
+     if (quote) *quote = quoted ? first : 0;
+     // for a quoted token, get() consumes the opening quote and its value becomes the delimiter
+     return quoted ? readUntil(get(), found) : readUntil(until, found);
+ }
+
+ /**
+  * Like readToken(), but for a quoted token it also consumes the whitespace and the terminator that must follow the closing quote
+  * (for an unquoted token, the terminator was already consumed while reading it).
+  *
+  * @param until the terminator character
+  * @param quote optional output: the quote character that was used, or 0 for an unquoted token
+  * @param found optional output, see readToken()
+  * @throws std::logic_error if a quoted token is not followed by the terminator
+  */
+ std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) {
+     char usedQuote = 0; // local copy: the „quote“ parameter is optional and must not be dereferenced when it is null
+     std::string result = readToken(until, &usedQuote, found);
+     if (quote) *quote = usedQuote;
+     if (usedQuote) {
+         readAllWhitespace();
+         if (get() != until) throw std::logic_error(std::string("missing „") + std::string(1, until) + "“ after quoted section name");
+     }
+     return result;
+ }
+
+ /** Tells whether the character starts a comment (# or ;). */
+ bool isComment(char ch) {
+     switch (ch) {
+         case '#':
+         case ';':
+             return true;
+         default:
+             return false;
+     }
+ }
+
+ /** Tells whether the character is one of the supported quote characters (" or '). */
+ bool isQuote(char ch) {
+     switch (ch) {
+         case '"':
+         case '\'':
+             return true;
+         default:
+             return false;
+     }
+ }
+
+ /**
+  * Returns the string without its leading and trailing whitespace (space, \t, \n, \v, \f, \r).
+  * Implemented without std::regex: this method is called for every unquoted key and value.
+  */
+ std::string trim(std::string s) {
+     const char* const whitespace = " \t\n\v\f\r";
+     const std::string::size_type first = s.find_first_not_of(whitespace);
+     if (first == std::string::npos) return ""; // empty or whitespace-only
+     const std::string::size_type last = s.find_last_not_of(whitespace);
+     return s.substr(first, last - first + 1);
+ }
+
public:
INIReaderImpl(std::istream& input) : input(input) {
@@ -34,124 +190,144 @@
}
+ /**
+ * Parses the whole input and notifies all registered handlers: startDocument() first, then the whitespace, comment, section and entry events in the order of their occurrence,
+ * endSection() for the last open section (if any) and endDocument() at the end.
+ * The lineNumber of whitespace and entry events is the line on which they start.
+ * The last line of the input does not have to be terminated by a line break.
+ *
+ * @throws std::logic_error if the input is malformed (missing =, ] or closing quote, unexpected content after a section header or after a quoted value)
+ */
void process() override {
-
for (INIContentHandler* handler : handlers) handler->startDocument();
- std::regex whitespacePattrern("\\s*");
- std::regex commentPattrern("\\s*(;|#)\\s*(.*)");
- std::regex sectionPattrern("\\s*\\[\\s*([^\\]]+)\\s*\\]\\s*(\\[\\s*([^\\]]+)\\s*\\])?\\s*((;|#)\\s*(.*))?");
- std::regex entryQuotedPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(\"|')((?:(?!\\5).)*)(\\5)?\\s*((;|#)\\s*(.*))?");
- std::regex entryPlainPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(.*?)\\s*");
+ bool inSection = false;
+
+ while (input.good()) { // TODO: condition
+ {
+ const int whitespaceLineNumber = lineNumber; // remember where the whitespace starts: reading it may move lineNumber forward
+ std::string whitespace = readAllWhitespace();
+ if (whitespace.size()) {
+ INIContentHandler::WhitespaceEvent event;
+ event.lineNumber = whitespaceLineNumber;
+ event.eventNumber = ++eventNumber;
+ event.whitespace = whitespace;
+ for (INIContentHandler* handler : handlers) handler->whitespace(event);
+ }
+ }
- std::smatch match;
- bool inSection = false;
- std::string line;
- int lineNumber = 0;
- int eventNumber = 0;
+ bool found;
+ char quote;
+
+ char ch = peek();
-
- while (std::getline(input, line)) {
- lineNumber++;
-
- if (std::regex_match(line, match, whitespacePattrern)) {
- INIContentHandler::WhitespaceEvent event;
+ if (!input.good()) { // end of input; the stream state is tested because a char cannot be reliably compared with EOF (unsigned char platforms, byte 0xFF)
+ break;
+ } else if (ch == '[') {
+ if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
+ inSection = true;
+ get();
+ readAllWhitespace();
+ INIContentHandler::SectionStartEvent event;
event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
- event.whitespace = match[0];
- for (INIContentHandler* handler : handlers) handler->whitespace(event);
- } else if (std::regex_match(line, match, commentPattrern)) {
+ event.name = readTokenAndEatTerminator(']', &quote, &found);
+ if (!found) throw std::logic_error(std::string("missing ] after the section name: '") + event.name + "'");
+
+ readSpacesAndTabs();
+ if (allowSectionTags && peek() == '[') {
+ get();
+ event.tag = readTokenAndEatTerminator(']', &quote, &found);
+ if (!found) throw std::logic_error(std::string("missing ] after the section tag: '") + event.tag + "'");
+ }
+
+ readSpacesAndTabs();
+ ch = peek();
+ if (isComment(ch)) {
+ get();
+ readSpacesAndTabs();
+ event.comment = readUntil('\n', &found);
+ } else if (ch == '\n') {
+ get();
+ } else if (input.good()) { // end of input right after the section header is valid (last line without a line break)
+ throw std::logic_error(std::string("unexpected content after the section: '") + event.name + "'");
+ }
+
+ for (INIContentHandler* handler : handlers) handler->startSection(event);
+ } else if (isComment(ch)) {
+ get();
+ readSpacesAndTabs();
INIContentHandler::CommentEvent event;
event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
- event.comment = match[2];
+ event.comment = readUntil('\n', &found);
for (INIContentHandler* handler : handlers) handler->comment(event);
- } else if (std::regex_match(line, match, sectionPattrern)) {
- if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
- inSection = true;
- INIContentHandler::SectionStartEvent event;
- event.lineNumber = lineNumber;
- event.eventNumber = ++eventNumber;
- event.name = match[1];
- event.comment = match[6];
- // event.tag = match[3];
- // KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
- // see <https://userbase.kde.org/KDE_System_Administration/Configuration_Files>, „[$i]“ means that the section is „locked“
- // We may emit this information somehow later, but for now, it is just ignored.
- for (INIContentHandler* handler : handlers) handler->startSection(event);
- } else if (std::regex_match(line, match, entryQuotedPattrern)) {
+ } else {
+ const int entryLineNumber = lineNumber; // remember where the entry starts: reading the key and the value moves lineNumber forward
+ std::string fullKey = readToken('=', &quote, &found);
+ if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'");
+ if (!quote) fullKey = trim(fullKey);
+ readSpacesAndTabs();
+
+ if (quote) {
+ ch = get();
+ if (ch == '=') readSpacesAndTabs();
+ else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'");
+ }
+
+ std::string value = readToken('\n', &quote, &found);
+ if (quote && !found) throw std::logic_error(std::string("missing closing quote of the value: key='") + fullKey + "' value='" + value + "'");
+ if (!quote) value = trim(value);
+
INIContentHandler::EntryEvent event;
event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
- event.key = match[2];
- event.subKey = match[4];
- event.fullKey = match[1];
- event.value = match[6];
- event.comment = match[10];
+ event.lineNumber = entryLineNumber; // report the first line of the entry (lineNumber already points behind the value)
+ event.key = fullKey;
+ event.fullKey = fullKey;
+ event.value = value;
- // the "/' at the end is missing → line continues
- if (match.length(7) == 0) {
- std::regex endPattern(std::string("(.*?)") + (match[5] == "'" ? "'" : "\"") + "\\s*((;|#)\\s*(.*))?");
- while (std::getline(input, line)) {
- lineNumber++;
- event.value += "\n";
- if (std::regex_match(line, match, endPattern)) {
- event.value += std::string(match[1]);
- event.comment = match[4];
- break;
- } else {
- event.value += line;
- }
+ if (allowSubKeys) {
+ std::smatch match;
+ if (std::regex_match(fullKey, match, std::regex("([^\\[]+)\\[([^\\[]+)\\]"))) {
+ event.key = match[1];
+ event.subKey = match[2];
+ event.fullKey = fullKey;
+ }
+ }
+
+ if (quote) {
+ readSpacesAndTabs();
+ ch = peek();
+ if (isComment(ch)) {
+ get();
+ readSpacesAndTabs();
+ event.comment = readUntil('\n', &found);
+ } else if (ch == '\n') {
+ get();
+ } else if (input.good()) { // end of input right after the closing quote is valid (last line without a line break)
+ throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'");
}
}
for (INIContentHandler* handler : handlers) handler->entry(event);
- } else if (std::regex_match(line, match, entryPlainPattrern)) {
- INIContentHandler::EntryEvent event;
- event.lineNumber = lineNumber;
- event.eventNumber = ++eventNumber;
- event.key = match[2];
- event.subKey = match[4];
- event.fullKey = match[1];
- event.value = match[5];
-
- // the \ at the end → line continues
- while (line.back() == '\\' && std::getline(input, line)) {
- lineNumber++;
- line = std::regex_replace(line, std::regex("^\\s+|\\s+$"), ""); // trim the spaces: continuing lines might be aligned to the first line (desired spaces – if any – should be at the line end before the \ character)
- event.value = event.value.substr(0, event.value.size() - 1); // cut the trailing \ backslash
- event.value = event.value + line;
- }
-
- for (INIContentHandler* handler : handlers) handler->entry(event);
- } else {
- // TODO: warning, error, or support unknown content
}
-
- // General feautres:
- // TODO: probably switch to state-machine approach instead of regular expressions or use an existing library
- // TODO: warning/error handler
- // TODO: support also quoted or multiline keys?
- // TODO: support also escaped characters
- // TODO: support also Java .properties and manifest.mf formats?
- // TODO: support also quoted sections ["qoted section"] – useful for hierarchy (the path element may contain the separator character)
- // TODO: support also nested sections – hierarchy
- // TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections
- // TODO: support also option for alternative key-value separator (: instead of =)
- // TODO: support also other encodings (currently only UTF-8 is supported)
-
- // Lossless conversions:
- // TODO: emit also the quote style ('/"/)
- // TODO: emit also the comment style (;/#) ?
- // TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ?
- // TODO: emit also the line-end type (LF/CRLF) ?
}
-
+ // TODO: error at the end, catch premature/unexpected EOF
+ // TODO: unescape + trim values + ignore \r
+ // TODO: count lines
if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
-
for (INIContentHandler* handler : handlers) handler->endDocument();
}
+
+ // General features:
+ // TODO: warning/error handler
+ // TODO: support also escaped characters
+ // TODO: support also Java .properties and manifest.mf formats?
+ // TODO: support also nested sections – hierarchy
+ // TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections
+ // TODO: support also option for alternative key-value separator (: instead of =)
+ // TODO: support also other encodings (currently only UTF-8 is supported)
+ // TODO: better exceptions
+
+ // Lossless conversions:
+ // TODO: emit also the quote style ('/"/)
+ // TODO: emit also the comment style (;/#) ?
+ // TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ?
+ // TODO: emit also the line-end type (LF/CRLF) ?
+
};
INIReader* INIReader::create(std::istream& input) {
return new INIReaderImpl(input);
}
+
+}
+}
+}
+}
--- a/src/lib/INIReader.h Mon Nov 23 21:09:46 2020 +0100
+++ b/src/lib/INIReader.h Wed Nov 25 21:50:26 2020 +0100
@@ -21,6 +21,11 @@
#include "INIContentHandler.h"
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
/**
* TODO: Files in the src/lib directory will be moved to alt2xml and used as a shared library.
*/
@@ -31,3 +36,8 @@
virtual void process() = 0;
static INIReader* create(std::istream& input);
};
+
+}
+}
+}
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/UnescapingINIHandler.h Wed Nov 25 21:50:26 2020 +0100
@@ -0,0 +1,99 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+
+#include "INIReader.h"
+
+using namespace std;
+using namespace relpipe::writer;
+
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
+class UnescapingINIContentHandler : public INIContentHandler { // wrapper: unescapes section names, keys and values, then forwards every event to „output“; the escape syntax is defined by subclasses in unescape()
+private:
+ INIContentHandler& output; // target handler that receives the forwarded events
+
+protected:
+ const char ESC = '\\'; // the escape character (backslash)
+ bool lastEscaphingPhase; // see the constructor documentation
+ // Appends ch to result and increments i by one; returns result, so further characters can be appended by chaining.
+ std::stringstream& put(std::stringstream& result, const char& ch, int& i) {
+ result.put(ch);
+ i++;
+ return result;
+ }
+ // Returns the unescaped form of s; which escape sequences are supported is defined by the subclass.
+ virtual std::string unescape(const std::string& s) = 0;
+
+public:
+
+ /**
+ * @param output here will be sent events with unescaped values
+ * @param lastEscaphingPhase instances of UnescapingINIContentHandler might be chained:
+ * unsupported escaping sequences are kept untouched to be processed in further phases;
+ * in the last phase, all remaining sequences (including \\) must be recognized and unescaped
+ * (otherwise the input is considered invalid and an exception is thrown)
+ */
+ UnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : output(output), lastEscaphingPhase(lastEscaphingPhase) {
+ }
+
+ void startDocument() override {
+ output.startDocument();
+ }
+
+ void endDocument() override {
+ output.endDocument();
+ }
+ // Only the section name is unescaped; the other fields are forwarded as they are. NOTE(review): should the section tag be unescaped too?
+ void startSection(const SectionStartEvent& event) override {
+ SectionStartEvent e = event;
+ e.name = unescape(e.name);
+ output.startSection(e);
+ }
+
+ void endSection() override {
+ output.endSection();
+ }
+ // The key, the full key, the sub-key and the value are each unescaped separately; the other fields are forwarded as they are.
+ void entry(const EntryEvent& event) override {
+ EntryEvent e = event;
+ e.key = unescape(e.key);
+ e.fullKey = unescape(e.fullKey);
+ e.subKey = unescape(e.subKey);
+ e.value = unescape(e.value);
+ output.entry(e);
+ }
+ // Comments are forwarded without any unescaping.
+ void comment(const CommentEvent& event) override {
+ output.comment(event);
+ }
+ // Whitespace is forwarded without any unescaping.
+ void whitespace(const WhitespaceEvent& event) override {
+ output.whitespace(event);
+ }
+
+};
+
+}
+}
+}
+}