--- a/src/XMLDocumentConstructor.h Thu Nov 26 11:42:26 2020 +0100
+++ b/src/XMLDocumentConstructor.h Sat Nov 28 18:14:15 2020 +0100
@@ -20,8 +20,9 @@
#include <libxml++-2.6/libxml++/libxml++.h>
#include "lib/INIReader.h"
-#include "lib/BasicUnescapingINIHandler.h"
-#include "lib/JavaPropertiesUnescapingINIHandler.h"
+#include "lib/BasicUnescapingProcessor.h"
+#include "lib/BackspaceUnescapingProcessor.h"
+#include "lib/JavaPropertiesUnescapingProcessor.h"
#include "lib/XMLNameCodec.h"
using namespace relpipe::in::ini::lib;
@@ -111,9 +112,14 @@
void process() {
HierarchicalINIContentHandler handler(parser);
std::shared_ptr<INIReader> reader(INIReader::create(*input));
- BasicUnescapingINIContentHandler unescapingHandler(handler, false);
- JavaPropertiesUnescapingINIContentHandler javaHandler(handler, true);
- reader->addHandler(&javaHandler);
+ reader->addUnescapingProcessor(std::make_shared<BasicUnescapingProcessor>(), "unescape-basic", true); // arguments: processor, option name used to switch it on/off, enabled by default
+ reader->addUnescapingProcessor(std::make_shared<JavaPropertiesUnescapingProcessor>(), "unescape-java-properties", false); // \uXXXX sequences; disabled unless switched on by an option
+ reader->addUnescapingProcessor(std::make_shared<BackspaceUnescapingProcessor>(false), "unescape-backspace-disorder", false); // non-final variant (lastEscaphingPhase = false); disabled by default
+ reader->addUnescapingProcessor(std::make_shared<BackspaceUnescapingProcessor>(), "unescape-backspace", true); // final variant (default lastEscaphingPhase = true), registered last; enabled by default
+ reader->addHandler(&handler); // the hierarchical handler is now attached to the reader directly, unescaping is done by the processors above
+ // TODO: smart pointers vs. references: are we going to call addUnescapingProcessor() dynamically/conditionally or share instances? Then pointers will be better.
+ // TODO: call setOption() according to the configuration
+ // for (ParserOptionRecipe option : configuration.parserOptions) reader->setOption(convertor.to_bytes(option.uri), convertor.to_bytes(option.value));
reader->process();
}
};
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/BackspaceUnescapingProcessor.h Sat Nov 28 18:14:15 2020 +0100
@@ -0,0 +1,70 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+
+#include "UnescapingProcessor.h"
+
+using namespace std;
+using namespace relpipe::writer;
+
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
+class BackspaceUnescapingProcessor : public UnescapingProcessor { // unescapes a doubled escape character (ESC ESC) to a single one; NOTE(review): "Backspace" presumably means "backslash" - confirm
+private:
+ const bool lastEscaphingPhase = true; // true: any other escape sequence is rejected; false: it is passed through unchanged
+public:
+
+ std::string unescape(const std::string& s, const TextType type) override { // returns the unescaped copy of s; the text type is not used by this processor
+ std::stringstream result;
+ for (int i = 0, length = s.size(); i < length; i++) {
+ char ch = s[i];
+ if (i + 1 < length && ch == ESC) { // escape character followed by at least one more character
+ ch = s[i + 1]; // the escaped character
+ if (ch == ESC) put(result, ESC, i); // unescape \\ to \. (put() is inherited; presumably it also advances i past the second character - confirm in UnescapingProcessor.h)
+ else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch);
+ else result.put(ESC); // keep the escape sequence for later unescaping phase
+ } else if (ch == ESC) { // escape character at the very end of the text
+ throw std::logic_error(std::string("Missing escape sequence")); // this should not happen
+ } else {
+ result.put(ch); // ordinary character, copied as is
+ }
+ }
+ return result.str();
+ }
+
+ /**
+ * @param lastEscaphingPhase whether this is the final unescaping stage.
+ * By default it is set to true, thus no unrecognized escape sequences may be left after this stage.
+ * Setting this to false is dangerous and may lead to errors and ambiguous behavior.
+ * It should be used only as a last resort.
+ * Because both "\\ \xxx" and "\ \xxx" will be converted to "\ \xxx" and the information will be lost.
+ * So, it is usually better to keep the "\" escaped as "\\" and process both the escaped backslashes and unrecognized escape sequences later.
+ */
+ BackspaceUnescapingProcessor(bool lastEscaphingPhase = true) : lastEscaphingPhase(lastEscaphingPhase) {
+ }
+
+};
+
+}
+}
+}
+}
--- a/src/lib/BasicUnescapingINIHandler.h Thu Nov 26 11:42:26 2020 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,74 +0,0 @@
-/**
- * Relational pipes
- * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-#pragma once
-
-#include <sstream>
-
-#include "UnescapingINIHandler.h"
-
-using namespace std;
-using namespace relpipe::writer;
-
-namespace relpipe {
-namespace in {
-namespace ini {
-namespace lib {
-
-class BasicUnescapingINIContentHandler : public UnescapingINIContentHandler {
-protected:
-
- virtual std::string unescape(const std::string& s) {
- std::stringstream result;
- for (int i = 0, length = s.size(); i < length; i++) {
- char ch = s[i];
- if (i + 1 < length && ch == ESC) {
- ch = s[i + 1];
- if (ch == 'n') put(result, '\n', i);
- else if (ch == 'r') put(result, '\r', i);
- else if (ch == 't') put(result, '\t', i);
- else if (ch == 's') put(result, ' ', i); // TODO: Reconsider what is „basic“ escaping and should be supported.
- else if (ch == '"') put(result, ch, i); // The delimiters (\n,]",') are already unescaped during the first stage in the INIReader while parsing (the delimiter relevant to given environment is unescaped, e.g. \" in "quoted" value).
- else if (ch == '\'') put(result, ch, i); // So it does not necessary to do it here. But someone might write a="xxx\'zzz" however it is superfluous because a="xxx'zzz" will also work.
- else if (ch == ']') put(result, ch, i);
- else if (ch == ':') put(result, ch, i);
- else if (ch == ';') put(result, ch, i);
- else if (ch == '#') put(result, ch, i);
- else if (ch == '=') put(result, ch, i);
- else if (ch == ESC && !lastEscaphingPhase) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle
- else if (ch == ESC && lastEscaphingPhase) put(result, ESC, i); // unescape \\ to \.
- else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch);
- else result.put(ESC); // keep the escape sequence for later unescaping phase
- } else if (ch == ESC) {
- throw std::logic_error(std::string("Missing escape sequence")); // this should not happen
- } else {
- result.put(ch);
- }
- }
- return result.str();
- }
-
-public:
-
- BasicUnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : UnescapingINIContentHandler(output, lastEscaphingPhase) {
- }
-
-};
-
-}
-}
-}
-}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/BasicUnescapingProcessor.h Sat Nov 28 18:14:15 2020 +0100
@@ -0,0 +1,67 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+
+#include "UnescapingProcessor.h"
+
+using namespace std;
+using namespace relpipe::writer;
+
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
+class BasicUnescapingProcessor : public UnescapingProcessor { // unescapes the common sequences (\n \r \t \s and escaped delimiters); everything else is left for later phases
+public:
+
+ std::string unescape(const std::string& s, const TextType type) override { // returns the unescaped copy of s; the text type is not used by this processor
+ std::stringstream result;
+ for (int i = 0, length = s.size(); i < length; i++) {
+ char ch = s[i];
+ if (i + 1 < length && ch == ESC) { // escape character followed by at least one more character
+ ch = s[i + 1]; // the escaped character
+ if (ch == 'n') put(result, '\n', i);
+ else if (ch == 'r') put(result, '\r', i);
+ else if (ch == 't') put(result, '\t', i);
+ else if (ch == 's') put(result, ' ', i); // TODO: Reconsider what is "basic" escaping and should be supported.
+ else if (ch == '"') put(result, ch, i); // The delimiters (\n,]",') are already unescaped during the first stage in the INIReader while parsing (the delimiter relevant to given environment is unescaped, e.g. \" in "quoted" value).
+ else if (ch == '\'') put(result, ch, i); // So it is not necessary to do it here. But someone might write a="xxx\'zzz" however it is superfluous because a="xxx'zzz" will also work.
+ else if (ch == ']') put(result, ch, i);
+ else if (ch == ':') put(result, ch, i);
+ else if (ch == ';') put(result, ch, i);
+ else if (ch == '#') put(result, ch, i);
+ else if (ch == '=') put(result, ch, i);
+ else if (ch == ESC) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle
+ else result.put(ESC); // keep the escape sequence for later unescaping phase
+ } else if (ch == ESC) { // escape character at the very end of the text
+ throw std::logic_error(std::string("Missing escape sequence")); // this should not happen
+ } else {
+ result.put(ch); // ordinary character, copied as is
+ }
+ }
+ return result.str();
+ }
+
+};
+
+}
+}
+}
+}
--- a/src/lib/INIReader.cpp Thu Nov 26 11:42:26 2020 +0100
+++ b/src/lib/INIReader.cpp Sat Nov 28 18:14:15 2020 +0100
@@ -32,21 +32,47 @@
std::istream& input;
std::vector<INIContentHandler*> handlers;
+ class ConfiguredUnescapingProcessor { // one registered unescaping processor together with its option name and on/off state
+ public:
+ std::shared_ptr<UnescapingProcessor> processor; // the processor that does the actual unescaping
+ const std::string uri; // option name that enables/disables this processor (compared in setUnescaping())
+ bool enbaled; // whether the processor is applied; NOTE(review): misspelling of "enabled", used under this name elsewhere in the class
+
+ ConfiguredUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enbaled) : processor(processor), uri(uri), enbaled(enbaled) {
+ }
+
+ };
+
+ std::vector<ConfiguredUnescapingProcessor> unescapingProcessors;
+
/**
- * This might be configurable.
- *
* By default, we ignore all leading whitespace on continuing lines.
* If there should be some spaces or tabs, they should be placed on the previous line before the „\“.
* If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '.
*
+ * TODO: several options:
+ * - enabled, disabled
+ * - if disabled, then: keep backslash, trim backslash, escape backslash
+ * (keep requires support in some further unescaping phase, or it will cause an error)
+ * - keep or trim the line end
+ * - keep or trim the leading spaces
+ * - allow comments interleaved with continuing lines (the freaky systemd syntax)
+ *
* Related specifications:
* - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html
+ * - https://www.freedesktop.org/software/systemd/man/systemd.syntax.html
*/
- bool consumeLeadingSpacesOnContinuingLines = true;
+ bool trimLeadingSpacesOnContinuingLines = true;
+
/**
- * This might be configurable.
- *
+ * Some dialects (or configuration file formats in general) do not support sections.
+ * Then a line, that looks like an INI section, should be interpreted as a key
+ * (or error, if does not have a proper key-value separator).
+ */
+ bool allowSections = true;
+
+ /**
* KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
* Line „[section_1][$i]“ means that the „section_1“ is „locked“.
* We may emit this information somehow later, but for now, it is just ignored.
@@ -59,8 +85,6 @@
bool allowSectionTags = true;
/**
- * This might be configurable.
- *
* If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key.
* No \[ escaping is currently supported, so the key might not contain the bracket character.
*
@@ -70,6 +94,35 @@
*/
bool allowSubKeys = true;
+ /**
+ * Classic INI uses „key=value“ syntax.
+ * But some other formats/dialects might use key:value.
+ *
+ * Only single character separators are supported.
+ * If multiple separators should be recognized (e.g. both „=“ and „:“), this string will contain all of them,
+ * i.e. „:=“ does not mean the „key:=value“ syntax, but either „key=value“ or „key:value“.
+ */
+ std::string keyValueSeparators = "=";
+
+ /**
+ * Classic INI uses „; comment“ syntax.
+ * But many existing files contain „# comment“ lines.
+ *
+ * Only single character separators are supported (works same as keyValueSeparators).
+ */
+ std::string commentSeparators = ";#";
+
+ /**
+ * INI files often support both the "quotes" and the 'apostrophes' style.
+ * But some dialects may support only one of them or not support quoting at all.
+ *
+ * In such a case, e.g. „key="some value"“ would mean that the value is „"some value"“ (including the quotes).
+ * Thus it is important to allow disabling quote recognizing (which is done by setting this parameter to empty string).
+ *
+ * Only single character quotes are supported (works same as keyValueSeparators).
+ */
+ std::string quotes = "\"'";
+
int lineNumber = 1;
int eventNumber = 0;
@@ -111,30 +164,35 @@
}
void processContinuingLine(std::stringstream& result) {
- if (consumeLeadingSpacesOnContinuingLines) readSpacesAndTabs();
+ if (trimLeadingSpacesOnContinuingLines) readSpacesAndTabs(); // trimming enabled: the escaped line end is dropped and the leading spaces/tabs of the continuing line are consumed; otherwise the line end is kept (next line)
else result.put('\n');
}
- std::string readUntil(char until, bool* found = nullptr) {
+ std::string readUntil(const char until, bool* found = nullptr) { // convenience overload for a single terminator character
+ return readUntil(std::string(1, until), found); // delegates to the overload that accepts a set of terminator characters
+ }
+
+ std::string readUntil(const std::string& until, bool* found = nullptr) {
std::stringstream result;
- for (char ch = peek(); input.good() && ch != until; ch = peek()) {
+ for (char ch = peek(); input.good() && !oneOf(ch, until); ch = peek()) {
if (ch == '\\') {
get();
ch = get();
- if (ch == until && ch == '\n') processContinuingLine(result);
- else if (ch == until) result.put(ch);
+ if (oneOf(ch, until) && ch == '\n') processContinuingLine(result);
+ else if (oneOf(ch, until)) result.put(ch);
else if (ch == std::istream::traits_type::eof()) break;
else result.put('\\').put(ch);
- // TODO: two-stage and modular unescaping: here unescape only \+LF or more genereally: unescape only the until character and rest leave untouched
- // second escaping stage move to separate class/wrapper (similar to hierarchical wrappers)
+ // unescaping is done in multiple phases:
+ // here we unescape only the escaped terminator characters (an escaped LF terminator continues the line)
+ // other escape sequences are left untouched and will be processed in later phases, see UnescapingProcessor
} else {
ch = get();
result.put(ch);
}
}
- if (peek() == until) {
+ if (oneOf(peek(), until)) {
get();
if (found) *found = true;
} else {
@@ -144,13 +202,17 @@
return result.str();
}
- std::string readToken(char until, char* quote = nullptr, bool* found = nullptr) {
+ std::string readToken(const char until, char* quote = nullptr, bool* found = nullptr) { // convenience overload for a single terminator character
+ return readToken(std::string(1, until), quote, found); // delegates to the overload that accepts a set of terminator characters
+ }
+
+ std::string readToken(const std::string& until, char* quote = nullptr, bool* found = nullptr) {
std::string result;
char ch = peek();
if (isQuote(ch)) {
if (quote) *quote = ch;
- result = readUntil(get(), found);
+ result = readUntil(std::string(1, get()), found);
} else {
if (quote) *quote = 0;
result = readUntil(until, found);
@@ -160,35 +222,107 @@
}
std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) {
+ return readTokenAndEatTerminator(std::string(1, until), quote, found); // single-character convenience form: delegates to the overload that accepts a set of terminator characters
+ }
+
+ std::string readTokenAndEatTerminator(const std::string& until, char* quote = nullptr, bool* found = nullptr) { // 'until' is a set of terminator characters; NOTE(review): 'quote' defaults to nullptr but is dereferenced below without a check, so callers must pass a valid pointer
std::string result = readToken(until, quote, found);
if (*quote) {
readAllWhitespace();
- if (get() != until) throw std::logic_error(std::string("missing „") + std::string(1, until) + "“ after quoted section name");
+ if (!oneOf(get(), until)) throw std::logic_error(std::string("missing „") + until + "“ after quoted section name"); // after a quoted token the next non-whitespace character must be one of the terminators; the message lists all of them
}
return result;
}
+ std::string unescape(const std::string& value, UnescapingProcessor::TextType type) { // applies every enabled unescaping processor to the value, in registration order; 'type' is passed through to the processors
+ std::string result = value;
+ for (const ConfiguredUnescapingProcessor& p : unescapingProcessors) if (p.enbaled) result = p.processor->unescape(result, type); // iterate by const reference: a by-value loop variable copied the shared_ptr and the uri string for every processor on every call
+ return result;
+ }
+
bool isComment(char ch) {
- return ch == '#' || ch == ';';
+ return oneOf(ch, commentSeparators); // the comment characters are configurable now (option "comment-separators" or a dialect)
}
bool isQuote(char ch) {
- return ch == '"' || ch == '\'';
+ return oneOf(ch, quotes); // the quote characters are configurable now (option "quotes" or a dialect); an empty set disables quoting
+ }
+
+ /**
+ * @param ch character to be evaluated
+ * @param options list of options (characters)
+ * @return whether ch is one of options
+ */
+ bool oneOf(char ch, const std::string& options) { // each character of 'options' is one allowed value
+ return options.find(ch) != std::string::npos; // an empty 'options' string never matches
+ }
std::string trim(std::string s) {
return std::regex_replace(s, std::regex("^\\s+|\\s+$"), "");
}
+ /**
+ * TODO: use a common method
+ */
+ bool parseBoolean(const std::string& value) { // accepts exactly "true" or "false" (case-sensitive)
+ if (value == "true") return true;
+ else if (value == "false") return false;
+ else throw std::invalid_argument(std::string("Unable to parse boolean value: ") + value + " (expecting true or false)"); // any other text is rejected
+ }
+
+ void setDialect(const std::string& name) { // applies a named preset of parser options; throws std::invalid_argument for an unknown name
+ if (name == "default-ini") {
+ // already set (nothing is reset here, so options changed earlier stay as they are)
+ } else if (name == "java-properties") {
+ trimLeadingSpacesOnContinuingLines = true;
+ allowSections = false; // a line starting with [ is then parsed as an entry
+ allowSectionTags = false;
+ allowSubKeys = false;
+ commentSeparators = "#";
+ keyValueSeparators = "=:"; // either character separates the key from the value
+ quotes = ""; // empty set: quotes are not recognized and stay part of the value
+ // TODO: enable unicode unescaping
+ } else {
+ throw std::invalid_argument(std::string("Unsupported INI dialect: ") + name);
+ }
+ }
+
+ bool setUnescaping(const std::string& uri, const std::string& value) { // enables/disables the registered processor whose option name equals 'uri'
+ for (ConfiguredUnescapingProcessor& p : unescapingProcessors) {
+ if (p.uri == uri) {
+ p.enbaled = parseBoolean(value); // throws std::invalid_argument if the value is neither "true" nor "false"
+ return true; // only the first matching processor is updated
+ }
+ }
+ return false; // no processor is registered under this name
+ }
+
public:
INIReaderImpl(std::istream& input) : input(input) {
}
+ void setOption(const std::string& uri, const std::string& value) override { // sets one parser option by name; boolean options accept only "true"/"false"
+ if (uri == "trim-continuing-lines") trimLeadingSpacesOnContinuingLines = parseBoolean(value); // TODO: continuing lines modes (enum), not just boolean
+ else if (uri == "allow-sections") allowSections = parseBoolean(value);
+ else if (uri == "allow-section-tags") allowSectionTags = parseBoolean(value);
+ else if (uri == "allow-sub-keys") allowSubKeys = parseBoolean(value);
+ else if (uri == "comment-separators") commentSeparators = value; // each character of the value is one separator
+ else if (uri == "key-value-separators") keyValueSeparators = value;
+ else if (uri == "quotes") quotes = value;
+ else if (uri == "dialect") setDialect(value); // preset that sets several of the options above at once
+ else if (setUnescaping(uri, value)); // intentionally empty statement: the call itself applied the value when it returned true
+ else throw std::invalid_argument(std::string("Invalid parser option: „") + uri + "“ with value: „" + value + "“"); // neither a built-in option nor the name of a registered unescaping processor
+ }
+
void addHandler(INIContentHandler* handler) override {
handlers.push_back(handler);
}
+ void addUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enabledByDefault) override { // registers a processor under the option name 'uri'
+ unescapingProcessors.push_back({processor, uri, enabledByDefault}); // appended at the end: processors are applied in the order of registration
+ }
+
void process() override {
for (INIContentHandler* handler : handlers) handler->startDocument();
@@ -196,10 +330,10 @@
while (input.good()) { // TODO: condition
{
+ INIContentHandler::WhitespaceEvent event;
+ event.lineNumber = lineNumber;
std::string whitespace = readAllWhitespace();
if (whitespace.size()) {
- INIContentHandler::WhitespaceEvent event;
- event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
event.whitespace = whitespace;
for (INIContentHandler* handler : handlers) handler->whitespace(event);
@@ -213,20 +347,23 @@
if (ch == std::istream::traits_type::eof()) {
break;
- } else if (ch == '[') {
+ } else if (ch == '[' && allowSections) {
if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
inSection = true;
- get();
- readAllWhitespace();
INIContentHandler::SectionStartEvent event;
event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
+ get();
+ readAllWhitespace();
event.name = readTokenAndEatTerminator(']', "e, &found);
+ if (!quote) event.name = trim(event.name);
+ event.name = unescape(event.name, UnescapingProcessor::TextType::SectionName);
readSpacesAndTabs();
if (allowSectionTags && peek() == '[') {
get();
event.tag = readTokenAndEatTerminator(']', "e, &found);
+ event.tag = unescape(event.tag, UnescapingProcessor::TextType::SectionTag);
}
readSpacesAndTabs();
@@ -235,6 +372,7 @@
get();
readSpacesAndTabs();
event.comment = readUntil('\n', &found);
+ event.comment = unescape(event.comment, UnescapingProcessor::TextType::SectionComment);
} else if (ch == '\n') {
get();
} else {
@@ -243,31 +381,33 @@
for (INIContentHandler* handler : handlers) handler->startSection(event);
} else if (isComment(ch)) {
- get();
- readSpacesAndTabs();
INIContentHandler::CommentEvent event;
event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
+ get();
+ readSpacesAndTabs();
event.comment = readUntil('\n', &found);
+ event.comment = unescape(event.comment, UnescapingProcessor::TextType::Comment);
for (INIContentHandler* handler : handlers) handler->comment(event);
} else {
- std::string fullKey = readToken('=', "e, &found);
+ INIContentHandler::EntryEvent event;
+ event.lineNumber = lineNumber;
+ event.eventNumber = ++eventNumber;
+
+ std::string fullKey = readToken(keyValueSeparators, "e, &found);
if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'");
if (!quote) fullKey = trim(fullKey);
readSpacesAndTabs();
if (quote) {
ch = get();
- if (ch == '=') readSpacesAndTabs();
+ if (oneOf(ch, keyValueSeparators)) readSpacesAndTabs();
else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'");
}
std::string value = readToken('\n', "e, &found);
if (!quote) value = trim(value);
- INIContentHandler::EntryEvent event;
- event.lineNumber = lineNumber;
- event.eventNumber = ++eventNumber;
event.key = fullKey;
event.fullKey = fullKey;
event.value = value;
@@ -278,9 +418,14 @@
event.key = match[1];
event.subKey = match[2];
event.fullKey = fullKey;
+ event.subKey = unescape(event.subKey, UnescapingProcessor::TextType::EntryKey);
}
}
+ event.key = unescape(event.key, UnescapingProcessor::TextType::EntryKey);
+ event.fullKey = unescape(event.fullKey, UnescapingProcessor::TextType::EntryKey);
+ event.value = unescape(event.value, UnescapingProcessor::TextType::EntryValue);
+
if (quote) {
readSpacesAndTabs();
ch = peek();
@@ -288,9 +433,15 @@
get();
readSpacesAndTabs();
event.comment = readUntil('\n', &found);
+ event.comment = unescape(event.comment, UnescapingProcessor::TextType::EntryComment);
} else if (ch == '\n') {
get();
} else {
+ // TODO: optional support for multiple tokens in a single entry?
+ // modes: array, concatenate
+ // some-array-1 = "item 1" "item 2" 'item 3' item 4
+ // some-array-2 = "item 1" "item 2" 'item 3' item_4 item_5
+ // some-bash-style-string-value = "this "will' be' concatenated → this will be concatenated
throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'");
}
}
--- a/src/lib/INIReader.h Thu Nov 26 11:42:26 2020 +0100
+++ b/src/lib/INIReader.h Sat Nov 28 18:14:15 2020 +0100
@@ -20,6 +20,7 @@
#include <istream>
#include "INIContentHandler.h"
+#include "UnescapingProcessor.h"
namespace relpipe {
namespace in {
@@ -32,7 +33,20 @@
class INIReader {
public:
virtual ~INIReader() = default;
+ /**
+ * Sets a parser option identified by its name (uri); the value is always passed as text.
+ * TODO: after moving to alt2xml: options will be identified by globally unique URI/IRI,
+ * parsers will provide a catalog of supported options (names, enum values, documentation)
+ * and options will serve as both XML parser features and properties (and be mapped to them).
+ */
+ virtual void setOption(const std::string& uri, const std::string& value) = 0;
+ /**
+ * Registers a handler that will receive the parsing events.
+ * TODO: after moving to alt2xml: this will be a generic handler for SAX events,
+ * but both sides will know the schema (allowed elements and attributes for INI events).
+ */
virtual void addHandler(INIContentHandler* handler) = 0;
+ virtual void addUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enabledByDefault) = 0; // registers an unescaping processor; 'uri' is the option name that switches it on/off via setOption()
virtual void process() = 0;
static INIReader* create(std::istream& input);
};
--- a/src/lib/JavaPropertiesUnescapingINIHandler.h Thu Nov 26 11:42:26 2020 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,105 +0,0 @@
-/**
- * Relational pipes
- * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-#pragma once
-
-#include <sstream>
-#include <codecvt>
-#include <arpa/inet.h>
-
-#include "UnescapingINIHandler.h"
-
-using namespace std;
-using namespace relpipe::writer;
-
-namespace relpipe {
-namespace in {
-namespace ini {
-namespace lib {
-
-class JavaPropertiesUnescapingINIContentHandler : public UnescapingINIContentHandler {
-private:
- wstring_convert < codecvt_utf8<wchar_t>> convertor; // INI parser works with UTF-8
-
- bool readHex(const char* hexadecimal, size_t hexLength, uint8_t* resultBuffer, size_t binLength) {
- if (hexLength != binLength * 2) return false;
-
- for (size_t i = 0; i < binLength; i++) {
- uint8_t value = 0;
- char a = hexadecimal[i * 2];
- char b = hexadecimal[i * 2 + 1];
-
- if (a >= '0' && a <= '9') value += (a - '0')*16;
- else if (a >= 'a' && a <= 'f') value += (a - 'a' + 10)*16;
- else if (a >= 'A' && a <= 'F') value += (a - 'A' + 10)*16;
- else return false;
-
- if (b >= '0' && b <= '9') value += b - '0';
- else if (b >= 'a' && b <= 'f') value += b - 'a' + 10;
- else if (b >= 'A' && b <= 'F') value += b - 'A' + 10;
- else return false;
-
- if (resultBuffer) resultBuffer[i] = value;
- }
- return true;
- }
-
-protected:
-
- virtual std::string unescape(const std::string& s) {
- std::stringstream result;
- for (int i = 0, length = s.size(); i < length; i++) {
- char ch = s[i];
- if (i + 1 < length && ch == ESC) {
- ch = s[i + 1];
- if (ch == 'u') {
- // TODO: simplify, clean-up, verify (but seems working)
- i++;
- int hexLength = 4;
- if (i + hexLength < length) {
- uint16_t u16;
- bool hexOK = readHex(s.c_str() + i + 1, hexLength, (uint8_t*) & u16, sizeof (u16));
- if (hexOK) result << convertor.to_bytes(ntohs(u16));
- else throw std::logic_error(std::string("Invalid unicode escape sequence: invalid HEX"));
- i += hexLength;
- } else {
- throw std::logic_error(std::string("Invalid unicode escape sequence: missing characters"));
- }
-
- } else if (ch == ESC && !lastEscaphingPhase) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle
- else if (ch == ESC && lastEscaphingPhase) put(result, ESC, i); // unescape \\ to \.
- else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch);
- else result.put(ESC); // keep the escape sequence for later unescaping phase
- } else if (ch == ESC) {
- throw std::logic_error(std::string("Missing escape sequence")); // this should not happen
- } else {
- result.put(ch);
- }
- }
- return result.str();
- }
-
-public:
-
- JavaPropertiesUnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : UnescapingINIContentHandler(output, lastEscaphingPhase, true) {
- }
-
-};
-
-}
-}
-}
-}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/JavaPropertiesUnescapingProcessor.h Sat Nov 28 18:14:15 2020 +0100
@@ -0,0 +1,101 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+#include <codecvt>
+#include <arpa/inet.h>
+
+#include "UnescapingProcessor.h"
+
+using namespace std;
+using namespace relpipe::writer;
+
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
+/**
+ * Should work according to <https://docs.oracle.com/javase/specs/jls/se15/html/jls-3.html#jls-3.3> 3.3. Unicode Escapes
+ */
+class JavaPropertiesUnescapingProcessor : public UnescapingProcessor { // replaces \uXXXX sequences by the UTF-8 encoded character; other escape sequences are left for later phases
+private:
+ wstring_convert < codecvt_utf8<wchar_t>> convertor; // INI parser works with UTF-8
+
+ bool readHex(const char* hexadecimal, size_t hexLength, uint8_t* resultBuffer, size_t binLength) { // decodes hex digits into bytes; returns false on a length mismatch or a non-hex character
+ if (hexLength != binLength * 2) return false; // exactly two hex digits per output byte are required
+
+ for (size_t i = 0; i < binLength; i++) {
+ uint8_t value = 0;
+ char a = hexadecimal[i * 2]; // high nibble
+ char b = hexadecimal[i * 2 + 1]; // low nibble
+
+ if (a >= '0' && a <= '9') value += (a - '0')*16;
+ else if (a >= 'a' && a <= 'f') value += (a - 'a' + 10)*16;
+ else if (a >= 'A' && a <= 'F') value += (a - 'A' + 10)*16;
+ else return false;
+
+ if (b >= '0' && b <= '9') value += b - '0';
+ else if (b >= 'a' && b <= 'f') value += b - 'a' + 10;
+ else if (b >= 'A' && b <= 'F') value += b - 'A' + 10;
+ else return false;
+
+ if (resultBuffer) resultBuffer[i] = value; // a null buffer means: validate only
+ }
+ return true;
+ }
+
+public:
+
+ std::string unescape(const std::string& s, const TextType type) override { // returns the unescaped copy of s; the text type is not used by this processor
+ std::stringstream result;
+ for (int i = 0, length = s.size(); i < length; i++) {
+ char ch = s[i];
+ if (i + 1 < length && ch == ESC) { // escape character followed by at least one more character
+ ch = s[i + 1]; // the escaped character
+ if (ch == 'u') {
+ // TODO: simplify, clean-up, verify (but seems working)
+ i++; // i is now the index of the 'u'
+ int hexLength = 4;
+ if (i + hexLength < length) { // all four hex digits must lie inside the text
+ uint16_t u16;
+ bool hexOK = readHex(s.c_str() + i + 1, hexLength, (uint8_t*) & u16, sizeof (u16)); // bytes are stored in the order they are written, i.e. most significant byte first
+ if (hexOK) result << convertor.to_bytes(ntohs(u16)); // ntohs() turns the big-endian bytes into a host-order value; NOTE(review): each \uXXXX is converted on its own, so surrogate pairs are presumably not combined - verify
+ else throw std::logic_error(std::string("Invalid unicode escape sequence: invalid HEX"));
+ i += hexLength; // move to the last hex digit; the loop increment steps past it
+ } else {
+ throw std::logic_error(std::string("Invalid unicode escape sequence: missing characters"));
+ }
+
+ } else if (ch == ESC) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle
+ else result.put(ESC); // keep the escape sequence for later unescaping phase
+ } else if (ch == ESC) { // escape character at the very end of the text
+ throw std::logic_error(std::string("Missing escape sequence")); // this should not happen
+ } else {
+ result.put(ch); // ordinary character, copied as is
+ }
+ }
+ return result.str();
+ }
+
+};
+
+}
+}
+}
+}
--- a/src/lib/UnescapingINIHandler.h Thu Nov 26 11:42:26 2020 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,108 +0,0 @@
-/**
- * Relational pipes
- * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, version 3 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-#pragma once
-
-#include <sstream>
-
-#include "INIReader.h"
-
-using namespace std;
-using namespace relpipe::writer;
-
-namespace relpipe {
-namespace in {
-namespace ini {
-namespace lib {
-
-class UnescapingINIContentHandler : public INIContentHandler {
-private:
- INIContentHandler& output;
- bool unescapeComments;
-
-protected:
- const char ESC = '\\';
- bool lastEscaphingPhase;
-
- std::stringstream& put(std::stringstream& result, const char& ch, int& i) {
- result.put(ch);
- i++;
- return result;
- }
-
- virtual std::string unescape(const std::string& s) = 0;
-
-public:
-
- /**
- * @param output here will be sent events with unescaped values
- * @param lastEscaphingPhase instances of UnescapingINIContentHandler might be chained:
- * unsupported escaping sequences are kept untouched to be processed in further phases;
- * in the last phase, all remaining sequences (including \\) must be recognized and unescaped
- * (otherwise the input is considered invalid and an exception is thrown)
- */
- UnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase, bool unescapeComments = false) : output(output), lastEscaphingPhase(lastEscaphingPhase), unescapeComments(unescapeComments) {
- }
-
- void startDocument() override {
- output.startDocument();
- }
-
- void endDocument() override {
- output.endDocument();
- }
-
- void startSection(const SectionStartEvent& event) override {
- SectionStartEvent e = event;
- e.name = unescape(e.name);
- if (unescapeComments) e.comment = unescape(e.comment);
- output.startSection(e);
- }
-
- void endSection() override {
- output.endSection();
- }
-
- void entry(const EntryEvent& event) override {
- EntryEvent e = event;
- e.key = unescape(e.key);
- e.fullKey = unescape(e.fullKey);
- e.subKey = unescape(e.subKey);
- e.value = unescape(e.value);
- if (unescapeComments) e.comment = unescape(e.comment);
- output.entry(e);
- }
-
- void comment(const CommentEvent& event) override {
- if (unescapeComments) {
- CommentEvent e = event;
- e.comment = unescape(e.comment);
- output.comment(e);
- } else {
- output.comment(event);
- }
- }
-
- void whitespace(const WhitespaceEvent& event) override {
- output.whitespace(event);
- }
-
-};
-
-}
-}
-}
-}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/UnescapingProcessor.h Sat Nov 28 18:14:15 2020 +0100
@@ -0,0 +1,60 @@
+/**
+ * Relational pipes
+ * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#pragma once
+
+#include <sstream>
+
+#include "INIReader.h"
+
+using namespace std;
+
+namespace relpipe {
+namespace in {
+namespace ini {
+namespace lib {
+
+/**
+ * Base class of the unescaping processors: an implementation converts the escape sequences
+ * that it understands in a piece of INI text (section name, key, value, comment…).
+ */
+class UnescapingProcessor {
+protected:
+    /** character that introduces an escape sequence */
+    const char ESC = '\\';
+
+    /**
+     * Appends one character to the result and moves the caller's position one step forward.
+     *
+     * @param result stream that collects the output
+     * @param ch character to be appended
+     * @param i position counter of the caller; is incremented by one
+     * @return the same stream, so further output can be chained
+     */
+    std::stringstream& put(std::stringstream& result, const char& ch, int& i) {
+        result.put(ch);
+        i++;
+        return result;
+    }
+
+public:
+
+    /** kind of the text that is passed to unescape() */
+    enum class TextType {
+        SectionName,
+        SectionComment,
+        SectionTag,
+        EntryKey,
+        EntryValue,
+        EntryComment,
+        Comment,
+    };
+
+    /** virtual, so that derived processors can be safely destroyed through a pointer to this base class */
+    virtual ~UnescapingProcessor() = default;
+
+    /**
+     * @param s text that may contain escape sequences
+     * @param type kind of the text in s
+     * @return the text after the unescaping done by this processor
+     */
+    virtual std::string unescape(const std::string& s, const TextType type) = 0;
+
+};
+
+}
+}
+}
+}