--- a/src/lib/INIReader.cpp Thu Nov 26 11:42:26 2020 +0100
+++ b/src/lib/INIReader.cpp Sat Nov 28 18:14:15 2020 +0100
@@ -32,21 +32,47 @@
std::istream& input;
std::vector<INIContentHandler*> handlers;
+ class ConfiguredUnescapingProcessor { // one registered unescaping processor together with its option URI and its on/off switch
+ public:
+ std::shared_ptr<UnescapingProcessor> processor; // the processor that unescape() calls when this entry is switched on
+ const std::string uri; // option name; setUnescaping() compares it with the requested option URI to find this entry
+ bool enbaled; // sic (misspelled „enabled“): the name is also used by unescape() and setUnescaping(), so a rename has to touch all three places at once
+
+ ConfiguredUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enbaled) : processor(processor), uri(uri), enbaled(enbaled) { // plain member-wise initialization, no further logic
+ }
+
+ };
+
+ std::vector<ConfiguredUnescapingProcessor> unescapingProcessors;
+
/**
- * This might be configurable.
- *
* By default, we ignore all leading whitespace on continuing lines.
* If there should be some spaces or tabs, they should be placed on the previous line before the „\“.
* If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '.
*
+ * TODO: several options:
+ * - enabled, disabled
+ * - if disabled, then: keep backslash, trim backslash, escape backslash
+ * (keep requires support in some further unescaping phase, or it will cause an error)
+ * - keep or trim the line end
+ * - keep or trim the leading spaces
+ * - allow comments interleaved with continuing lines (the freaky systemd syntax)
+ *
* Related specifications:
* - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html
+ * - https://www.freedesktop.org/software/systemd/man/systemd.syntax.html
*/
- bool consumeLeadingSpacesOnContinuingLines = true;
+ bool trimLeadingSpacesOnContinuingLines = true;
+
/**
- * This might be configurable.
- *
+ * Some dialects, or configuration files in general, do not support sections.
+ * Then a line that looks like an INI section should be interpreted as a key
+ * (or as an error, if it does not have a proper key-value separator).
+ */
+ bool allowSections = true;
+
+ /**
* KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
* Line „[section_1][$i]“ means that the „section_1“ is „locked“.
* We may emit this information somehow later, but for now, it is just ignored.
@@ -59,8 +85,6 @@
bool allowSectionTags = true;
/**
- * This might be configurable.
- *
* If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key.
* No \[ escaping is currently supported, so the key might not contain the bracket character.
*
@@ -70,6 +94,35 @@
*/
bool allowSubKeys = true;
+ /**
+ * Classic INI uses „key=value“ syntax.
+ * But some other formats/dialects might use key:value.
+ *
+ * Only single character separators are supported.
+ * If multiple separators should be recognized (e.g. both „=“ and „:“), this string will contain all of them,
+ * i.e. „:=“ does not mean the „key:=value“ syntax, but „key=value“ or „key:value“.
+ */
+ std::string keyValueSeparators = "=";
+
+ /**
+ * Classic INI uses „; comment“ syntax.
+ * But many existing files contain „# comment“ lines.
+ *
+ * Only single character separators are supported (works same as keyValueSeparators).
+ */
+ std::string commentSeparators = ";#";
+
+ /**
+ * INI often supports both "quotes" and 'apostrophes' styles.
+ * But some dialects may support only one of them or not support quoting at all.
+ *
+ * In such case e.g. „key="some value"“ would mean that the value is „"some value"“ (including the quotes).
+ * Thus it is important to allow disabling quote recognition (which is done by setting this parameter to an empty string).
+ *
+ * Only single character quotes are supported (works same as keyValueSeparators).
+ */
+ std::string quotes = "\"'";
+
int lineNumber = 1;
int eventNumber = 0;
@@ -111,30 +164,35 @@
}
void processContinuingLine(std::stringstream& result) {
- if (consumeLeadingSpacesOnContinuingLines) readSpacesAndTabs();
+ if (trimLeadingSpacesOnContinuingLines) readSpacesAndTabs(); // the flag is set through the „trim-continuing-lines“ option; when it is off, a line feed is appended to the result instead
else result.put('\n');
}
- std::string readUntil(char until, bool* found = nullptr) {
+ std::string readUntil(const char until, bool* found = nullptr) { // convenience overload for a single terminator character
+ return readUntil(std::string(1, until), found); // delegates to the overload that accepts a set of terminator characters
+ }
+
+ std::string readUntil(const std::string& until, bool* found = nullptr) {
std::stringstream result;
- for (char ch = peek(); input.good() && ch != until; ch = peek()) {
+ for (char ch = peek(); input.good() && !oneOf(ch, until); ch = peek()) {
if (ch == '\\') {
get();
ch = get();
- if (ch == until && ch == '\n') processContinuingLine(result);
- else if (ch == until) result.put(ch);
+ if (oneOf(ch, until) && ch == '\n') processContinuingLine(result);
+ else if (oneOf(ch, until)) result.put(ch);
else if (ch == std::istream::traits_type::eof()) break;
else result.put('\\').put(ch);
- // TODO: two-stage and modular unescaping: here unescape only \+LF or more genereally: unescape only the until character and rest leave untouched
- // second escaping stage move to separate class/wrapper (similar to hierarchical wrappers)
+ // unescaping is done in two phases:
+ // here we unescape just the \n (LF)
+ // other escape sequences are left untouched and will be processed in later phases, see UnescapingINIHandler
} else {
ch = get();
result.put(ch);
}
}
- if (peek() == until) {
+ if (oneOf(peek(), until)) {
get();
if (found) *found = true;
} else {
@@ -144,13 +202,17 @@
return result.str();
}
- std::string readToken(char until, char* quote = nullptr, bool* found = nullptr) {
+ std::string readToken(const char until, char* quote = nullptr, bool* found = nullptr) { // convenience overload for a single terminator character
+ return readToken(std::string(1, until), quote, found); // delegates to the overload that accepts a set of terminator characters
+ }
+
+ std::string readToken(const std::string& until, char* quote = nullptr, bool* found = nullptr) {
std::string result;
char ch = peek();
if (isQuote(ch)) {
if (quote) *quote = ch;
- result = readUntil(get(), found);
+ result = readUntil(std::string(1, get()), found);
} else {
if (quote) *quote = 0;
result = readUntil(until, found);
@@ -160,35 +222,107 @@
}
std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) {
+ return readTokenAndEatTerminator(std::string(1, until), quote, found); // single terminator character → one-character terminator set, handled by the string overload
+ }
+
+ std::string readTokenAndEatTerminator(const std::string& until, char* quote = nullptr, bool* found = nullptr) { // NOTE(review): quote defaults to nullptr but is dereferenced unconditionally below – callers apparently must always pass it; confirm
std::string result = readToken(until, quote, found);
if (*quote) {
readAllWhitespace();
- if (get() != until) throw std::logic_error(std::string("missing „") + std::string(1, until) + "“ after quoted section name");
+ if (!oneOf(get(), until)) throw std::logic_error(std::string("missing „") + until + "“ after quoted section name"); // any character of „until“ is accepted after a quoted token; the message prints the whole terminator set
}
return result;
}
+ std::string unescape(const std::string& value, UnescapingProcessor::TextType type) { // passes the value through every enabled unescaping processor, in the order the processors were registered, and returns the final text
+ std::string result = value;
+ for (const ConfiguredUnescapingProcessor& p : unescapingProcessors) if (p.enbaled) result = p.processor->unescape(result, type); // bound by const reference: iterating by value copied the shared_ptr and the URI string of every processor on every call
+ return result;
+ }
+
bool isComment(char ch) {
- return ch == '#' || ch == ';';
+ return oneOf(ch, commentSeparators); // the comment markers are configurable through the „comment-separators“ option
}
bool isQuote(char ch) {
- return ch == '"' || ch == '\'';
+ return oneOf(ch, quotes); // the accepted quote characters come from the „quotes“ option; with an empty string no character is treated as a quote
+ }
+
+ /**
+ * @param ch the character being looked for
+ * @param options set of accepted characters, each character of the string being one option
+ * @return true when ch occurs among options, false otherwise (an empty options string never matches)
+ */
+ bool oneOf(char ch, const std::string& options) {
+ return std::string::npos != options.find(ch);
}
std::string trim(std::string s) {
return std::regex_replace(s, std::regex("^\\s+|\\s+$"), "");
}
+ /**
+ * Accepts exactly „true“ or „false“; any other text raises std::invalid_argument. TODO: use a common method
+ */
+ bool parseBoolean(const std::string& value) {
+ const bool isTrue = value == "true";
+ if (!isTrue && value != "false") throw std::invalid_argument(std::string("Unable to parse boolean value: ") + value + " (expecting true or false)");
+ return isTrue;
+ }
+
+ /** Applies a named preset of parser options; an unknown name raises std::invalid_argument. */
+ void setDialect(const std::string& name) {
+ // „default-ini“ leaves the current settings exactly as they are
+ if (name == "default-ini") return;
+ // „java-properties“ is the only other preset; everything else is rejected
+ if (name != "java-properties") throw std::invalid_argument(std::string("Unsupported INI dialect: ") + name);
+ // Java .properties preset: no sections, tags, sub-keys or quoting; „#“ comments; „=“ or „:“ between key and value
+ allowSections = false;
+ allowSectionTags = false;
+ allowSubKeys = false;
+ trimLeadingSpacesOnContinuingLines = true;
+ quotes = "";
+ commentSeparators = "#";
+ keyValueSeparators = "=:";
+ // TODO: enable unicode unescaping
+ }
+
+ /** Switches the first processor registered under the given URI on or off; returns false when no processor has that URI. A value other than „true“/„false“ makes parseBoolean() throw. */
+ bool setUnescaping(const std::string& uri, const std::string& value) {
+ for (ConfiguredUnescapingProcessor& candidate : unescapingProcessors) {
+ if (candidate.uri != uri) continue;
+ candidate.enbaled = parseBoolean(value);
+ return true;
+ }
+ return false;
+ }
+
public:
INIReaderImpl(std::istream& input) : input(input) {
}
+ /** Sets a single parser option. URIs not listed below are tried as unescaping-processor URIs (setUnescaping); if that fails too, std::invalid_argument is thrown. */
+ void setOption(const std::string& uri, const std::string& value) override {
+ if (uri == "trim-continuing-lines") trimLeadingSpacesOnContinuingLines = parseBoolean(value); // TODO: continuing lines modes (enum), not just boolean
+ else if (uri == "allow-sections") allowSections = parseBoolean(value);
+ else if (uri == "allow-section-tags") allowSectionTags = parseBoolean(value);
+ else if (uri == "allow-sub-keys") allowSubKeys = parseBoolean(value);
+ else if (uri == "comment-separators") commentSeparators = value;
+ else if (uri == "key-value-separators") keyValueSeparators = value;
+ else if (uri == "quotes") quotes = value;
+ else if (uri == "dialect") setDialect(value);
+ else if (!setUnescaping(uri, value)) throw std::invalid_argument(std::string("Invalid parser option: „") + uri + "“ with value: „" + value + "“");
+ }
+
void addHandler(INIContentHandler* handler) override {
handlers.push_back(handler);
}
+ void addUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enabledByDefault) override { // registers a processor under the given option URI; unescape() applies the processors in the order they were added
+ unescapingProcessors.emplace_back(processor, uri, enabledByDefault);
+ }
+
void process() override {
for (INIContentHandler* handler : handlers) handler->startDocument();
@@ -196,10 +330,10 @@
while (input.good()) { // TODO: condition
{
+ INIContentHandler::WhitespaceEvent event;
+ event.lineNumber = lineNumber;
std::string whitespace = readAllWhitespace();
if (whitespace.size()) {
- INIContentHandler::WhitespaceEvent event;
- event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
event.whitespace = whitespace;
for (INIContentHandler* handler : handlers) handler->whitespace(event);
@@ -213,20 +347,23 @@
if (ch == std::istream::traits_type::eof()) {
break;
- } else if (ch == '[') {
+ } else if (ch == '[' && allowSections) {
if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
inSection = true;
- get();
- readAllWhitespace();
INIContentHandler::SectionStartEvent event;
event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
+ get();
+ readAllWhitespace();
event.name = readTokenAndEatTerminator(']', &quote, &found);
+ if (!quote) event.name = trim(event.name);
+ event.name = unescape(event.name, UnescapingProcessor::TextType::SectionName);
readSpacesAndTabs();
if (allowSectionTags && peek() == '[') {
get();
event.tag = readTokenAndEatTerminator(']', &quote, &found);
+ event.tag = unescape(event.tag, UnescapingProcessor::TextType::SectionTag);
}
readSpacesAndTabs();
@@ -235,6 +372,7 @@
get();
readSpacesAndTabs();
event.comment = readUntil('\n', &found);
+ event.comment = unescape(event.comment, UnescapingProcessor::TextType::SectionComment);
} else if (ch == '\n') {
get();
} else {
@@ -243,31 +381,33 @@
for (INIContentHandler* handler : handlers) handler->startSection(event);
} else if (isComment(ch)) {
- get();
- readSpacesAndTabs();
INIContentHandler::CommentEvent event;
event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
+ get();
+ readSpacesAndTabs();
event.comment = readUntil('\n', &found);
+ event.comment = unescape(event.comment, UnescapingProcessor::TextType::Comment);
for (INIContentHandler* handler : handlers) handler->comment(event);
} else {
- std::string fullKey = readToken('=', &quote, &found);
+ INIContentHandler::EntryEvent event;
+ event.lineNumber = lineNumber;
+ event.eventNumber = ++eventNumber;
+
+ std::string fullKey = readToken(keyValueSeparators, &quote, &found);
if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'");
if (!quote) fullKey = trim(fullKey);
readSpacesAndTabs();
if (quote) {
ch = get();
- if (ch == '=') readSpacesAndTabs();
+ if (oneOf(ch, keyValueSeparators)) readSpacesAndTabs();
else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'");
}
std::string value = readToken('\n', &quote, &found);
if (!quote) value = trim(value);
- INIContentHandler::EntryEvent event;
- event.lineNumber = lineNumber;
- event.eventNumber = ++eventNumber;
event.key = fullKey;
event.fullKey = fullKey;
event.value = value;
@@ -278,9 +418,14 @@
event.key = match[1];
event.subKey = match[2];
event.fullKey = fullKey;
+ event.subKey = unescape(event.subKey, UnescapingProcessor::TextType::EntryKey);
}
}
+ event.key = unescape(event.key, UnescapingProcessor::TextType::EntryKey);
+ event.fullKey = unescape(event.fullKey, UnescapingProcessor::TextType::EntryKey);
+ event.value = unescape(event.value, UnescapingProcessor::TextType::EntryValue);
+
if (quote) {
readSpacesAndTabs();
ch = peek();
@@ -288,9 +433,15 @@
get();
readSpacesAndTabs();
event.comment = readUntil('\n', &found);
+ event.comment = unescape(event.comment, UnescapingProcessor::TextType::EntryComment);
} else if (ch == '\n') {
get();
} else {
+ // TODO: optional support for multiple tokens in a single entry?
+ // modes: array, concatenate
+ // some-array-1 = "item 1" "item 2" 'item 3' item 4
+ // some-array-2 = "item 1" "item 2" 'item 3' item_4 item_5
+ // some-bash-style-string-value = "this "will' be' concatenated → this will be concatenated
throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'");
}
}