/**
* Relational pipes
* Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <vector>
#include <regex>
#include <sstream>
#include <stdexcept>
#include "INIReader.h"
#include "uri.h"
namespace relpipe {
namespace in {
namespace ini {
namespace lib {
class INIReaderImpl : public INIReader {
private:
std::istream& input;
std::vector<INIContentHandler*> handlers;
class ConfiguredUnescapingProcessor {
public:
std::shared_ptr<UnescapingProcessor> processor;
const std::string uri;
bool enbaled;
ConfiguredUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enbaled) : processor(processor), uri(uri), enbaled(enbaled) {
}
};
std::vector<ConfiguredUnescapingProcessor> unescapingProcessors;
class ConfiguredDialect {
public:
std::shared_ptr<Dialect> dialect;
const std::string uri;
ConfiguredDialect(std::shared_ptr<Dialect> dialect, const std::string uri) : dialect(dialect), uri(uri) {
}
};
std::vector<ConfiguredDialect> dialects;
/**
* If there is a „\“ backspace at the end of a physical line, the logical line continues on the next physical line.
*
* Disabling this option makes sense only if we also disable the unescaping processors (unescape-basic, unescape-backspace).
* Otherwise they will complain about „Missing escape sequence“ because they got „\“ at the end of the value.
*/
bool allowLineContinuationsWithEscaping = true;
/**
* If a line starts with a space, it is continuation of the previous line.
* This rule conflicts with default ignorance of such insignificant whitespace and is quite specific to the Java MANIFEST.MF dialect.
*/
bool allowLineContinuationsWithSpace = false;
/**
* By default, we ignore all leading whitespace on continuing lines.
* If there should be some spaces or tabs, they should be placed on the previous line before the „\“.
* If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '.
*
* TODO: several options:
* - enabled, disabled
* - if disabled, then: keep backslash, trim backslash, escape backslash
* (keep requires support in some further unescaping phase, or it will cause an error)
* - keep or trim the line end
* - keep or trim the leading spaces
* - allow comments interleaved with continuing lines (the freaky systemd syntax)
*
* Related specifications:
* - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html
* - https://www.freedesktop.org/software/systemd/man/systemd.syntax.html
*/
bool trimLeadingSpacesOnContinuingLines = true;
/**
* Some dialects or configuration files in general does not support sections.
* Then a line, that looks like an INI section, should be interpreted as a key
* (or error, if does not have a proper key-value separator).
*/
bool allowSections = true;
/**
* KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
* Line „[section_1][$i]“ means that the „section_1“ is „locked“.
* We may emit this information somehow later, but for now, it is just ignored.
*
* TODO: Is „section tag“ right name?
*
* Related specifications:
* - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Lock_Down
*/
bool allowSectionTags = true;
/**
* If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key.
* No \[ escaping is currently supported, so the key might not contain the bracket character.
*
* Related specifications:
* - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Shell_Expansion
* - https://specifications.freedesktop.org/desktop-entry-spec/latest/ar01s05.html
*/
bool allowSubKeys = true;
/**
* Classic INI uses „key=value“ syntax.
* But some other formats/dialects might use key:value.
*
* Only single character separators are supported.
* If multiple separators should be recognized (e.g. both „=“ and „:“), this string will contain all of them,
* i.e. „:=“ does not mean that the „key:=value“ syntax, but „key=value“ or „key:value“.
*/
std::string keyValueSeparators = "=";
/**
* Classic INI uses „; comment“ syntax.
* But many existing files contain „# comment“ lines.
*
* Only single character separators are supported (works same as keyValueSeparators).
*/
std::string commentSeparators = ";#";
/**
* INI often support both "quotes" and 'apostrophes' styles.
* But some dialects may support only one of them or not support quoting at all.
*
* In such case e.g. „key="some value"“ would mean that the value is „"value"“ (including the quotes).
* Thus it is important to allow disabling quote recognizing (which is done by setting this parameter to empty string).
*
* Only single character quotes are supported (works same as keyValueSeparators).
*/
std::string quotes = "\"'";
int lineNumber = 1;
int eventNumber = 0;
/**
* Should be always used instead of input.peek().
* Skips \r.
*/
char peek() {
// In 2020 there is no need to manually return the carriage. However some legacy systems still do it.
char ch = input.peek();
if (ch == '\r') {
input.get();
ch = input.peek();
}
return ch;
}
/**
* Should be always used instead of input.get().
* Counts the lines and skips \r.
*/
char get() {
char ch = input.get();
if (ch == '\n') lineNumber++;
else if (ch == '\r') ch = get();
return ch;
}
std::string readSpacesAndTabs() {
std::stringstream result;
for (char ch = peek(); input.good() && (ch == ' ' || ch == '\t'); ch = peek()) result.put(get());
return result.str();
}
std::string readAllWhitespace() {
std::stringstream result;
for (char ch = peek(); input.good() && (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'); ch = peek()) result.put(get());
return result.str();
}
void processContinuingLine(std::stringstream& result) {
if (trimLeadingSpacesOnContinuingLines) readSpacesAndTabs();
else result.put('\n');
}
std::string readUntil(const char until, bool* found = nullptr) {
return readUntil(std::string(1, until), found);
}
std::string readUntil(const std::string& until, bool* found = nullptr) {
std::stringstream result;
for (char ch = peek(); input.good(); ch = peek()) {
if (allowLineContinuationsWithSpace && ch == '\n') {
get();
ch = peek();
if (ch == ' ') get();
else if (ch == std::istream::traits_type::eof()) break;
else {
if (found) *found = true;
return result.str();
}
} else if (oneOf(ch, until)) {
break;
} else if (allowLineContinuationsWithEscaping && ch == '\\') {
get();
ch = get();
if (oneOf(ch, until) && ch == '\n') processContinuingLine(result);
else if (oneOf(ch, until)) result.put(ch);
else if (ch == std::istream::traits_type::eof()) break;
else result.put('\\').put(ch);
// unescaping is done in two phases:
// here we unescape just the \n (LF)
// other escape sequences are leaved untouched and will be processed in later phases, see see UnescapingINIHandler
} else {
ch = get();
result.put(ch);
}
}
if (oneOf(peek(), until)) {
get();
if (found) *found = true;
} else {
if (found) *found = false;
}
return result.str();
}
std::string readToken(const char until, char* quote = nullptr, bool* found = nullptr) {
return readToken(std::string(1, until), quote, found);
}
std::string readToken(const std::string& until, char* quote = nullptr, bool* found = nullptr) {
std::string result;
char ch = peek();
if (isQuote(ch)) {
if (quote) *quote = ch;
result = readUntil(std::string(1, get()), found);
} else {
if (quote) *quote = 0;
result = readUntil(until, found);
}
return result;
}
std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) {
return readTokenAndEatTerminator(std::string(1, until), quote, found);
}
std::string readTokenAndEatTerminator(const std::string& until, char* quote = nullptr, bool* found = nullptr) {
std::string result = readToken(until, quote, found);
if (*quote) {
readAllWhitespace();
if (!oneOf(get(), until)) throw std::logic_error(std::string("missing „") + until + "“ after quoted section name");
}
return result;
}
std::string unescape(const std::string& value, UnescapingProcessor::TextType type) {
std::string result = value;
for (ConfiguredUnescapingProcessor p : unescapingProcessors) if (p.enbaled) result = p.processor->unescape(result, type);
return result;
}
bool isComment(char ch) {
return oneOf(ch, commentSeparators);
}
bool isQuote(char ch) {
return oneOf(ch, quotes);
}
/**
* @param ch character to be evaluated
* @param options list of options (characters)
* @return whether ch is one of options
*/
bool oneOf(char ch, const std::string& options) {
return options.find(ch) != std::string::npos;
}
std::string trim(std::string s) {
return std::regex_replace(s, std::regex("^\\s+|\\s+$"), "");
}
/**
* TODO: use a common method
*/
bool parseBoolean(const std::string& value) {
if (value == "true") return true;
else if (value == "false") return false;
else throw std::invalid_argument(std::string("Unable to parse boolean value: ") + value + " (expecting true or false)");
}
void setDialect(const std::string& uri) {
for (ConfiguredDialect& d : dialects) {
if (d.uri == uri) {
d.dialect->apply(*this);
return;
}
}
throw std::invalid_argument(std::string("Unsupported INI dialect: ") + uri);
}
bool setUnescaping(const std::string& uri, const std::string& value) {
for (ConfiguredUnescapingProcessor& p : unescapingProcessors) {
if (p.uri == uri) {
p.enbaled = parseBoolean(value);
return true;
}
}
return false;
}
public:
INIReaderImpl(std::istream& input) : input(input) {
}
void setOption(const std::string& uri, const std::string& value) override {
if (uri == option::AllowLineContinuationWithEscaping) allowLineContinuationsWithEscaping = parseBoolean(value);
else if (uri == option::AllowLineContinuationWithSpace) allowLineContinuationsWithSpace = parseBoolean(value);
else if (uri == option::TrimContinuingLines) trimLeadingSpacesOnContinuingLines = parseBoolean(value);
else if (uri == option::AllowSections) allowSections = parseBoolean(value);
else if (uri == option::AllowSectionTags) allowSectionTags = parseBoolean(value);
else if (uri == option::AllowSubKeys) allowSubKeys = parseBoolean(value);
else if (uri == option::CommentSeparators) commentSeparators = value;
else if (uri == option::KeyValueSeparators) keyValueSeparators = value;
else if (uri == option::Quotes) quotes = value;
else if (uri == option::Dialect) setDialect(value);
else if (setUnescaping(uri, value));
else throw std::invalid_argument(std::string("Invalid parser option: „") + uri + "“ with value: „" + value + "“");
}
void addHandler(INIContentHandler* handler) override {
handlers.push_back(handler);
}
void addUnescapingProcessor(std::shared_ptr<UnescapingProcessor> processor, const std::string uri, bool enabledByDefault) override {
unescapingProcessors.push_back({processor, uri, enabledByDefault});
}
void addDialect(std::shared_ptr<Dialect> dialect, const std::string uri, bool enabledByDefault) override {
dialects.push_back({dialect, uri});
if (enabledByDefault) dialect->apply(*this);
}
void process() override {
for (INIContentHandler* handler : handlers) handler->startDocument();
bool inSection = false;
while (input.good()) { // TODO: condition
{
INIContentHandler::WhitespaceEvent event;
event.lineNumber = lineNumber;
std::string whitespace = readAllWhitespace();
if (whitespace.size()) {
event.eventNumber = ++eventNumber;
event.whitespace = whitespace;
for (INIContentHandler* handler : handlers) handler->whitespace(event);
}
}
bool found;
char quote;
char ch = peek();
if (ch == std::istream::traits_type::eof()) {
break;
} else if (ch == '[' && allowSections) {
if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
inSection = true;
INIContentHandler::SectionStartEvent event;
event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
get();
readAllWhitespace();
event.name = readTokenAndEatTerminator(']', "e, &found);
if (!quote) event.name = trim(event.name);
event.name = unescape(event.name, UnescapingProcessor::TextType::SectionName);
readSpacesAndTabs();
if (allowSectionTags && peek() == '[') {
get();
event.tag = readTokenAndEatTerminator(']', "e, &found);
event.tag = unescape(event.tag, UnescapingProcessor::TextType::SectionTag);
}
readSpacesAndTabs();
ch = peek();
if (isComment(ch)) {
get();
readSpacesAndTabs();
event.comment = readUntil('\n', &found);
event.comment = unescape(event.comment, UnescapingProcessor::TextType::SectionComment);
} else if (ch == '\n') {
get();
} else {
throw std::logic_error(std::string("unexpected content after the section: '") + event.name + "'");
}
for (INIContentHandler* handler : handlers) handler->startSection(event);
} else if (isComment(ch)) {
INIContentHandler::CommentEvent event;
event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
get();
readSpacesAndTabs();
event.comment = readUntil('\n', &found);
event.comment = unescape(event.comment, UnescapingProcessor::TextType::Comment);
for (INIContentHandler* handler : handlers) handler->comment(event);
} else {
INIContentHandler::EntryEvent event;
event.lineNumber = lineNumber;
event.eventNumber = ++eventNumber;
std::string fullKey = readToken(keyValueSeparators, "e, &found);
if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'");
if (!quote) fullKey = trim(fullKey);
readSpacesAndTabs();
if (quote) {
ch = get();
if (oneOf(ch, keyValueSeparators)) readSpacesAndTabs();
else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'");
}
std::string value = readToken('\n', "e, &found);
if (!quote) value = trim(value);
event.key = fullKey;
event.fullKey = fullKey;
event.value = value;
if (allowSubKeys) {
std::smatch match;
if (std::regex_match(fullKey, match, std::regex("([^\\[]+)\\[([^\\[]+)\\]"))) {
event.key = match[1];
event.subKey = match[2];
event.fullKey = fullKey;
event.subKey = unescape(event.subKey, UnescapingProcessor::TextType::EntryKey);
}
}
event.key = unescape(event.key, UnescapingProcessor::TextType::EntryKey);
event.fullKey = unescape(event.fullKey, UnescapingProcessor::TextType::EntryKey);
event.value = unescape(event.value, UnescapingProcessor::TextType::EntryValue);
if (quote) {
readSpacesAndTabs();
ch = peek();
if (isComment(ch)) {
get();
readSpacesAndTabs();
event.comment = readUntil('\n', &found);
event.comment = unescape(event.comment, UnescapingProcessor::TextType::EntryComment);
} else if (ch == '\n') {
get();
} else {
// TODO: optional support for multiple tokens in a single entry?
// modes: array, concatenate
// some-array-1 = "item 1" "item 2" 'item 3' item 4
// some-array-2 = "item 1" "item 2" 'item 3' item_4 item_5
// some-bash-style-string-value = "this "will' be' concatenated → this will be concatenated
throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'");
}
}
for (INIContentHandler* handler : handlers) handler->entry(event);
}
}
// TODO: error at the end, catch premature/unexpected EOF
// TODO: unescape + trim values + ignore \r
// TODO: count lines
if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
for (INIContentHandler* handler : handlers) handler->endDocument();
}
// General feautres:
// TODO: warning/error handler
// TODO: support also escaped characters
// TODO: support also Java .properties and manifest.mf formats?
// TODO: support also nested sections – hierarchy
// TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections
// TODO: support also option for alternative key-value separator (: instead of =)
// TODO: support also other encodings (currently only UTF-8 is supported)
// TODO: better exceptions
// Lossless conversions:
// TODO: emit also the quote style ('/"/)
// TODO: emit also the comment style (;/#) ?
// TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ?
// TODO: emit also the line-end type (LF/CRLF) ?
};
/**
 * Factory method of the INI reader.
 *
 * @param input stream to be parsed; the reader keeps only a reference to it, so the stream must stay valid while the reader is used
 * @return a new reader allocated with „new“; nothing in this file keeps or releases the pointer, so that is up to the caller
 */
INIReader* INIReader::create(std::istream& input) {
	INIReader* const reader = new INIReaderImpl(input);
	return reader;
}
}
}
}
}