# HG changeset patch # User František Kučera # Date 1606583655 -3600 # Node ID 0e7c57d48d1eca0049b61348fbf7ee8ff2b5891c # Parent fd669e73d39a045e7ec8e2066bb1e2aa5c138551 configurable unescaping processors diff -r fd669e73d39a -r 0e7c57d48d1e src/XMLDocumentConstructor.h --- a/src/XMLDocumentConstructor.h Thu Nov 26 11:42:26 2020 +0100 +++ b/src/XMLDocumentConstructor.h Sat Nov 28 18:14:15 2020 +0100 @@ -20,8 +20,9 @@ #include #include "lib/INIReader.h" -#include "lib/BasicUnescapingINIHandler.h" -#include "lib/JavaPropertiesUnescapingINIHandler.h" +#include "lib/BasicUnescapingProcessor.h" +#include "lib/BackspaceUnescapingProcessor.h" +#include "lib/JavaPropertiesUnescapingProcessor.h" #include "lib/XMLNameCodec.h" using namespace relpipe::in::ini::lib; @@ -111,9 +112,14 @@ void process() { HierarchicalINIContentHandler handler(parser); std::shared_ptr reader(INIReader::create(*input)); - BasicUnescapingINIContentHandler unescapingHandler(handler, false); - JavaPropertiesUnescapingINIContentHandler javaHandler(handler, true); - reader->addHandler(&javaHandler); + reader->addUnescapingProcessor(std::make_shared(), "unescape-basic", true); + reader->addUnescapingProcessor(std::make_shared(), "unescape-java-properties", false); + reader->addUnescapingProcessor(std::make_shared(false), "unescape-backspace-disorder", false); + reader->addUnescapingProcessor(std::make_shared(), "unescape-backspace", true); + reader->addHandler(&handler); + // TODO: smart pointers vs. references: are we going to call addUnescapingProcessor() dynamically/conditionally or share instances? Then pointers will be better. 
+ // TODO: call setOption() according to the configuration + // for (ParserOptionRecipe option : configuration.parserOptions) reader->setOption(convertor.to_bytes(option.uri), convertor.to_bytes(option.value)); reader->process(); } }; diff -r fd669e73d39a -r 0e7c57d48d1e src/lib/BackspaceUnescapingProcessor.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib/BackspaceUnescapingProcessor.h Sat Nov 28 18:14:15 2020 +0100 @@ -0,0 +1,70 @@ +/** + * Relational pipes + * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#pragma once + +#include + +#include "UnescapingProcessor.h" + +using namespace std; +using namespace relpipe::writer; + +namespace relpipe { +namespace in { +namespace ini { +namespace lib { + +class BackspaceUnescapingProcessor : public UnescapingProcessor { +private: + const bool lastEscaphingPhase = true; +public: + + std::string unescape(const std::string& s, const TextType type) override { + std::stringstream result; + for (int i = 0, length = s.size(); i < length; i++) { + char ch = s[i]; + if (i + 1 < length && ch == ESC) { + ch = s[i + 1]; + if (ch == ESC) put(result, ESC, i); // unescape \\ to \. 
+ else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch); + else result.put(ESC); // keep the escape sequence for later unescaping phase + } else if (ch == ESC) { + throw std::logic_error(std::string("Missing escape sequence")); // this should not happen + } else { + result.put(ch); + } + } + return result.str(); + } + + /** + * @param lastEscaphingPhase whether this is final unescaping stage. + * By default it is set to true, thus no unrecognized escape sequences may left after this stage. + * Setting this to false is dangerous and may lead to errors and ambiguous behavior. + * It should be used only as a last resort. + * Because both "\\ \xxx" and "\ \xxx" will be converted to "\ \xxx" and the information will be lost. + * So, it is usually better to keep the "\" escaped as "\\" and process both the escaped backspaces and unrecognized escape sequences later. + */ + BackspaceUnescapingProcessor(bool lastEscaphingPhase = true) : lastEscaphingPhase(lastEscaphingPhase) { + } + +}; + +} +} +} +} diff -r fd669e73d39a -r 0e7c57d48d1e src/lib/BasicUnescapingINIHandler.h --- a/src/lib/BasicUnescapingINIHandler.h Thu Nov 26 11:42:26 2020 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,74 +0,0 @@ -/** - * Relational pipes - * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, version 3 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ -#pragma once - -#include - -#include "UnescapingINIHandler.h" - -using namespace std; -using namespace relpipe::writer; - -namespace relpipe { -namespace in { -namespace ini { -namespace lib { - -class BasicUnescapingINIContentHandler : public UnescapingINIContentHandler { -protected: - - virtual std::string unescape(const std::string& s) { - std::stringstream result; - for (int i = 0, length = s.size(); i < length; i++) { - char ch = s[i]; - if (i + 1 < length && ch == ESC) { - ch = s[i + 1]; - if (ch == 'n') put(result, '\n', i); - else if (ch == 'r') put(result, '\r', i); - else if (ch == 't') put(result, '\t', i); - else if (ch == 's') put(result, ' ', i); // TODO: Reconsider what is „basic“ escaping and should be supported. - else if (ch == '"') put(result, ch, i); // The delimiters (\n,]",') are already unescaped during the first stage in the INIReader while parsing (the delimiter relevant to given environment is unescaped, e.g. \" in "quoted" value). - else if (ch == '\'') put(result, ch, i); // So it does not necessary to do it here. But someone might write a="xxx\'zzz" however it is superfluous because a="xxx'zzz" will also work. - else if (ch == ']') put(result, ch, i); - else if (ch == ':') put(result, ch, i); - else if (ch == ';') put(result, ch, i); - else if (ch == '#') put(result, ch, i); - else if (ch == '=') put(result, ch, i); - else if (ch == ESC && !lastEscaphingPhase) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle - else if (ch == ESC && lastEscaphingPhase) put(result, ESC, i); // unescape \\ to \. 
- else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch); - else result.put(ESC); // keep the escape sequence for later unescaping phase - } else if (ch == ESC) { - throw std::logic_error(std::string("Missing escape sequence")); // this should not happen - } else { - result.put(ch); - } - } - return result.str(); - } - -public: - - BasicUnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : UnescapingINIContentHandler(output, lastEscaphingPhase) { - } - -}; - -} -} -} -} diff -r fd669e73d39a -r 0e7c57d48d1e src/lib/BasicUnescapingProcessor.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib/BasicUnescapingProcessor.h Sat Nov 28 18:14:15 2020 +0100 @@ -0,0 +1,67 @@ +/** + * Relational pipes + * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ +#pragma once + +#include + +#include "UnescapingProcessor.h" + +using namespace std; +using namespace relpipe::writer; + +namespace relpipe { +namespace in { +namespace ini { +namespace lib { + +class BasicUnescapingProcessor : public UnescapingProcessor { +public: + + std::string unescape(const std::string& s, const TextType type) override { + std::stringstream result; + for (int i = 0, length = s.size(); i < length; i++) { + char ch = s[i]; + if (i + 1 < length && ch == ESC) { + ch = s[i + 1]; + if (ch == 'n') put(result, '\n', i); + else if (ch == 'r') put(result, '\r', i); + else if (ch == 't') put(result, '\t', i); + else if (ch == 's') put(result, ' ', i); // TODO: Reconsider what is „basic“ escaping and should be supported. + else if (ch == '"') put(result, ch, i); // The delimiters (\n,]",') are already unescaped during the first stage in the INIReader while parsing (the delimiter relevant to given environment is unescaped, e.g. \" in "quoted" value). + else if (ch == '\'') put(result, ch, i); // So it does not necessary to do it here. But someone might write a="xxx\'zzz" however it is superfluous because a="xxx'zzz" will also work. 
+ else if (ch == ']') put(result, ch, i); + else if (ch == ':') put(result, ch, i); + else if (ch == ';') put(result, ch, i); + else if (ch == '#') put(result, ch, i); + else if (ch == '=') put(result, ch, i); + else if (ch == ESC) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle + else result.put(ESC); // keep the escape sequence for later unescaping phase + } else if (ch == ESC) { + throw std::logic_error(std::string("Missing escape sequence")); // this should not happen + } else { + result.put(ch); + } + } + return result.str(); + } + +}; + +} +} +} +} diff -r fd669e73d39a -r 0e7c57d48d1e src/lib/INIReader.cpp --- a/src/lib/INIReader.cpp Thu Nov 26 11:42:26 2020 +0100 +++ b/src/lib/INIReader.cpp Sat Nov 28 18:14:15 2020 +0100 @@ -32,21 +32,47 @@ std::istream& input; std::vector handlers; + class ConfiguredUnescapingProcessor { + public: + std::shared_ptr processor; + const std::string uri; + bool enbaled; + + ConfiguredUnescapingProcessor(std::shared_ptr processor, const std::string uri, bool enbaled) : processor(processor), uri(uri), enbaled(enbaled) { + } + + }; + + std::vector unescapingProcessors; + /** - * This might be configurable. - * * By default, we ignore all leading whitespace on continuing lines. * If there should be some spaces or tabs, they should be placed on the previous line before the „\“. * If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '. 
* + * TODO: several options: + * - enabled, disabled + * - if disabled, then: keep backslash, trim backslash, escape backslash + * (keep requires support in some further unescaping phase, or it will cause an error) + * - keep or trim the line end + * - keep or trim the leading spaces + * - allow comments interleaved with continuing lines (the freaky systemd syntax) + * * Related specifications: * - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html + * - https://www.freedesktop.org/software/systemd/man/systemd.syntax.html */ - bool consumeLeadingSpacesOnContinuingLines = true; + bool trimLeadingSpacesOnContinuingLines = true; + /** - * This might be configurable. - * + * Some dialects or configuration files in general does not support sections. + * Then a line, that looks like an INI section, should be interpreted as a key + * (or error, if does not have a proper key-value separator). + */ + bool allowSections = true; + + /** * KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section. * Line „[section_1][$i]“ means that the „section_1“ is „locked“. * We may emit this information somehow later, but for now, it is just ignored. @@ -59,8 +85,6 @@ bool allowSectionTags = true; /** - * This might be configurable. - * * If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key. * No \[ escaping is currently supported, so the key might not contain the bracket character. * @@ -70,6 +94,35 @@ */ bool allowSubKeys = true; + /** + * Classic INI uses „key=value“ syntax. + * But some other formats/dialects might use key:value. + * + * Only single character separators are supported. + * If multiple separators should be recognized (e.g. both „=“ and „:“), this string will contain all of them, + * i.e. „:=“ does not mean that the „key:=value“ syntax, but „key=value“ or „key:value“. 
+ */ + std::string keyValueSeparators = "="; + + /** + * Classic INI uses „; comment“ syntax. + * But many existing files contain „# comment“ lines. + * + * Only single character separators are supported (works same as keyValueSeparators). + */ + std::string commentSeparators = ";#"; + + /** + * INI often support both "quotes" and 'apostrophes' styles. + * But some dialects may support only one of them or not support quoting at all. + * + * In such case e.g. „key="some value"“ would mean that the value is „"value"“ (including the quotes). + * Thus it is important to allow disabling quote recognizing (which is done by setting this parameter to empty string). + * + * Only single character quotes are supported (works same as keyValueSeparators). + */ + std::string quotes = "\"'"; + int lineNumber = 1; int eventNumber = 0; @@ -111,30 +164,35 @@ } void processContinuingLine(std::stringstream& result) { - if (consumeLeadingSpacesOnContinuingLines) readSpacesAndTabs(); + if (trimLeadingSpacesOnContinuingLines) readSpacesAndTabs(); else result.put('\n'); } - std::string readUntil(char until, bool* found = nullptr) { + std::string readUntil(const char until, bool* found = nullptr) { + return readUntil(std::string(1, until), found); + } + + std::string readUntil(const std::string& until, bool* found = nullptr) { std::stringstream result; - for (char ch = peek(); input.good() && ch != until; ch = peek()) { + for (char ch = peek(); input.good() && !oneOf(ch, until); ch = peek()) { if (ch == '\\') { get(); ch = get(); - if (ch == until && ch == '\n') processContinuingLine(result); - else if (ch == until) result.put(ch); + if (oneOf(ch, until) && ch == '\n') processContinuingLine(result); + else if (oneOf(ch, until)) result.put(ch); else if (ch == std::istream::traits_type::eof()) break; else result.put('\\').put(ch); - // TODO: two-stage and modular unescaping: here unescape only \+LF or more genereally: unescape only the until character and rest leave untouched - // second 
escaping stage move to separate class/wrapper (similar to hierarchical wrappers) + // unescaping is done in two phases: + // here we unescape just the \n (LF) + // other escape sequences are leaved untouched and will be processed in later phases, see see UnescapingINIHandler } else { ch = get(); result.put(ch); } } - if (peek() == until) { + if (oneOf(peek(), until)) { get(); if (found) *found = true; } else { @@ -144,13 +202,17 @@ return result.str(); } - std::string readToken(char until, char* quote = nullptr, bool* found = nullptr) { + std::string readToken(const char until, char* quote = nullptr, bool* found = nullptr) { + return readToken(std::string(1, until), quote, found); + } + + std::string readToken(const std::string& until, char* quote = nullptr, bool* found = nullptr) { std::string result; char ch = peek(); if (isQuote(ch)) { if (quote) *quote = ch; - result = readUntil(get(), found); + result = readUntil(std::string(1, get()), found); } else { if (quote) *quote = 0; result = readUntil(until, found); @@ -160,35 +222,107 @@ } std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) { + return readTokenAndEatTerminator(std::string(1, until), quote, found); + } + + std::string readTokenAndEatTerminator(const std::string& until, char* quote = nullptr, bool* found = nullptr) { std::string result = readToken(until, quote, found); if (*quote) { readAllWhitespace(); - if (get() != until) throw std::logic_error(std::string("missing „") + std::string(1, until) + "“ after quoted section name"); + if (!oneOf(get(), until)) throw std::logic_error(std::string("missing „") + until + "“ after quoted section name"); } return result; } + std::string unescape(const std::string& value, UnescapingProcessor::TextType type) { + std::string result = value; + for (ConfiguredUnescapingProcessor p : unescapingProcessors) if (p.enbaled) result = p.processor->unescape(result, type); + return result; + } + bool isComment(char ch) { - return ch 
== '#' || ch == ';'; + return oneOf(ch, commentSeparators); } bool isQuote(char ch) { - return ch == '"' || ch == '\''; + return oneOf(ch, quotes); + } + + /** + * @param ch character to be evaluated + * @param options list of options (characters) + * @return whether ch is one of options + */ + bool oneOf(char ch, const std::string& options) { + return options.find(ch) != std::string::npos; } std::string trim(std::string s) { return std::regex_replace(s, std::regex("^\\s+|\\s+$"), ""); } + /** + * TODO: use a common method + */ + bool parseBoolean(const std::string& value) { + if (value == "true") return true; + else if (value == "false") return false; + else throw std::invalid_argument(std::string("Unable to parse boolean value: ") + value + " (expecting true or false)"); + } + + void setDialect(const std::string& name) { + if (name == "default-ini") { + // already set + } else if (name == "java-properties") { + trimLeadingSpacesOnContinuingLines = true; + allowSections = false; + allowSectionTags = false; + allowSubKeys = false; + commentSeparators = "#"; + keyValueSeparators = "=:"; + quotes = ""; + // TODO: enable unicode unescaping + } else { + throw std::invalid_argument(std::string("Unsupported INI dialect: ") + name); + } + } + + bool setUnescaping(const std::string& uri, const std::string& value) { + for (ConfiguredUnescapingProcessor& p : unescapingProcessors) { + if (p.uri == uri) { + p.enbaled = parseBoolean(value); + return true; + } + } + return false; + } + public: INIReaderImpl(std::istream& input) : input(input) { } + void setOption(const std::string& uri, const std::string& value) override { + if (uri == "trim-continuing-lines") trimLeadingSpacesOnContinuingLines = parseBoolean(value); // TODO: continuing lines modes (enum), not just boolean + else if (uri == "allow-sections") allowSections = parseBoolean(value); + else if (uri == "allow-section-tags") allowSectionTags = parseBoolean(value); + else if (uri == "allow-sub-keys") allowSubKeys = 
parseBoolean(value); + else if (uri == "comment-separators") commentSeparators = value; + else if (uri == "key-value-separators") keyValueSeparators = value; + else if (uri == "quotes") quotes = value; + else if (uri == "dialect") setDialect(value); + else if (setUnescaping(uri, value)); + else throw std::invalid_argument(std::string("Invalid parser option: „") + uri + "“ with value: „" + value + "“"); + } + void addHandler(INIContentHandler* handler) override { handlers.push_back(handler); } + void addUnescapingProcessor(std::shared_ptr processor, const std::string uri, bool enabledByDefault) override { + unescapingProcessors.push_back({processor, uri, enabledByDefault}); + } + void process() override { for (INIContentHandler* handler : handlers) handler->startDocument(); @@ -196,10 +330,10 @@ while (input.good()) { // TODO: condition { + INIContentHandler::WhitespaceEvent event; + event.lineNumber = lineNumber; std::string whitespace = readAllWhitespace(); if (whitespace.size()) { - INIContentHandler::WhitespaceEvent event; - event.lineNumber = lineNumber; event.eventNumber = ++eventNumber; event.whitespace = whitespace; for (INIContentHandler* handler : handlers) handler->whitespace(event); @@ -213,20 +347,23 @@ if (ch == std::istream::traits_type::eof()) { break; - } else if (ch == '[') { + } else if (ch == '[' && allowSections) { if (inSection) for (INIContentHandler* handler : handlers) handler->endSection(); inSection = true; - get(); - readAllWhitespace(); INIContentHandler::SectionStartEvent event; event.lineNumber = lineNumber; event.eventNumber = ++eventNumber; + get(); + readAllWhitespace(); event.name = readTokenAndEatTerminator(']', "e, &found); + if (!quote) event.name = trim(event.name); + event.name = unescape(event.name, UnescapingProcessor::TextType::SectionName); readSpacesAndTabs(); if (allowSectionTags && peek() == '[') { get(); event.tag = readTokenAndEatTerminator(']', "e, &found); + event.tag = unescape(event.tag, 
UnescapingProcessor::TextType::SectionTag); } readSpacesAndTabs(); @@ -235,6 +372,7 @@ get(); readSpacesAndTabs(); event.comment = readUntil('\n', &found); + event.comment = unescape(event.comment, UnescapingProcessor::TextType::SectionComment); } else if (ch == '\n') { get(); } else { @@ -243,31 +381,33 @@ for (INIContentHandler* handler : handlers) handler->startSection(event); } else if (isComment(ch)) { - get(); - readSpacesAndTabs(); INIContentHandler::CommentEvent event; event.lineNumber = lineNumber; event.eventNumber = ++eventNumber; + get(); + readSpacesAndTabs(); event.comment = readUntil('\n', &found); + event.comment = unescape(event.comment, UnescapingProcessor::TextType::Comment); for (INIContentHandler* handler : handlers) handler->comment(event); } else { - std::string fullKey = readToken('=', "e, &found); + INIContentHandler::EntryEvent event; + event.lineNumber = lineNumber; + event.eventNumber = ++eventNumber; + + std::string fullKey = readToken(keyValueSeparators, "e, &found); if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'"); if (!quote) fullKey = trim(fullKey); readSpacesAndTabs(); if (quote) { ch = get(); - if (ch == '=') readSpacesAndTabs(); + if (oneOf(ch, keyValueSeparators)) readSpacesAndTabs(); else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'"); } std::string value = readToken('\n', "e, &found); if (!quote) value = trim(value); - INIContentHandler::EntryEvent event; - event.lineNumber = lineNumber; - event.eventNumber = ++eventNumber; event.key = fullKey; event.fullKey = fullKey; event.value = value; @@ -278,9 +418,14 @@ event.key = match[1]; event.subKey = match[2]; event.fullKey = fullKey; + event.subKey = unescape(event.subKey, UnescapingProcessor::TextType::EntryKey); } } + event.key = unescape(event.key, UnescapingProcessor::TextType::EntryKey); + event.fullKey = unescape(event.fullKey, UnescapingProcessor::TextType::EntryKey); + event.value = 
unescape(event.value, UnescapingProcessor::TextType::EntryValue); + if (quote) { readSpacesAndTabs(); ch = peek(); @@ -288,9 +433,15 @@ get(); readSpacesAndTabs(); event.comment = readUntil('\n', &found); + event.comment = unescape(event.comment, UnescapingProcessor::TextType::EntryComment); } else if (ch == '\n') { get(); } else { + // TODO: optional support for multiple tokens in a single entry? + // modes: array, concatenate + // some-array-1 = "item 1" "item 2" 'item 3' item 4 + // some-array-2 = "item 1" "item 2" 'item 3' item_4 item_5 + // some-bash-style-string-value = "this "will' be' concatenated → this will be concatenated throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'"); } } diff -r fd669e73d39a -r 0e7c57d48d1e src/lib/INIReader.h --- a/src/lib/INIReader.h Thu Nov 26 11:42:26 2020 +0100 +++ b/src/lib/INIReader.h Sat Nov 28 18:14:15 2020 +0100 @@ -20,6 +20,7 @@ #include #include "INIContentHandler.h" +#include "UnescapingProcessor.h" namespace relpipe { namespace in { @@ -32,7 +33,20 @@ class INIReader { public: virtual ~INIReader() = default; + /** + * TODO: after moving to alt2xml: + * - option will be identified by globally unique URI/IRI + * - parsers will provide catalog of supported options (names, enum values, documentation) + * - options serves as both XML parser features and properties and are mapped to them + */ + virtual void setOption(const std::string& uri, const std::string& value) = 0; + /** + * TODO: after moving to alt2xml: + * - this will be generic handler for SAX event + * - but both sides will know the schema (allowed elements and attributes for INI events) + */ virtual void addHandler(INIContentHandler* handler) = 0; + virtual void addUnescapingProcessor(std::shared_ptr processor, const std::string uri, bool enabledByDefault) = 0; virtual void process() = 0; static INIReader* create(std::istream& input); }; diff -r fd669e73d39a -r 0e7c57d48d1e 
src/lib/JavaPropertiesUnescapingINIHandler.h --- a/src/lib/JavaPropertiesUnescapingINIHandler.h Thu Nov 26 11:42:26 2020 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,105 +0,0 @@ -/** - * Relational pipes - * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, version 3 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -#pragma once - -#include -#include -#include - -#include "UnescapingINIHandler.h" - -using namespace std; -using namespace relpipe::writer; - -namespace relpipe { -namespace in { -namespace ini { -namespace lib { - -class JavaPropertiesUnescapingINIContentHandler : public UnescapingINIContentHandler { -private: - wstring_convert < codecvt_utf8> convertor; // INI parser works with UTF-8 - - bool readHex(const char* hexadecimal, size_t hexLength, uint8_t* resultBuffer, size_t binLength) { - if (hexLength != binLength * 2) return false; - - for (size_t i = 0; i < binLength; i++) { - uint8_t value = 0; - char a = hexadecimal[i * 2]; - char b = hexadecimal[i * 2 + 1]; - - if (a >= '0' && a <= '9') value += (a - '0')*16; - else if (a >= 'a' && a <= 'f') value += (a - 'a' + 10)*16; - else if (a >= 'A' && a <= 'F') value += (a - 'A' + 10)*16; - else return false; - - if (b >= '0' && b <= '9') value += b - '0'; - else if (b >= 'a' && b <= 'f') value += b - 'a' + 10; - else if (b >= 'A' && b <= 'F') value += b - 'A' + 10; - else return false; - - if (resultBuffer) resultBuffer[i] = value; - } - return true; - } - 
-protected: - - virtual std::string unescape(const std::string& s) { - std::stringstream result; - for (int i = 0, length = s.size(); i < length; i++) { - char ch = s[i]; - if (i + 1 < length && ch == ESC) { - ch = s[i + 1]; - if (ch == 'u') { - // TODO: simplify, clean-up, verify (but seems working) - i++; - int hexLength = 4; - if (i + hexLength < length) { - uint16_t u16; - bool hexOK = readHex(s.c_str() + i + 1, hexLength, (uint8_t*) & u16, sizeof (u16)); - if (hexOK) result << convertor.to_bytes(ntohs(u16)); - else throw std::logic_error(std::string("Invalid unicode escape sequence: invalid HEX")); - i += hexLength; - } else { - throw std::logic_error(std::string("Invalid unicode escape sequence: missing characters")); - } - - } else if (ch == ESC && !lastEscaphingPhase) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle - else if (ch == ESC && lastEscaphingPhase) put(result, ESC, i); // unescape \\ to \. - else if (lastEscaphingPhase) throw std::logic_error(std::string("Unsupported escape sequence: ") + ch); - else result.put(ESC); // keep the escape sequence for later unescaping phase - } else if (ch == ESC) { - throw std::logic_error(std::string("Missing escape sequence")); // this should not happen - } else { - result.put(ch); - } - } - return result.str(); - } - -public: - - JavaPropertiesUnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase) : UnescapingINIContentHandler(output, lastEscaphingPhase, true) { - } - -}; - -} -} -} -} diff -r fd669e73d39a -r 0e7c57d48d1e src/lib/JavaPropertiesUnescapingProcessor.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib/JavaPropertiesUnescapingProcessor.h Sat Nov 28 18:14:15 2020 +0100 @@ -0,0 +1,101 @@ +/** + * Relational pipes + * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public 
License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#pragma once + +#include +#include +#include + +#include "UnescapingProcessor.h" + +using namespace std; +using namespace relpipe::writer; + +namespace relpipe { +namespace in { +namespace ini { +namespace lib { + +/** + * Should work according to 3.3. Unicode Escapes + */ +class JavaPropertiesUnescapingProcessor : public UnescapingProcessor { +private: + wstring_convert < codecvt_utf8> convertor; // INI parser works with UTF-8 + + bool readHex(const char* hexadecimal, size_t hexLength, uint8_t* resultBuffer, size_t binLength) { + if (hexLength != binLength * 2) return false; + + for (size_t i = 0; i < binLength; i++) { + uint8_t value = 0; + char a = hexadecimal[i * 2]; + char b = hexadecimal[i * 2 + 1]; + + if (a >= '0' && a <= '9') value += (a - '0')*16; + else if (a >= 'a' && a <= 'f') value += (a - 'a' + 10)*16; + else if (a >= 'A' && a <= 'F') value += (a - 'A' + 10)*16; + else return false; + + if (b >= '0' && b <= '9') value += b - '0'; + else if (b >= 'a' && b <= 'f') value += b - 'a' + 10; + else if (b >= 'A' && b <= 'F') value += b - 'A' + 10; + else return false; + + if (resultBuffer) resultBuffer[i] = value; + } + return true; + } + +public: + + std::string unescape(const std::string& s, const TextType type) override { + std::stringstream result; + for (int i = 0, length = s.size(); i < length; i++) { + char ch = s[i]; + if (i + 1 < length && ch == ESC) { + ch = s[i + 1]; + if (ch == 'u') { + // TODO: simplify, clean-up, verify (but seems working) + i++; + int hexLength = 4; + if (i + hexLength < 
length) { + uint16_t u16; + bool hexOK = readHex(s.c_str() + i + 1, hexLength, (uint8_t*) & u16, sizeof (u16)); + if (hexOK) result << convertor.to_bytes(ntohs(u16)); + else throw std::logic_error(std::string("Invalid unicode escape sequence: invalid HEX")); + i += hexLength; + } else { + throw std::logic_error(std::string("Invalid unicode escape sequence: missing characters")); + } + + } else if (ch == ESC) put(result, ESC, i).put(ESC); // copy and skip even the second \ to avoid its misinterpretation in the next cycle + else result.put(ESC); // keep the escape sequence for later unescaping phase + } else if (ch == ESC) { + throw std::logic_error(std::string("Missing escape sequence")); // this should not happen + } else { + result.put(ch); + } + } + return result.str(); + } + +}; + +} +} +} +} diff -r fd669e73d39a -r 0e7c57d48d1e src/lib/UnescapingINIHandler.h --- a/src/lib/UnescapingINIHandler.h Thu Nov 26 11:42:26 2020 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,108 +0,0 @@ -/** - * Relational pipes - * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, version 3 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- */ -#pragma once - -#include - -#include "INIReader.h" - -using namespace std; -using namespace relpipe::writer; - -namespace relpipe { -namespace in { -namespace ini { -namespace lib { - -class UnescapingINIContentHandler : public INIContentHandler { -private: - INIContentHandler& output; - bool unescapeComments; - -protected: - const char ESC = '\\'; - bool lastEscaphingPhase; - - std::stringstream& put(std::stringstream& result, const char& ch, int& i) { - result.put(ch); - i++; - return result; - } - - virtual std::string unescape(const std::string& s) = 0; - -public: - - /** - * @param output here will be sent events with unescaped values - * @param lastEscaphingPhase instances of UnescapingINIContentHandler might be chained: - * unsupported escaping sequences are kept untouched to be processed in further phases; - * in the last phase, all remaining sequences (including \\) must be recognized and unescaped - * (otherwise the input is considered invalid and an exception is thrown) - */ - UnescapingINIContentHandler(INIContentHandler& output, bool lastEscaphingPhase, bool unescapeComments = false) : output(output), lastEscaphingPhase(lastEscaphingPhase), unescapeComments(unescapeComments) { - } - - void startDocument() override { - output.startDocument(); - } - - void endDocument() override { - output.endDocument(); - } - - void startSection(const SectionStartEvent& event) override { - SectionStartEvent e = event; - e.name = unescape(e.name); - if (unescapeComments) e.comment = unescape(e.comment); - output.startSection(e); - } - - void endSection() override { - output.endSection(); - } - - void entry(const EntryEvent& event) override { - EntryEvent e = event; - e.key = unescape(e.key); - e.fullKey = unescape(e.fullKey); - e.subKey = unescape(e.subKey); - e.value = unescape(e.value); - if (unescapeComments) e.comment = unescape(e.comment); - output.entry(e); - } - - void comment(const CommentEvent& event) override { - if (unescapeComments) { - CommentEvent e = 
event; - e.comment = unescape(e.comment); - output.comment(e); - } else { - output.comment(event); - } - } - - void whitespace(const WhitespaceEvent& event) override { - output.whitespace(event); - } - -}; - -} -} -} -} diff -r fd669e73d39a -r 0e7c57d48d1e src/lib/UnescapingProcessor.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib/UnescapingProcessor.h Sat Nov 28 18:14:15 2020 +0100 @@ -0,0 +1,60 @@ +/** + * Relational pipes + * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#pragma once + +#include + +#include "INIReader.h" + +using namespace std; + +namespace relpipe { +namespace in { +namespace ini { +namespace lib { + +class UnescapingProcessor { /* Abstract base of the unescaping phases: declares unescape() and offers the ESC constant and the put() helper to subclasses. NOTE(review): no virtual destructor is declared; confirm that instances are never destroyed through a plain base-class pointer. */ +private: +protected: + const char ESC = '\\'; /* the escape character, a single backslash */ + + std::stringstream& put(std::stringstream& result, const char& ch, int& i) { /* Writes ch to result, advances the caller's index i by one and returns result so that another write can be chained. */ + result.put(ch); + i++; + return result; + } + +public: + + enum class TextType { /* kind of text passed to unescape() as its second argument */ + SectionName, + SectionComment, + SectionTag, + EntryKey, + EntryValue, + EntryComment, + Comment, + }; + + virtual std::string unescape(const std::string& s, const TextType type) = 0; /* Implemented by subclasses; expected to return s with the escape sequences handled by that subclass resolved, type says which kind of text s is. */ + +}; + +} +} +} +}