# HG changeset patch # User František Kučera # Date 1606061472 -3600 # Node ID b9a3c806468afc0fa435c93213ae24581784d07e # Parent f4fb07ed875322dcc8d7cf91b743d7e4b4989243 temporary copy INIReader.h, INIReader.cpp, INIContentHandler.h from relpipe-in-ini + XMLNameCodec.h from relpipe-in-yamltable (will be moved to alt2xml and shared) diff -r f4fb07ed8753 -r b9a3c806468a nbproject/configurations.xml --- a/nbproject/configurations.xml Sun Nov 22 17:06:17 2020 +0100 +++ b/nbproject/configurations.xml Sun Nov 22 17:11:12 2020 +0100 @@ -42,6 +42,9 @@ + + INIReader.cpp + XMLDocumentConstructor.h relpipe-in-xmltable.cpp @@ -102,6 +105,10 @@ true + + + + diff -r f4fb07ed8753 -r b9a3c806468a src/CMakeLists.txt --- a/src/CMakeLists.txt Sun Nov 22 17:06:17 2020 +0100 +++ b/src/CMakeLists.txt Sun Nov 22 17:11:12 2020 +0100 @@ -29,6 +29,7 @@ # Executable output: add_executable( ${EXECUTABLE_FILE} + lib/INIReader.cpp relpipe-in-xmltable.cpp ) diff -r f4fb07ed8753 -r b9a3c806468a src/lib/INIContentHandler.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib/INIContentHandler.h Sun Nov 22 17:11:12 2020 +0100 @@ -0,0 +1,50 @@ +/** + * Relational pipes + * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#pragma once + +#include + +class INIContentHandler { +public: + + class Event { + public: + int64_t eventNumber = -1; + int64_t lineNumber = -1; + std::string comment; + }; + + class SectionStartEvent : public Event { + public: + std::string name; + }; + + class EntryEvent : public Event { + public: + std::string key; + std::string subKey; + std::string fullKey; + std::string value; + }; + + virtual ~INIContentHandler() = default; + virtual void startDocument() = 0; + virtual void endDocument() = 0; + virtual void startSection(const SectionStartEvent& event) = 0; + virtual void endSection() = 0; + virtual void entry(const EntryEvent& event) = 0; +}; \ No newline at end of file diff -r f4fb07ed8753 -r b9a3c806468a src/lib/INIReader.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib/INIReader.cpp Sun Nov 22 17:11:12 2020 +0100 @@ -0,0 +1,105 @@ +/** + * Relational pipes + * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "INIReader.h" + +class INIReaderImpl : public INIReader { +private: + std::istream& input; + std::vector handlers; +public: + + INIReaderImpl(std::istream& input) : input(input) { + } + + void addHandler(INIContentHandler* handler) override { + handlers.push_back(handler); + } + + void process() override { + + for (INIContentHandler* handler : handlers) handler->startDocument(); + + std::regex whitespacePattrern("\\s*"); + std::regex commentPattrern("\\s*(;|#)\\s*(.*)"); + std::regex sectionPattrern("\\s*\\[\\s*([^\\]]+)\\s*\\]\\s*"); + std::regex entryQuotesPattrern(/***/"\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*\"([^']+)\"\\s*((;|#)\\s*(.*))?"); + std::regex entryApostrophesPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*'([^']+)'\\s*((;|#)\\s*(.*))?"); + std::regex entryPlainPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(.*)"); + + std::smatch match; + bool inSection = false; + std::string line; + int lineNumber = 0; + int eventNumber = 0; + + + while (std::getline(input, line)) { + lineNumber++; + + if (std::regex_match(line, match, whitespacePattrern)) { + // TODO: support also whitespace + } else if (std::regex_match(line, match, commentPattrern)) { + // TODO: support also comments + emit also the comment style (;/#) + } else if (std::regex_match(line, match, sectionPattrern)) { + if (inSection) for (INIContentHandler* handler : handlers) handler->endSection(); + inSection = true; + INIContentHandler::SectionStartEvent event; + event.lineNumber = lineNumber; + event.eventNumber = ++eventNumber; + event.name = match[1]; + // TODO: support also comments + emit also the comment style (;/#) + for (INIContentHandler* handler : handlers) handler->startSection(event); + } else if (std::regex_match(line, match, entryQuotesPattrern) || std::regex_match(line, match, entryApostrophesPattrern) || std::regex_match(line, match, entryPlainPattrern)) { + INIContentHandler::EntryEvent event; + event.lineNumber = lineNumber; + event.eventNumber = ++eventNumber; + event.key = match[2]; + event.subKey = match[4]; + event.fullKey = match[1]; + event.value = match[5]; + if (match.size() == 9) event.comment = match[8]; + // TODO: emit also the quote style ('/"/) and surrounding whitespace + for (INIContentHandler* handler : handlers) handler->entry(event); + } else { + // TODO: warning, error, or support unknown content + } + + // TODO: probably switch to state-machine approach instead of regular expressions + // TODO: warning/error handler + // TODO: support also multiline content (\ + \n) + // TODO: support also quoted or multiline keys? + // TODO: support also escaped characters + // TODO: support also Java .properties and manifest.mf formats? + // TODO: support also nested sections – hierarchy + // TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections + // TODO: support also option for alternative key-value separator (: instead of =) + // TODO: support also other encodings (currently only UTF-8 is supported) + } + + if (inSection) for (INIContentHandler* handler : handlers) handler->endSection(); + + for (INIContentHandler* handler : handlers) handler->endDocument(); + } +}; + +INIReader* INIReader::create(std::istream& input) { + return new INIReaderImpl(input); +} diff -r f4fb07ed8753 -r b9a3c806468a src/lib/INIReader.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib/INIReader.h Sun Nov 22 17:11:12 2020 +0100 @@ -0,0 +1,33 @@ +/** + * Relational pipes + * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#pragma once + +#include +#include + +#include "INIContentHandler.h" + +/** + * TODO: Files in the src/lib directory will be moved to alt2xml and used as a shared library. + */ +class INIReader { +public: + virtual ~INIReader() = default; + virtual void addHandler(INIContentHandler* handler) = 0; + virtual void process() = 0; + static INIReader* create(std::istream& input); +}; diff -r f4fb07ed8753 -r b9a3c806468a src/lib/XMLNameCodec.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/lib/XMLNameCodec.h Sun Nov 22 17:11:12 2020 +0100 @@ -0,0 +1,127 @@ +/** + * Relational pipes + * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info) + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#pragma once + +#include +#include +#include + +#include + +namespace relpipe { +namespace in { +namespace xmltable { + +class XMLNameCodec { +private: + static const char DEFAULT_ESCAPING_CHARACTER = '_'; + const char esc; + const bool namespaceAware; + + bool between(gunichar codepoint, gunichar start, gunichar end) { + return codepoint >= start && codepoint <= end; + } + + /** + * https://www.w3.org/TR/REC-xml/#NT-NameStartChar + * + * @param codepoint unicode character + * @return whether this character is allowed at the beginning of a XML name + */ + bool isValidNameStartChar(gunichar codepoint) { + // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] + // | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] + // | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] + // | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] + return (codepoint == ':' && !namespaceAware) || between(codepoint, 'A', 'Z') || codepoint == '_' || between(codepoint, 'a', 'z') + || between(codepoint, 0xC0, 0xD6) || between(codepoint, 0xD8, 0xF6) || between(codepoint, 0xF8, 0x2FF) || between(codepoint, 0x370, 0x37D) || between(codepoint, 0x37F, 0x1FFF) + || between(codepoint, 0x200C, 0x200D) || between(codepoint, 0x2070, 0x218F) || between(codepoint, 0x2C00, 0x2FEF) || between(codepoint, 0x3001, 0xD7FF) + || between(codepoint, 0xF900, 0xFDCF) || between(codepoint, 0xFDF0, 0xFFFD) || between(codepoint, 0x10000, 0xEFFFF); + } + + /** + * https://www.w3.org/TR/REC-xml/#NT-NameChar + * + * @param codepoint unicode character + * @return whether this character is allowed in a XML name + */ + bool isValidNameChar(gunichar codepoint) { + // NameChar ::= NameStartChar | "-" | "." | [0-9] + // | #xB7 + // | [#x0300-#x036F] | [#x203F-#x2040] + return isValidNameStartChar(codepoint) || codepoint == '-' || codepoint == '.' || between(codepoint, '0', '9') + || codepoint == 0xB7 + || between(codepoint, 0x0300, 0x036F) || between(codepoint, 0x203F, 0x2040); + } + +public: + + XMLNameCodec() : XMLNameCodec(DEFAULT_ESCAPING_CHARACTER, true) { + } + + /** + * @param escapingCharacter must be valid character allowed not only in the middle of the XML name but also as the + * first character of the name + * @param namespaceAware colon character is reserved as a separator of the prefix and the local name, see + * https://www.w3.org/TR/REC-xml-names/#NT-NCName + * @throws std::invalid_argument if escapingCharacter is not valid + */ + XMLNameCodec(const char escapingCharacter, const bool namespaceAware) : esc(escapingCharacter), namespaceAware(namespaceAware) { + // TODO: allow also characters like #xB7 and add another escaping if they occur at the beginning of the name? + if (!isValidNameStartChar(esc)) { + throw std::invalid_argument("The character „" + std::to_string(escapingCharacter) + "“ is not allowed at the beginning of a XML name and thus not usable for escaping"); + } + } + + virtual ~XMLNameCodec() { + } + + /** + * @param name any string + * @return valid name of XML element or attribute + */ + Glib::ustring encode(Glib::ustring name) { + if (name.empty()) { + return Glib::ustring(1, esc); + } else { + std::stringstream result; + + for (int i = 0; i < name.size(); i++) { + gunichar codepoint = name[i]; + if (codepoint == esc) { + result.put(esc); + result.put(esc); + continue; + } else if ((i == 0 && isValidNameStartChar(codepoint)) || (i > 0 && isValidNameChar(codepoint))) { + result << Glib::ustring(1, codepoint); + continue; + } + + result.put(esc); + result << Glib::ustring::format(std::hex, std::setfill(L'0'), std::setw(2), codepoint); + result.put(esc); + } + + return result.str(); + } + } + +}; + +} +} +}