src/lib/INIReader.cpp
author František Kučera <franta-hg@frantovo.cz>
Mon, 23 Nov 2020 18:13:27 +0100
branchv_0
changeset 21 ccd0677746ce
parent 20 fc8f9aab211d
child 22 817c83a3efab
permissions -rw-r--r--
trim whitespace characters after plain (unquoted) values

/**
 * Relational pipes
 * Copyright © 2020 František Kučera (Frantovo.cz, GlobalCode.info)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <vector>
#include <regex>

#include "INIReader.h"

class INIReaderImpl : public INIReader {
private:
	std::istream& input;
	std::vector<INIContentHandler*> handlers;
public:

	INIReaderImpl(std::istream& input) : input(input) {
	}

	void addHandler(INIContentHandler* handler) override {
		handlers.push_back(handler);
	}

	void process() override {

		for (INIContentHandler* handler : handlers) handler->startDocument();

		std::regex whitespacePattrern("\\s*");
		std::regex commentPattrern("\\s*(;|#)\\s*(.*)");
		std::regex sectionPattrern("\\s*\\[\\s*([^\\]]+)\\s*\\]\\s*(\\[\\s*([^\\]]+)\\s*\\])?\\s*((;|#)\\s*(.*))?");
		std::regex entryQuotesPattrern(/***/"\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*\"([^']+)\"\\s*((;|#)\\s*(.*))?");
		std::regex entryApostrophesPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*'([^']+)'\\s*((;|#)\\s*(.*))?");
		std::regex entryPlainPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(.+?)\\s*");

		std::smatch match;
		bool inSection = false;
		std::string line;
		int lineNumber = 0;
		int eventNumber = 0;


		while (std::getline(input, line)) {
			lineNumber++;

			if (std::regex_match(line, match, whitespacePattrern)) {
				INIContentHandler::WhitespaceEvent event;
				event.lineNumber = lineNumber;
				event.eventNumber = ++eventNumber;
				event.whitespace = match[0];
				for (INIContentHandler* handler : handlers) handler->whitespace(event);
			} else if (std::regex_match(line, match, commentPattrern)) {
				INIContentHandler::CommentEvent event;
				event.lineNumber = lineNumber;
				event.eventNumber = ++eventNumber;
				event.comment = match[2];
				for (INIContentHandler* handler : handlers) handler->comment(event);
			} else if (std::regex_match(line, match, sectionPattrern)) {
				if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
				inSection = true;
				INIContentHandler::SectionStartEvent event;
				event.lineNumber = lineNumber;
				event.eventNumber = ++eventNumber;
				event.name = match[1];
				event.comment = match[6];
				// event.tag = match[3];
				// KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
				// see <https://userbase.kde.org/KDE_System_Administration/Configuration_Files>, „[$i]“ means that the section is „locked“
				// We may emit this information somehow later, but for now, it is just ignored.
				for (INIContentHandler* handler : handlers) handler->startSection(event);
			} else if (std::regex_match(line, match, entryQuotesPattrern) || std::regex_match(line, match, entryApostrophesPattrern) || std::regex_match(line, match, entryPlainPattrern)) {
				INIContentHandler::EntryEvent event;
				event.lineNumber = lineNumber;
				event.eventNumber = ++eventNumber;
				event.key = match[2];
				event.subKey = match[4];
				event.fullKey = match[1];
				event.value = match[5];
				if (match.size() == 9) event.comment = match[8];
				for (INIContentHandler* handler : handlers) handler->entry(event);
			} else {
				// TODO: warning, error, or support unknown content
			}

			// General feautres:
			// TODO: probably switch to state-machine approach instead of regular expressions or use an existing library
			// TODO: warning/error handler
			// TODO: support also multiline content (\ + \n)
			// TODO: support also quoted or multiline keys?
			// TODO: support also escaped characters
			// TODO: support also Java .properties and manifest.mf formats?
			// TODO: support also quoted sections ["qoted section"] – useful for hierarchy (the path element may contain the separator character)
			// TODO: support also nested sections – hierarchy
			// TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections
			// TODO: support also option for alternative key-value separator (: instead of =)
			// TODO: support also other encodings (currently only UTF-8 is supported)
			
			// Lossless conversions:
			// TODO: emit also the quote style ('/"/)
			// TODO: emit also the comment style (;/#) ?
			// TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ?
			// TODO: emit also the line-end type (LF/CRLF) ?
		}

		if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();

		for (INIContentHandler* handler : handlers) handler->endDocument();
	}
};

/**
 * Factory: builds the regex-based reader implemented in this file.
 *
 * @param input stream the reader will parse; it is passed on by reference
 * @return a reader allocated with new; nothing in this file deletes it,
 * so presumably the caller is responsible for that (confirm against INIReader.h)
 */
INIReader* INIReader::create(std::istream& input) {
	INIReader* reader = new INIReaderImpl(input);
	return reader;
}