simple INI parser based on regular expressions v_0
author František Kučera <franta-hg@frantovo.cz>
Sat, 21 Nov 2020 20:09:18 +0100
branch v_0
changeset 1 3876a9c56a66
parent 0 16c7fa9b7c49
child 2 f031a4dc7c52
simple INI parser based on regular expressions patterns taken from alt2xml: - https://alt2xml.globalcode.info/ - https://hg.frantovo.cz/alt2xml/file/94081a55bf41/java/alt2xml-in-ini/src/cz/frantovo/alt2xml/in/ini/Reader.java#l151
src/INICommand.cpp
src/lib/INIContentHandler.h
src/lib/INIReader.cpp
src/lib/INIReader.h
--- a/src/INICommand.cpp	Sat Nov 21 18:26:39 2020 +0100
+++ b/src/INICommand.cpp	Sat Nov 21 20:09:18 2020 +0100
@@ -68,6 +68,7 @@
 		vector<AttributeMetadata> metadata;
 		metadata.push_back({L"section", TypeId::STRING});
 		metadata.push_back({L"key", TypeId::STRING});
+		metadata.push_back({L"subkey", TypeId::STRING});
 		metadata.push_back({L"value", TypeId::STRING});
 		writer->startRelation(configuration.relation, metadata, true);
 	};
@@ -84,14 +85,15 @@
 		currentSection.pop_back();
 	};
 
-	void entry(const std::string& key, const std::string& value) override {
+	void entry(const std::string& key, const std::string& subkey, const std::string& value) override { // writes four attributes in the order declared in the relation metadata: section, key, subkey, value
 		writer->writeAttribute(convertor.from_bytes(getCurrentSectionFullName()));
 		writer->writeAttribute(convertor.from_bytes(key));
+		writer->writeAttribute(convertor.from_bytes(subkey)); // the regex-based INIReader passes an empty string here when the key has no [subkey] part
 		writer->writeAttribute(convertor.from_bytes(value));
 	};
 
 	// TODO: handle also comments and whitespace (to allow lossless transformation from INI and back to INI)
-	// TODO: handle also subkeys (in [] brackets in the key)
+	// TODO: make subkeys (in [] brackets in the key) optional/configurable
 
 };
 
--- a/src/lib/INIContentHandler.h	Sat Nov 21 18:26:39 2020 +0100
+++ b/src/lib/INIContentHandler.h	Sat Nov 21 20:09:18 2020 +0100
@@ -25,5 +25,5 @@
 	virtual void endDocument() = 0;
 	virtual void startSection(const std::string& name) = 0;
 	virtual void endSection() = 0;
-	virtual void entry(const std::string& key, const std::string& value) = 0;
+	virtual void entry(const std::string& key, const std::string& subkey, const std::string& value) = 0;
 };
\ No newline at end of file
--- a/src/lib/INIReader.cpp	Sat Nov 21 18:26:39 2020 +0100
+++ b/src/lib/INIReader.cpp	Sat Nov 21 20:09:18 2020 +0100
@@ -16,6 +16,7 @@
  */
 
 #include <vector>
+#include <regex>
 
 #include "INIReader.h"
 
@@ -33,34 +34,55 @@
 	}
 
 	void process() override {
-		
-		// TODO: real parser instead of demo data
-		for (INIContentHandler* handler : handlers) {
-			handler->startDocument();
-			
-			handler->entry("key-0", "outside sections");
-			
-			handler->startSection("section-1");
-			handler->entry("key-1", "in section 1");
-			handler->entry("key-2", "in section 1");
-			handler->entry("key-3", "in section 1");
-			
-			handler->startSection("nested-section-1-1");
-			handler->entry("key-1", "in nested section 1-1");
-			handler->entry("key-2", "in nested section 1-1");
-			handler->endSection();
-			
-			handler->endSection();
-			
-			handler->startSection("section-2");
-			handler->entry("key-1", "in section 2");
-			handler->endSection();
-			
-			handler->entry("key-666", "outside sections again; this normally would not happen, but should be supported");
-			
-			handler->endDocument();
+
+		for (INIContentHandler* handler : handlers) handler->startDocument();
+
+		std::regex whitespacePattrern("\\s*");
+		std::regex commentPattrern("\\s*(;|#)\\s*(.*)");
+		std::regex sectionPattrern("\\s*\\[\\s*([^\\]]+)\\s*\\]\\s*");
+		std::regex entryQuotesPattrern("\\s*([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?\\s*=\\s*\"([^']+)\"\\s*((;|#)\\s*(.*))?");
+		std::regex entryApostrophesPattrern("\\s*([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?\\s*=\\s*'([^']+)'\\s*((;|#)\\s*(.*))?");
+		std::regex entryPlainPattrern("\\s*([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?\\s*=\\s*(.*)");
+
+		std::smatch match;
+		std::string section;
+		std::string line;
+
+		while (std::getline(input, line)) {
+
+			if (std::regex_match(line, match, whitespacePattrern)) {
+				// TODO: support also whitespace
+			} else if (std::regex_match(line, match, commentPattrern)) {
+				// TODO: support also comments + emit also the comment style (;/#)
+			} else if (std::regex_match(line, match, sectionPattrern)) {
+				if (section.size()) for (INIContentHandler* handler : handlers) handler->endSection();
+				section = match[1];
+				for (INIContentHandler* handler : handlers) handler->startSection(section);
+			} else if (std::regex_match(line, match, entryQuotesPattrern) || std::regex_match(line, match, entryApostrophesPattrern)) {
+				// TODO: support also comments + emit also the comment style (;/#)
+				// TODO: emit also the quote style ('/"/) and surrounding whitespace
+				for (INIContentHandler* handler : handlers) handler->entry(match[1], match[3], match[4]);
+			} else if (std::regex_match(line, match, entryPlainPattrern)) {
+				for (INIContentHandler* handler : handlers) handler->entry(match[1], match[3], match[4]);
+			} else {
+				// TODO: warning, error, or support unknown content
+			}
+
+			// TODO: probably switch to state-machine approach instead of regular expressions
+			// TODO: warning/error handler
+			// TODO: support also multiline content (\ + \n)
+			// TODO: support also quoted or multiline keys?
+			// TODO: support also escaped characters
+			// TODO: support also Java .properties and manifest.mf formats?
+			// TODO: support also nested sections – hierarchy
+			// TODO: support also option for alternative key-value separator (: instead of =)
+			// TODO: support also other encodings (currently only UTF-8 is supported)
+			// TODO: emit line numbers and/or event order?
 		}
-		
+
+		if (section.size()) for (INIContentHandler* handler : handlers) handler->endSection();
+
+		for (INIContentHandler* handler : handlers) handler->endDocument();
 	}
 };
 
--- a/src/lib/INIReader.h	Sat Nov 21 18:26:39 2020 +0100
+++ b/src/lib/INIReader.h	Sat Nov 21 20:09:18 2020 +0100
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <string>
+#include <istream>
 
 #include "INIContentHandler.h"