enable configuring the parser from CLI: --parser-option v_0
author František Kučera <franta-hg@frantovo.cz>
Fri, 27 Nov 2020 16:29:12 +0100
branch v_0
changeset 23 b497140b0b63
parent 22 29d673a54ecf
child 24 07e0a2edf3bc
enable configuring the parser from CLI: --parser-option
bash-completion.sh
src/CLIParser.h
src/Configuration.h
src/INICommand.cpp
src/INICommand.h
src/lib/INIReader.cpp
src/lib/INIReader.h
--- a/bash-completion.sh	Thu Nov 26 18:52:49 2020 +0100
+++ b/bash-completion.sh	Fri Nov 27 16:29:12 2020 +0100
@@ -27,6 +27,21 @@
 		"false"
 	)
 
+	PARSER_OPTIONS=( # option names offered after "--parser-option"; same names as handled by setOption() in src/lib/INIReader.cpp
+		"trim-continuing-lines"
+		"allow-section-tags"
+		"allow-sub-keys"
+		"comment-separators"
+		"key-value-separators"
+		"quotes"
+		"dialect"
+	);
+
+	DIALECTS=( # values offered for "--parser-option dialect"; same names as accepted by setDialect() in src/lib/INIReader.cpp
+		"default-ini"
+		"java-properties"
+	);
+
 	if   [[ "$w1" == "--relation"                      && "x$w0" == "x" ]];    then COMPREPLY=("''")
 	elif [[ "$w1" == "--enable-sections"                                ]];    then COMPREPLY=($(compgen -W "${BOOLEAN_VALUES[*]}" -- "$w0"))
 	elif [[ "$w1" == "--enable-sub-keys"                                ]];    then COMPREPLY=($(compgen -W "${BOOLEAN_VALUES[*]}" -- "$w0"))
@@ -34,6 +49,14 @@
 	elif [[ "$w1" == "--enable-whitespace"                              ]];    then COMPREPLY=($(compgen -W "${BOOLEAN_VALUES[*]}" -- "$w0"))
 	elif [[ "$w1" == "--enable-line-numbers"                            ]];    then COMPREPLY=($(compgen -W "${BOOLEAN_VALUES[*]}" -- "$w0"))
 	elif [[ "$w1" == "--enable-event-numbers"                           ]];    then COMPREPLY=($(compgen -W "${BOOLEAN_VALUES[*]}" -- "$w0"))
+	elif [[ "$w1" == "--parser-option"                                  ]];    then COMPREPLY=($(compgen -W "${PARSER_OPTIONS[*]}" -- "$w0"))
+	elif [[ "$w2" == "--parser-option" && "$w1" == "trim-continuing-lines"                    ]];    then COMPREPLY=($(compgen -W "${BOOLEAN_VALUES[*]}" -- "$w0"))
+	elif [[ "$w2" == "--parser-option" && "$w1" == "allow-section-tags"                       ]];    then COMPREPLY=($(compgen -W "${BOOLEAN_VALUES[*]}" -- "$w0"))
+	elif [[ "$w2" == "--parser-option" && "$w1" == "allow-sub-keys"                           ]];    then COMPREPLY=($(compgen -W "${BOOLEAN_VALUES[*]}" -- "$w0"))
+	elif [[ "$w2" == "--parser-option" && "$w1" == "dialect"                                  ]];    then COMPREPLY=($(compgen -W "${DIALECTS[*]}" -- "$w0"))
+	elif [[ "$w2" == "--parser-option" && "$w1" == "comment-separators"      && "x$w0" == "x" ]];    then COMPREPLY=("'#;'")
+	elif [[ "$w2" == "--parser-option" && "$w1" == "key-value-separators"    && "x$w0" == "x" ]];    then COMPREPLY=("'=:'")
+	elif [[ "$w2" == "--parser-option" && "$w1" == "quotes"                  && "x$w0" == "x" ]];    then COMPREPLY=("'\"\\''")
 	else
 		OPTIONS=(
 			"--relation"
@@ -43,6 +66,7 @@
 			"--enable-whitespace"
 			"--enable-line-numbers"
 			"--enable-event-numbers"
+			"--parser-option"
 		)
 		COMPREPLY=($(compgen -W "${OPTIONS[*]}" -- "$w0"))
 	fi
--- a/src/CLIParser.h	Thu Nov 26 18:52:49 2020 +0100
+++ b/src/CLIParser.h	Fri Nov 27 16:29:12 2020 +0100
@@ -49,6 +49,7 @@
 public:
 
 	static const relpipe::writer::string_t OPTION_RELATION;
+	static const relpipe::writer::string_t OPTION_PARSER_OPTION;
 	static const relpipe::writer::string_t OPTION_ENABLE_SECTIONS;
 	static const relpipe::writer::string_t OPTION_ENABLE_SUB_KEYS;
 	static const relpipe::writer::string_t OPTION_ENABLE_COMMENTS;
@@ -63,6 +64,7 @@
 			relpipe::writer::string_t option = readNext(arguments, i);
 
 			if (option == OPTION_RELATION) c.relation = readNext(arguments, i);
+			else if (option == OPTION_PARSER_OPTION) c.parserOptions.push_back({readNext(arguments, i), readNext(arguments, i)});
 			else if (option == OPTION_ENABLE_SECTIONS) c.enableSections = parseBoolean(readNext(arguments, i));
 			else if (option == OPTION_ENABLE_SUB_KEYS) c.enableSubKeys = parseBoolean(readNext(arguments, i));
 			else if (option == OPTION_ENABLE_COMMENTS) c.enableComments = parseBoolean(readNext(arguments, i));
@@ -80,6 +82,7 @@
 };
 
 const relpipe::writer::string_t CLIParser::OPTION_RELATION = L"--relation";
+const relpipe::writer::string_t CLIParser::OPTION_PARSER_OPTION = L"--parser-option";
 const relpipe::writer::string_t CLIParser::OPTION_ENABLE_SECTIONS = L"--enable-sections";
 const relpipe::writer::string_t CLIParser::OPTION_ENABLE_SUB_KEYS = L"--enable-sub-keys";
 const relpipe::writer::string_t CLIParser::OPTION_ENABLE_COMMENTS = L"--enable-comments";
--- a/src/Configuration.h	Thu Nov 26 18:52:49 2020 +0100
+++ b/src/Configuration.h	Fri Nov 27 16:29:12 2020 +0100
@@ -26,9 +26,19 @@
 namespace in {
 namespace ini {
 
+/** One „--parser-option <uri> <value>“ pair from the command line, kept as text until it is passed to the INI reader. */
+class ParserOptionRecipe {
+public:
+	relpipe::writer::string_t uri; // name of the parser option (first argument after --parser-option)
+	relpipe::writer::string_t value; // raw, not yet parsed value of the option (second argument)
+
+	ParserOptionRecipe(const relpipe::writer::string_t& uri, const relpipe::writer::string_t& value) : uri(uri), value(value) {} // const& = each string is copied just once
+};
+
 class Configuration {
 public:
 	relpipe::writer::string_t relation = L"ini";
+	std::vector<ParserOptionRecipe> parserOptions;
 	relpipe::writer::boolean_t enableLineNumbers = false;
 	relpipe::writer::boolean_t enableEventNumbers = false;
 	relpipe::writer::boolean_t enableSections = true;
--- a/src/INICommand.cpp	Thu Nov 26 18:52:49 2020 +0100
+++ b/src/INICommand.cpp	Fri Nov 27 16:29:12 2020 +0100
@@ -170,7 +170,7 @@
 void INICommand::process(std::istream& input, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
 	FlatINIContentHandler handler(writer, configuration);
 	std::shared_ptr<INIReader> reader(INIReader::create(input));
-	// TODO: configure the INIReader (features/properties) according to our Configuration (sub-keys etc.)
+	for (ParserOptionRecipe option : configuration.parserOptions) reader->setOption(convertor.to_bytes(option.uri), convertor.to_bytes(option.value));
 	BasicUnescapingINIContentHandler unescapingHandler(handler, false);
 	JavaPropertiesUnescapingINIContentHandler javaHandler(unescapingHandler, true);
 	reader->addHandler(&javaHandler);
--- a/src/INICommand.h	Thu Nov 26 18:52:49 2020 +0100
+++ b/src/INICommand.h	Fri Nov 27 16:29:12 2020 +0100
@@ -30,6 +30,8 @@
 namespace ini {
 
 class INICommand {
+private:
+	wstring_convert < codecvt_utf8<wchar_t>> convertor; // INI parser works with UTF-8
 public:
 	void process(std::istream& input, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration);
 
--- a/src/lib/INIReader.cpp	Thu Nov 26 18:52:49 2020 +0100
+++ b/src/lib/INIReader.cpp	Fri Nov 27 16:29:12 2020 +0100
@@ -33,8 +33,6 @@
 	std::vector<INIContentHandler*> handlers;
 
 	/** 
-	 * This might be configurable.
-	 * 
 	 * By default, we ignore all leading whitespace on continuing lines.
 	 * If there should be some spaces or tabs, they should be placed on the previous line before the „\“.
 	 * If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '.
@@ -42,11 +40,9 @@
 	 * Related specifications:
 	 *  - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html
 	 */
-	bool consumeLeadingSpacesOnContinuingLines = true;
+	bool trimLeadingSpacesOnContinuingLines = true;
 
 	/**
-	 * This might be configurable.
-	 * 
 	 * KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
 	 * Line „[section_1][$i]“ means that the „section_1“ is „locked“.
 	 * We may emit this information somehow later, but for now, it is just ignored.
@@ -59,8 +55,6 @@
 	bool allowSectionTags = true;
 
 	/**
-	 * This might be configurable.
-	 * 
 	 * If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key.
 	 * No \[ escaping is currently supported, so the key might not contain the bracket character.
 	 * 
@@ -71,8 +65,6 @@
 	bool allowSubKeys = true;
 
 	/**
-	 * This might be configurable.
-	 * 
 	 * Classic INI uses „key=value“ syntax.
 	 * But some other formats/dialects might use key:value.
 	 * 
@@ -83,8 +75,6 @@
 	std::string keyValueSeparators = "=";
 
 	/**
-	 * This might be configurable.
-	 * 
 	 * Classic INI uses „; comment“ syntax.
 	 * But many existing files contain „# comment“ lines.
 	 * 
@@ -93,8 +83,6 @@
 	std::string commentSeparators = ";#";
 
 	/**
-	 * This might be configurable.
-	 * 
 	 * INI often support both "quotes" and 'apostrophes' styles.
 	 * But some dialects may support only one of them or not support quoting at all.
 	 * 
@@ -146,7 +134,7 @@
 	}
 
 	void processContinuingLine(std::stringstream& result) {
-		if (consumeLeadingSpacesOnContinuingLines) readSpacesAndTabs();
+		if (trimLeadingSpacesOnContinuingLines) readSpacesAndTabs();
 		else result.put('\n');
 	}
 
@@ -237,11 +225,49 @@
 		return std::regex_replace(s, std::regex("^\\s+|\\s+$"), "");
 	}
 
+	/**
+	 * Maps exactly "true"/"false" to bool and throws std::invalid_argument for any other text. TODO: use a common method
+	 */
+	bool parseBoolean(const std::string& value) {
+		if (value == "true") return true;
+		else if (value == "false") return false;
+		else throw std::invalid_argument(std::string("Unable to parse boolean value: ") + value + " (expecting true or false)");
+	}
+
+	void setDialect(const std::string& name) { // applies the preset of parser settings for the named dialect; throws std::invalid_argument for an unknown name
+		if (name == "default-ini") {
+			// no assignment: relies on the values the members already hold (initial defaults unless changed by an earlier option – those are not reset here)
+		} else if (name == "java-properties") {
+			trimLeadingSpacesOnContinuingLines = true;
+			allowSectionTags = false;
+			allowSubKeys = false;
+			commentSeparators = "#";
+			keyValueSeparators = "=:";
+			quotes = ""; // NOTE(review): presumably an empty set means that no quoting is recognized – confirm
+			// TODO: allowSections = false;
+			// TODO: enable unicode unescaping
+		} else {
+			throw std::invalid_argument(std::string("Unsupported INI dialect: ") + name);
+		}
+	}
+
 public:
 
 	INIReaderImpl(std::istream& input) : input(input) {
 	}
 
+	void setOption(const std::string& uri, const std::string& value) override { // sets one parser setting by its name; unknown name or unparsable boolean value throws std::invalid_argument
+		if (uri == "trim-continuing-lines") trimLeadingSpacesOnContinuingLines = parseBoolean(value); // TODO: continuing lines modes (enum), not just boolean
+		// TODO: else if (uri == "allow-sections") allowSections = parseBoolean(value);
+		else if (uri == "allow-section-tags") allowSectionTags = parseBoolean(value);
+		else if (uri == "allow-sub-keys") allowSubKeys = parseBoolean(value);
+		else if (uri == "comment-separators") commentSeparators = value;
+		else if (uri == "key-value-separators") keyValueSeparators = value;
+		else if (uri == "quotes") quotes = value;
+		else if (uri == "dialect") setDialect(value); // a dialect may overwrite several of the settings above at once
+		else throw std::invalid_argument(std::string("Invalid parser option: „") + uri + "“ with value: „" + value + "“");
+	}
+
 	void addHandler(INIContentHandler* handler) override {
 		handlers.push_back(handler);
 	}
--- a/src/lib/INIReader.h	Thu Nov 26 18:52:49 2020 +0100
+++ b/src/lib/INIReader.h	Fri Nov 27 16:29:12 2020 +0100
@@ -32,6 +32,18 @@
 class INIReader {
 public:
 	virtual ~INIReader() = default;
+	/**
+	 * TODO: after moving to alt2xml:
+	 *        - option will be identified by globally unique URI/IRI
+	 *        - parsers will provide catalog of supported options (names, enum values, documentation)
+	 *        - options serve as both XML parser features and properties and are mapped to them
+	 */
+	virtual void setOption(const std::string& uri, const std::string& value) = 0;
+	/**
+	 * TODO: after moving to alt2xml:
+	 *        - this will be a generic handler for SAX events
+	 *        - but both sides will know the schema (allowed elements and attributes for INI events)
+	 */
 	virtual void addHandler(INIContentHandler* handler) = 0;
 	virtual void process() = 0;
 	static INIReader* create(std::istream& input);