optionally read data types from the CSV header: --read-types (complements relpipe-out-csv --write-types true) v_0
author František Kučera <franta-hg@frantovo.cz>
Sun, 18 Apr 2021 18:20:09 +0200
branch v_0
changeset 20 90ae67de2f68
parent 19 0d858e0eedf8
child 21 22eb4838e8d0
optionally read data types from the CSV header: --read-types (complements relpipe-out-csv --write-types true)
bash-completion.sh
src/CLIParser.h
src/CSVCommand.cpp
src/Configuration.h
--- a/bash-completion.sh	Sun Apr 18 10:53:28 2021 +0200
+++ b/bash-completion.sh	Sun Apr 18 18:20:09 2021 +0200
@@ -28,13 +28,21 @@
 		"boolean"
 	)
 
+	READ_TYPES=(
+		"auto"
+		"true"
+		"false"
+	)
+
 	if   [[ "$w1" == "--relation"                      && "x$w0" == "x" ]];    then COMPREPLY=("''")
 	elif [[ "$w1" == "--attribute"                     && "x$w0" == "x" ]];    then COMPREPLY=("''")
 	elif [[ "$w2" == "--attribute"                                      ]];    then COMPREPLY=($(compgen -W "${DATA_TYPE[*]}" -- "$w0"))
+	elif [[ "$w1" == "--read-types"                                     ]];    then COMPREPLY=($(compgen -W "${READ_TYPES[*]}" -- "$w0"))
 	else
 		OPTIONS=(
 			"--relation"
 			"--attribute"
+			"--read-types"
 		)
 		COMPREPLY=($(compgen -W "${OPTIONS[*]}" -- "$w0"))
 	fi
--- a/src/CLIParser.h	Sun Apr 18 10:53:28 2021 +0200
+++ b/src/CLIParser.h	Sun Apr 18 18:20:09 2021 +0200
@@ -57,10 +57,18 @@
 		else throw relpipe::cli::RelpipeCLIException(L"Unable to parse TypeId: " + value, relpipe::cli::CLI::EXIT_CODE_BAD_CLI_ARGUMENTS);
 	}
 
+	Configuration::ReadTypes parseReadTypes(const relpipe::writer::string_t& value) { // maps the --read-types argument to Configuration::ReadTypes
+		if (value == L"auto") return Configuration::ReadTypes::AUTO;
+		if (value == L"true") return Configuration::ReadTypes::TRUE;
+		if (value == L"false") return Configuration::ReadTypes::FALSE;
+		throw relpipe::cli::RelpipeCLIException(L"Unable to parse ReadTypes: " + value, relpipe::cli::CLI::EXIT_CODE_BAD_CLI_ARGUMENTS); // any other value is rejected as a bad CLI argument
+	}
+
 public:
 
 	static const relpipe::writer::string_t OPTION_RELATION;
 	static const relpipe::writer::string_t OPTION_ATTRIBUTE;
+	static const relpipe::writer::string_t OPTION_READ_TYPES;
 
 	Configuration parse(const std::vector<relpipe::writer::string_t>& arguments) {
 		Configuration c;
@@ -75,6 +83,8 @@
 				attribute.name = readNext(arguments, i);
 				attribute.type = parseTypeId(readNext(arguments, i));
 				c.attributes.push_back(attribute);
+			} else if (option == OPTION_READ_TYPES) {
+				c.readTypes = parseReadTypes(readNext(arguments, i));
 			} else throw relpipe::cli::RelpipeCLIException(L"Unsupported CLI option: " + option, relpipe::cli::CLI::EXIT_CODE_BAD_CLI_ARGUMENTS);
 		}
 
@@ -87,6 +97,7 @@
 
 const relpipe::writer::string_t CLIParser::OPTION_RELATION = L"--relation";
 const relpipe::writer::string_t CLIParser::OPTION_ATTRIBUTE = L"--attribute";
+const relpipe::writer::string_t CLIParser::OPTION_READ_TYPES = L"--read-types";
 
 }
 }
--- a/src/CSVCommand.cpp	Sun Apr 18 10:53:28 2021 +0200
+++ b/src/CSVCommand.cpp	Sun Apr 18 18:20:09 2021 +0200
@@ -17,6 +17,7 @@
 #include <cstdlib>
 #include <vector>
 #include <memory>
+#include <locale>
 #include <regex>
 #include <algorithm>
 #include <unistd.h>
@@ -87,6 +88,47 @@
 	return false;
 }
 
+/**
+ * Reads data types that may be encoded in the CSV header attribute names: name::type e.g. some_attribute::integer
+ * Does nothing for --read-types false. The names and types in metadata are rewritten only when every attribute carries
+ * a valid type; otherwise the metadata stays untouched in auto mode and an exception is thrown in true mode.
+ *
+ * TODO: share this code through relpipe-lib-infertypes (when available)
+ */
+void tryParseTypes(vector<AttributeMetadata>& metadata, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
+	const bool required = configuration.readTypes == Configuration::ReadTypes::TRUE;
+	if (!required && configuration.readTypes != Configuration::ReadTypes::AUTO) return;
+
+	const std::wregex typedName(L"(.*)::(.*)"); // greedy first group: the type is whatever follows the last "::"
+	std::vector<string_t> parsedNames;
+	std::vector<TypeId> parsedTypes;
+	bool allTyped = true;
+
+	for (const AttributeMetadata& attribute : metadata) {
+		std::wsmatch parts;
+		if (!std::regex_match(attribute.attributeName, parts, typedName)) {
+			allTyped = false;
+			continue;
+		}
+		parsedNames.push_back(parts[1]);
+		try {
+			parsedTypes.push_back(writer->toTypeId(parts[2]));
+		} catch (...) {
+			if (required) throw; // explicitly requested types must be valid, so the original exception is passed on
+			allTyped = false; // auto mode: ignore exception and keep original names and default type (string)
+		}
+	}
+
+	if (allTyped) {
+		for (std::size_t i = 0; i < metadata.size(); i++) {
+			metadata[i].attributeName = parsedNames[i];
+			metadata[i].typeId = parsedTypes[i];
+		}
+	} else if (required) {
+		throw RelpipeWriterException(L"Types were expected in the CSV header, but not found.");
+	}
+}
+
 void CSVCommand::process(std::istream& input, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
 	wstring_convert < codecvt_utf8<wchar_t>> convertor; // UTF-8 is required for CSV
 	vector<AttributeMetadata> metadata;
@@ -105,12 +147,6 @@
 			metadata.push_back(am);
 			if (lastInRecord) {
 
-				// TODO: allow types on CLI and names from CSV?
-				// TODO: allow types on the second line of the CSV?
-				// TODO: allow regex pattern+replacement for extracting name and type from the first line of the CSV?
-				// TODO: allow attribute filtering, subset, like relpipe-tr-cur?
-				// TODO: allow skipping lines, like tail -n +2 ?
-				
 				vector<string_t> firstLine;
 
 				if (metadata.size() == configuration.attributes.size()) {
@@ -120,7 +156,8 @@
 						metadata[i].typeId = configuration.attributes[i].type;
 					}
 				} else if (configuration.attributes.size() == 0) {
-					// first line contains attribute names and type is always string
+					// first line contains attribute names and maybe also types
+					tryParseTypes(metadata, writer, configuration);
 				} else {
 					throw RelpipeWriterException(L"Declared attribute count (" + std::to_wstring(configuration.attributes.size()) + L") does not match with number of columns of the first line (" + std::to_wstring(metadata.size()) + L")");
 				}
--- a/src/Configuration.h	Sun Apr 18 10:53:28 2021 +0200
+++ b/src/Configuration.h	Sun Apr 18 18:20:09 2021 +0200
@@ -39,6 +39,14 @@
 
 class Configuration {
 public:
+
+	enum class ReadTypes { // NOTE(review): TRUE/FALSE are macro names in some platform headers — confirm none is included before this one
+		AUTO, // take types from the CSV header only if every attribute name is name::type with a valid type, otherwise keep the header as plain names
+		TRUE, // types are required in the CSV header; the header parser throws if they are missing or invalid
+		FALSE, // the header parser does not look for types in the attribute names
+	};
+
+	ReadTypes readTypes = ReadTypes::AUTO;
 	relpipe::writer::string_t relation = L"csv";
 	std::vector<AttributeRecipe> attributes;