src/CSVCommand.cpp
branchv_0
changeset 20 90ae67de2f68
parent 16 15ee963675af
child 21 22eb4838e8d0
--- a/src/CSVCommand.cpp	Sun Apr 18 10:53:28 2021 +0200
+++ b/src/CSVCommand.cpp	Sun Apr 18 18:20:09 2021 +0200
@@ -17,6 +17,7 @@
 #include <cstdlib>
 #include <vector>
 #include <memory>
+#include <locale>
 #include <regex>
 #include <algorithm>
 #include <unistd.h>
@@ -87,6 +88,47 @@
 	return false;
 }
 
+/**
+ * Data types might be encoded in the attribute names: name::type e.g. some_attribute::integer
+ * AUTO: names and types are applied only if every attribute carries a valid ::type suffix.
+ * TRUE: a missing suffix or a type name rejected by toTypeId() ends with an exception.
+ * Any other readTypes value: nothing is done (metadata stay untouched).
+ *
+ * TODO: share this code through relpipe-lib-infertypes (when available)
+ */
+void tryParseTypes(vector<AttributeMetadata>& metadata, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
+	const bool required = configuration.readTypes == Configuration::ReadTypes::TRUE;
+	if (!required && configuration.readTypes != Configuration::ReadTypes::AUTO) return; // disabled: do not even compile the regex
+
+	std::wregex pattern(L"(.*)::(.*)"); // first group is greedy, so the type is what follows the last "::"
+	std::wsmatch match;
+	bool hasTypes = true;
+	std::vector<TypeId> types;
+	std::vector<string_t> names;
+	for (AttributeMetadata& am : metadata) {
+		if (!std::regex_match(am.attributeName, match, pattern)) {
+			hasTypes = false;
+			continue;
+		}
+		names.push_back(match[1]);
+		try {
+			types.push_back(writer->toTypeId(match[2]));
+		} catch (...) {
+			if (required) throw; // TRUE: must be valid type name, so the original exception is passed on
+			hasTypes = false; // AUTO: ignore exception and keep original names and default type (string)
+		}
+	}
+
+	if (hasTypes) { // only reached with names and types complete, i.e. as long as metadata
+		for (std::size_t i = 0; i < metadata.size(); i++) {
+			metadata[i].attributeName = names[i];
+			metadata[i].typeId = types[i];
+		}
+	} else if (required) {
+		throw RelpipeWriterException(L"Types were expected in the CSV header, but not found.");
+	}
+}
+
 void CSVCommand::process(std::istream& input, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
 	wstring_convert < codecvt_utf8<wchar_t>> convertor; // UTF-8 is required for CSV
 	vector<AttributeMetadata> metadata;
@@ -105,12 +147,6 @@
 			metadata.push_back(am);
 			if (lastInRecord) {
 
-				// TODO: allow types on CLI and names from CSV?
-				// TODO: allow types on the second line of the CSV?
-				// TODO: allow regex pattern+replacement for extracting name and type from the first line of the CSV?
-				// TODO: allow attribute filtering, subset, like relpipe-tr-cur?
-				// TODO: allow skipping lines, like tail -n +2 ?
-				
 				vector<string_t> firstLine;
 
 				if (metadata.size() == configuration.attributes.size()) {
@@ -120,7 +156,8 @@
 						metadata[i].typeId = configuration.attributes[i].type;
 					}
 				} else if (configuration.attributes.size() == 0) {
-					// first line contains attribute names and type is always string
+					// first line contains attribute names and maybe also types
+					tryParseTypes(metadata, writer, configuration);
 				} else {
 					throw RelpipeWriterException(L"Declared attribute count (" + std::to_wstring(configuration.attributes.size()) + L") does not match with number of columns of the first line (" + std::to_wstring(metadata.size()) + L")");
 				}