optionally read data types from the CSV header: --read-types (complements relpipe-out-csv --write-types true)
--- a/bash-completion.sh Sun Apr 18 10:53:28 2021 +0200
+++ b/bash-completion.sh Sun Apr 18 18:20:09 2021 +0200
@@ -28,13 +28,21 @@
"boolean"
)
+ READ_TYPES=(
+ "auto"
+ "true"
+ "false"
+ )
+
if [[ "$w1" == "--relation" && "x$w0" == "x" ]]; then COMPREPLY=("''")
elif [[ "$w1" == "--attribute" && "x$w0" == "x" ]]; then COMPREPLY=("''")
elif [[ "$w2" == "--attribute" ]]; then COMPREPLY=($(compgen -W "${DATA_TYPE[*]}" -- "$w0"))
+ elif [[ "$w1" == "--read-types" ]]; then COMPREPLY=($(compgen -W "${READ_TYPES[*]}" -- "$w0"))
else
OPTIONS=(
"--relation"
"--attribute"
+ "--read-types"
)
COMPREPLY=($(compgen -W "${OPTIONS[*]}" -- "$w0"))
fi
--- a/src/CLIParser.h Sun Apr 18 10:53:28 2021 +0200
+++ b/src/CLIParser.h Sun Apr 18 18:20:09 2021 +0200
@@ -57,10 +57,18 @@
else throw relpipe::cli::RelpipeCLIException(L"Unable to parse TypeId: " + value, relpipe::cli::CLI::EXIT_CODE_BAD_CLI_ARGUMENTS);
}
+	Configuration::ReadTypes parseReadTypes(const relpipe::writer::string_t& value) { // accepted values: auto | true | false
+		if (value == L"auto") return Configuration::ReadTypes::AUTO;
+		if (value == L"true") return Configuration::ReadTypes::TRUE;
+		if (value == L"false") return Configuration::ReadTypes::FALSE;
+		throw relpipe::cli::RelpipeCLIException(L"Unable to parse ReadTypes: " + value, relpipe::cli::CLI::EXIT_CODE_BAD_CLI_ARGUMENTS); // anything else is a bad CLI argument
+	}
+
public:
static const relpipe::writer::string_t OPTION_RELATION;
static const relpipe::writer::string_t OPTION_ATTRIBUTE;
+ static const relpipe::writer::string_t OPTION_READ_TYPES;
Configuration parse(const std::vector<relpipe::writer::string_t>& arguments) {
Configuration c;
@@ -75,6 +83,8 @@
attribute.name = readNext(arguments, i);
attribute.type = parseTypeId(readNext(arguments, i));
c.attributes.push_back(attribute);
+ } else if (option == OPTION_READ_TYPES) {
+ c.readTypes = parseReadTypes(readNext(arguments, i));
} else throw relpipe::cli::RelpipeCLIException(L"Unsupported CLI option: " + option, relpipe::cli::CLI::EXIT_CODE_BAD_CLI_ARGUMENTS);
}
@@ -87,6 +97,7 @@
const relpipe::writer::string_t CLIParser::OPTION_RELATION = L"--relation";
const relpipe::writer::string_t CLIParser::OPTION_ATTRIBUTE = L"--attribute";
+const relpipe::writer::string_t CLIParser::OPTION_READ_TYPES = L"--read-types";
}
}
--- a/src/CSVCommand.cpp Sun Apr 18 10:53:28 2021 +0200
+++ b/src/CSVCommand.cpp Sun Apr 18 18:20:09 2021 +0200
@@ -17,6 +17,7 @@
#include <cstdlib>
#include <vector>
#include <memory>
+#include <locale>
#include <regex>
#include <algorithm>
#include <unistd.h>
@@ -87,6 +88,47 @@
return false;
}
+/**
+ * Data types might be encoded in the attribute names: name::type e.g. some_attribute::integer
+ * AUTO: names and types are taken from the header only if every attribute has a type accepted by writer->toTypeId().
+ * TRUE: like AUTO, but a missing or unknown type ends with an exception. Any other value: metadata stay untouched.
+ * TODO: share this code through relpipe-lib-infertypes (when available)
+ */
+void tryParseTypes(vector<AttributeMetadata>& metadata, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
+	const bool required = configuration.readTypes == Configuration::ReadTypes::TRUE;
+	if (!required && configuration.readTypes != Configuration::ReadTypes::AUTO) return; // disabled: skip the costly regex construction
+	std::wregex pattern(L"(.*)::(.*)"); // the first group is greedy, so the last "::" separates the name from the type
+	std::wsmatch match;
+	bool hasTypes = true;
+	std::vector<TypeId> types;
+	std::vector<string_t> names;
+
+	for (const AttributeMetadata& am : metadata) {
+		if (!std::regex_match(am.attributeName, match, pattern)) {
+			hasTypes = false; // no name::type here, but the remaining attributes are still checked
+			continue;
+		}
+		names.push_back(match[1]);
+		if (required) {
+			types.push_back(writer->toTypeId(match[2])); // must be valid type name otherwise exception is thrown
+		} else {
+			try {
+				types.push_back(writer->toTypeId(match[2]));
+			} catch (...) {
+				hasTypes = false; // ignore exception and keep original names and default type (string)
+			}
+		}
+	}
+	if (hasTypes) {
+		for (std::size_t i = 0; i < metadata.size(); i++) { // names and types hold one entry per attribute here
+			metadata[i].attributeName = names[i];
+			metadata[i].typeId = types[i];
+		}
+	} else if (required) {
+		throw RelpipeWriterException(L"Types were expected in the CSV header, but not found.");
+	}
+}
+
void CSVCommand::process(std::istream& input, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
wstring_convert < codecvt_utf8<wchar_t>> convertor; // UTF-8 is required for CSV
vector<AttributeMetadata> metadata;
@@ -105,12 +147,6 @@
metadata.push_back(am);
if (lastInRecord) {
- // TODO: allow types on CLI and names from CSV?
- // TODO: allow types on the second line of the CSV?
- // TODO: allow regex pattern+replacement for extracting name and type from the first line of the CSV?
- // TODO: allow attribute filtering, subset, like relpipe-tr-cur?
- // TODO: allow skipping lines, like tail -n +2 ?
-
vector<string_t> firstLine;
if (metadata.size() == configuration.attributes.size()) {
@@ -120,7 +156,8 @@
metadata[i].typeId = configuration.attributes[i].type;
}
} else if (configuration.attributes.size() == 0) {
- // first line contains attribute names and type is always string
+ // first line contains attribute names and maybe also types
+ tryParseTypes(metadata, writer, configuration);
} else {
throw RelpipeWriterException(L"Declared attribute count (" + std::to_wstring(configuration.attributes.size()) + L") does not match with number of columns of the first line (" + std::to_wstring(metadata.size()) + L")");
}
--- a/src/Configuration.h Sun Apr 18 10:53:28 2021 +0200
+++ b/src/Configuration.h Sun Apr 18 18:20:09 2021 +0200
@@ -39,6 +39,14 @@
class Configuration {
public:
+
+ enum class ReadTypes {
+ AUTO, // use name::type from the CSV header only if every attribute has a valid type; otherwise keep the header as is
+ TRUE, // types are required in the CSV header; an exception is thrown if they are missing
+ FALSE, // do not look for types in the CSV header
+ };
+
+ ReadTypes readTypes = ReadTypes::AUTO;
relpipe::writer::string_t relation = L"csv";
std::vector<AttributeRecipe> attributes;