# HG changeset patch
# User František Kučera
# Date 1618762809 -7200
# Node ID 90ae67de2f68e08b0138d3f61472e54e148227ff
# Parent 0d858e0eedf89ae3527529688251c560f4f8a11e
optionally read data types from the CSV header: --read-types (complements relpipe-out-csv --write-types true)

diff -r 0d858e0eedf8 -r 90ae67de2f68 bash-completion.sh
--- a/bash-completion.sh	Sun Apr 18 10:53:28 2021 +0200
+++ b/bash-completion.sh	Sun Apr 18 18:20:09 2021 +0200
@@ -28,13 +28,21 @@
 		"boolean"
 	)
 
+	READ_TYPES=(
+		"auto"
+		"true"
+		"false"
+	)
+
 	if [[ "$w1" == "--relation" && "x$w0" == "x" ]]; then COMPREPLY=("''")
 	elif [[ "$w1" == "--attribute" && "x$w0" == "x" ]]; then COMPREPLY=("''")
 	elif [[ "$w2" == "--attribute" ]]; then COMPREPLY=($(compgen -W "${DATA_TYPE[*]}" -- "$w0"))
+	elif [[ "$w1" == "--read-types" ]]; then COMPREPLY=($(compgen -W "${READ_TYPES[*]}" -- "$w0"))
 	else
 		OPTIONS=(
 			"--relation"
 			"--attribute"
+			"--read-types"
 		)
 		COMPREPLY=($(compgen -W "${OPTIONS[*]}" -- "$w0"))
 	fi
diff -r 0d858e0eedf8 -r 90ae67de2f68 src/CLIParser.h
--- a/src/CLIParser.h	Sun Apr 18 10:53:28 2021 +0200
+++ b/src/CLIParser.h	Sun Apr 18 18:20:09 2021 +0200
@@ -57,10 +57,18 @@
 		else throw relpipe::cli::RelpipeCLIException(L"Unable to parse TypeId: " + value, relpipe::cli::CLI::EXIT_CODE_BAD_CLI_ARGUMENTS);
 	}
 
+	Configuration::ReadTypes parseReadTypes(const relpipe::writer::string_t& value) {
+		if (value == L"auto") return Configuration::ReadTypes::AUTO;
+		else if (value == L"true") return Configuration::ReadTypes::TRUE;
+		else if (value == L"false") return Configuration::ReadTypes::FALSE;
+		else throw relpipe::cli::RelpipeCLIException(L"Unable to parse ReadTypes: " + value, relpipe::cli::CLI::EXIT_CODE_BAD_CLI_ARGUMENTS);
+	}
+
 public:
 
 	static const relpipe::writer::string_t OPTION_RELATION;
 	static const relpipe::writer::string_t OPTION_ATTRIBUTE;
+	static const relpipe::writer::string_t OPTION_READ_TYPES;
 
 	Configuration parse(const std::vector& arguments) {
 		Configuration c;
@@ -75,6 +83,8 @@
 				attribute.name = readNext(arguments, i);
 				attribute.type = parseTypeId(readNext(arguments, i));
 				c.attributes.push_back(attribute);
+			} else if (option == OPTION_READ_TYPES) {
+				c.readTypes = parseReadTypes(readNext(arguments, i));
 			} else throw relpipe::cli::RelpipeCLIException(L"Unsupported CLI option: " + option, relpipe::cli::CLI::EXIT_CODE_BAD_CLI_ARGUMENTS);
 		}
 
@@ -87,6 +97,7 @@
 
 const relpipe::writer::string_t CLIParser::OPTION_RELATION = L"--relation";
 const relpipe::writer::string_t CLIParser::OPTION_ATTRIBUTE = L"--attribute";
+const relpipe::writer::string_t CLIParser::OPTION_READ_TYPES = L"--read-types";
 
 }
 }
diff -r 0d858e0eedf8 -r 90ae67de2f68 src/CSVCommand.cpp
--- a/src/CSVCommand.cpp	Sun Apr 18 10:53:28 2021 +0200
+++ b/src/CSVCommand.cpp	Sun Apr 18 18:20:09 2021 +0200
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -87,6 +88,47 @@
 	return false;
 }
 
+/**
+ * Data types might be encoded in the attribute names: name::type e.g. some_attribute::integer
+ *
+ * TODO: share this code through relpipe-lib-infertypes (when available)
+ */
+void tryParseTypes(vector& metadata, std::shared_ptr writer, Configuration& configuration) {
+	std::wregex pattern(L"(.*)::(.*)");
+	std::wsmatch match;
+
+	if (configuration.readTypes == Configuration::ReadTypes::AUTO || configuration.readTypes == Configuration::ReadTypes::TRUE) {
+		bool hasTypes = true;
+		std::vector types;
+		std::vector names;
+		for (AttributeMetadata& am : metadata) {
+			if (std::regex_match(am.attributeName, match, pattern)) {
+				names.push_back(match[1]);
+				if (configuration.readTypes == Configuration::ReadTypes::TRUE) {
+					types.push_back(writer->toTypeId(match[2])); // must be valid type name otherwise exception is thrown
+				} else {
+					try {
+						types.push_back(writer->toTypeId(match[2]));
+					} catch (...) {
+						hasTypes = false; // ignore exception and keep original names and default type (string)
+					}
+				}
+			} else {
+				hasTypes = false;
+			}
+		}
+
+		if (hasTypes) {
+			for (int i = 0, count = metadata.size(); i < count; i++) {
+				metadata[i].attributeName = names[i];
+				metadata[i].typeId = types[i];
+			}
+		} else if (configuration.readTypes == Configuration::ReadTypes::TRUE) {
+			throw RelpipeWriterException(L"Types were expected in the CSV header, but not found.");
+		}
+	}
+}
+
 void CSVCommand::process(std::istream& input, std::shared_ptr writer, Configuration& configuration) {
 	wstring_convert < codecvt_utf8> convertor; // UTF-8 is required for CSV
 	vector metadata;
@@ -105,12 +147,6 @@
 			metadata.push_back(am);
 
 			if (lastInRecord) {
-				// TODO: allow types on CLI and names from CSV?
-				// TODO: allow types on the second line of the CSV?
-				// TODO: allow regex pattern+replacement for extracting name and type from the first line of the CSV?
-				// TODO: allow attribute filtering, subset, like relpipe-tr-cur?
-				// TODO: allow skipping lines, like tail -n +2 ?
-
 				vector firstLine;
 
 				if (metadata.size() == configuration.attributes.size()) {
@@ -120,7 +156,8 @@
 						metadata[i].typeId = configuration.attributes[i].type;
 					}
 				} else if (configuration.attributes.size() == 0) {
-					// first line contains attribute names and type is always string
+					// first line contains attribute names and maybe also types
+					tryParseTypes(metadata, writer, configuration);
 				} else {
 					throw RelpipeWriterException(L"Declared attribute count (" + std::to_wstring(configuration.attributes.size()) + L") does not match with number of columns of the first line (" + std::to_wstring(metadata.size()) + L")");
 				}
diff -r 0d858e0eedf8 -r 90ae67de2f68 src/Configuration.h
--- a/src/Configuration.h	Sun Apr 18 10:53:28 2021 +0200
+++ b/src/Configuration.h	Sun Apr 18 18:20:09 2021 +0200
@@ -39,6 +39,14 @@
 
 class Configuration {
 public:
+
+	enum class ReadTypes {
+		AUTO,
+		TRUE,
+		FALSE,
+	};
+
+	ReadTypes readTypes = ReadTypes::AUTO;
 	relpipe::writer::string_t relation = L"csv";
 	std::vector attributes;
 