// Optionally read data types from the CSV header: --read-types (complements relpipe-out-csv --write-types true)
/**
* Relational pipes
* Copyright © 2018 František Kučera (Frantovo.cz, GlobalCode.info)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, version 3 of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cstdlib>
#include <vector>
#include <memory>
#include <locale>
#include <regex>
#include <algorithm>
#include <unistd.h>
#include <relpipe/writer/RelationalWriter.h>
#include <relpipe/writer/RelpipeWriterException.h>
#include <relpipe/writer/AttributeMetadata.h>
#include <relpipe/writer/Factory.h>
#include <relpipe/writer/TypeId.h>
#include <relpipe/cli/CLI.h>
#include "CSVCommand.h"
using namespace std;
using namespace relpipe::cli;
using namespace relpipe::writer;
namespace relpipe {
namespace in {
namespace csv {
/**
 * Reads one CSV value (field) from the input and appends its raw bytes to currentValue.
 *
 * A value starting with a double quote is read up to the closing quote; inside it, two consecutive
 * quotes stand for one literal quote and commas or line breaks are ordinary content.
 * In an unquoted value, carriage returns are skipped.
 *
 * @param input stream to read the CSV data from
 * @param currentValue buffer the value is appended to (this method never clears it)
 * @param lastInRecord set to true if the value was terminated by a line feed, otherwise set to false
 * @return true if a value terminated by a comma or a line end was read;
 *         false if the input ended before such terminator (whatever was collected stays in currentValue)
 * @throws RelpipeWriterException if a closing quote is followed by something else than comma or line end,
 *         or if a value starts with a carriage return that is not followed by a line feed
 */
bool CSVCommand::readValue(std::istream& input, std::stringstream& currentValue, bool& lastInRecord) {
	lastInRecord = false;
	char ch;

	// On failure get() leaves ch unassigned, so it must not be inspected: plain end of input.
	if (!input.get(ch)) return false;

	if (ch == '"') {
		while (input.get(ch)) {
			if (ch == '"') {
				// Input ended right after a quote: without this check the stale quote still held in ch
				// would be taken for an escaped quote and appended to the value.
				if (!input.get(ch)) return false;
				if (ch == '"') {
					currentValue << ch; // doubled quote = one literal quote
				} else {
					if (ch == '\r') input.get(ch); // accept CRLF line ends
					if (ch == '\n') lastInRecord = true;
					else if (ch != ',') throw RelpipeWriterException(L"Unexpected character (should be „\\n“ or „,“)");
					return true;
				}
			} else {
				currentValue << ch;
			}
		}
	} else if (ch == ',') {
		return true; // empty value
	} else if (ch == '\n') {
		lastInRecord = true; // empty value at the end of the record
		return true;
	} else if (ch == '\r') {
		input.get(ch);
		if (ch == '\n') {
			lastInRecord = true;
			return true;
		} else {
			throw RelpipeWriterException(L"Crazy carriage stuck during journey");
		}
	} else {
		for (currentValue << ch; input.get(ch);) {
			switch (ch) {
				case ',': return true;
				case '\r': break; // leaves only the switch: the carriage return is dropped
				case '\n':
					lastInRecord = true;
					return true;
				default: currentValue << ch;
			}
		}
	}

	// The input ended before the value was terminated.
	return false;
}
/**
 * Attribute names in the CSV header may carry a data type in the form name::type,
 * e.g. some_attribute::integer. This function splits such names and fills in the type.
 *
 * Behavior depends on configuration.readTypes:
 *  - TRUE: every attribute name must match name::type; if any does not, an exception is thrown.
 *          The type name is passed to writer->toTypeId() without any catch, so its failure propagates.
 *  - AUTO: names and types are replaced only if all attributes match and all type names are accepted
 *          by writer->toTypeId(); otherwise the metadata are left as they were.
 *  - any other value: the metadata are left as they were.
 *
 * TODO: share this code through relpipe-lib-infertypes (when available)
 *
 * @param metadata attributes from the CSV header; modified in place
 * @param writer used for translating type names to type ids
 * @param configuration provides the readTypes mode
 */
void tryParseTypes(vector<AttributeMetadata>& metadata, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
	std::wregex pattern(L"(.*)::(.*)");
	std::wsmatch match;

	const bool typesRequired = configuration.readTypes == Configuration::ReadTypes::TRUE;
	const bool typesOptional = configuration.readTypes == Configuration::ReadTypes::AUTO;
	if (!typesRequired && !typesOptional) return;

	std::vector<string_t> parsedNames;
	std::vector<TypeId> parsedTypes;
	bool allTyped = true;

	for (AttributeMetadata& attribute : metadata) {
		if (!std::regex_match(attribute.attributeName, match, pattern)) {
			allTyped = false;
			continue;
		}

		parsedNames.push_back(match[1]);

		if (typesRequired) {
			// the type name must be valid; an exception from toTypeId() is not caught here
			parsedTypes.push_back(writer->toTypeId(match[2]));
			continue;
		}

		try {
			parsedTypes.push_back(writer->toTypeId(match[2]));
		} catch (...) {
			// best effort mode: keep original names and the default type
			allTyped = false;
		}
	}

	if (!allTyped) {
		if (typesRequired) throw RelpipeWriterException(L"Types were expected in the CSV header, but not found.");
		return;
	}

	// Both vectors got one entry per attribute, because nothing was skipped above.
	for (size_t index = 0; index < metadata.size(); index++) {
		metadata[index].attributeName = parsedNames[index];
		metadata[index].typeId = parsedTypes[index];
	}
}
void CSVCommand::process(std::istream& input, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
wstring_convert < codecvt_utf8<wchar_t>> convertor; // UTF-8 is required for CSV
vector<AttributeMetadata> metadata;
bool headerDone = false;
bool lastInRecord = false;
stringstream currentValue;
while (readValue(input, currentValue, lastInRecord) && input.good()) {
if (headerDone) {
writer->writeAttribute(convertor.from_bytes(currentValue.str()));
} else {
AttributeMetadata am;
am.attributeName = convertor.from_bytes(currentValue.str());
am.typeId = TypeId::STRING;
metadata.push_back(am);
if (lastInRecord) {
vector<string_t> firstLine;
if (metadata.size() == configuration.attributes.size()) {
for (int i = 0; i < metadata.size(); i++) {
firstLine.push_back(metadata[i].attributeName);
metadata[i].attributeName = configuration.attributes[i].name;
metadata[i].typeId = configuration.attributes[i].type;
}
} else if (configuration.attributes.size() == 0) {
// first line contains attribute names and maybe also types
tryParseTypes(metadata, writer, configuration);
} else {
throw RelpipeWriterException(L"Declared attribute count (" + std::to_wstring(configuration.attributes.size()) + L") does not match with number of columns of the first line (" + std::to_wstring(metadata.size()) + L")");
}
headerDone = true;
writer->startRelation(configuration.relation, metadata, true);
if (firstLine.size()) {
for (string_t value : firstLine) writer->writeAttribute(value);
}
}
}
currentValue.str("");
currentValue.clear();
}
}
}
}
}