src/CSVCommand.cpp
author František Kučera <franta-hg@frantovo.cz>
Sat, 13 Nov 2021 17:05:03 +0100
branch v_0
changeset 22 3f6488171e34
parent 21 22eb4838e8d0
permissions -rw-r--r--
check number of values on each row

/**
 * Relational pipes
 * Copyright © 2018 František Kučera (Frantovo.cz, GlobalCode.info)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#include <cstdlib>
#include <vector>
#include <memory>
#include <locale>
#include <regex>
#include <algorithm>
#include <unistd.h>

#include <relpipe/writer/RelationalWriter.h>
#include <relpipe/writer/RelpipeWriterException.h>
#include <relpipe/writer/AttributeMetadata.h>
#include <relpipe/writer/Factory.h>
#include <relpipe/writer/TypeId.h>

#include <relpipe/cli/CLI.h>

#include "CSVCommand.h"

using namespace std;
using namespace relpipe::cli;
using namespace relpipe::writer;

namespace relpipe {
namespace in {
namespace csv {

/**
 * Reads a single CSV value (field) from the input and appends its content to currentValue.
 *
 * A value starting with a quote is read up to its closing quote; inside it, commas and line breaks
 * are ordinary characters and a doubled quote ("") stands for a single quote character.
 * In an unquoted value, carriage return characters are skipped.
 *
 * @param input stream to read the characters from
 * @param currentValue receives the characters of the value (without enclosing quotes and without the separator)
 * @param lastInRecord set to true if the value was terminated by a line break, otherwise set to false
 * @return true if the value was terminated by a comma or a line break;
 *  false if the input ended before such terminator was found (including completely exhausted input)
 * @throws RelpipeWriterException if a closing quote (optionally followed by a carriage return)
 *  is followed by something other than a comma or a line feed,
 *  or if a carriage return at the very beginning of a value is not followed by a line feed
 */
bool CSVCommand::readValue(std::istream& input, std::stringstream& currentValue, bool& lastInRecord) {
	lastInRecord = false;
	char ch;
	// Exhausted input: stop here instead of branching on an uninitialized character.
	if (!input.get(ch)) return false;

	if (ch == '"') {
		while (input.get(ch)) {
			if (ch == '"') {
				// The input ended right after a quote: there is no terminator,
				// and the stale quote in ch must not be taken for an escaped one.
				if (!input.get(ch)) return false;
				if (ch == '"') {
					// doubled quote = escaped quote character
					currentValue << ch;
				} else {
					// closing quote: only a comma or a line break (\n or \r\n) may follow
					if (ch == '\r') input.get(ch);
					if (ch == '\n') lastInRecord = true;
					else if (ch != ',') throw RelpipeWriterException(L"Unexpected character (should be „\\n“ or „,“)");
					return true;
				}
			} else {
				currentValue << ch;
			}
		}
	} else if (ch == ',') {
		// empty value
		return true;
	} else if (ch == '\n') {
		// empty value at the end of the record
		lastInRecord = true;
		return true;
	} else if (ch == '\r') {
		// empty value at the end of the record, terminated by \r\n
		input.get(ch);
		if (ch == '\n') {
			lastInRecord = true;
			return true;
		} else {
			throw RelpipeWriterException(L"Crazy carriage stuck during journey");
		}
	} else {
		// unquoted value: read up to the nearest comma or line feed
		for (currentValue << ch; input.get(ch);) {
			switch (ch) {
				case ',': return true;
				case '\r': break; // skipped
				case '\n':
					lastInRecord = true;
					return true;
				default: currentValue << ch;
			}
		}
	}
	// the input ended before the value was terminated
	return false;
}

/**
 * The attribute names may carry also the data types, in the form name::type e.g. some_attribute::integer
 *
 * Behavior depends on configuration.readTypes:
 *  - AUTO: names and types in the metadata are replaced only if every attribute name has the name::type form
 *    and writer->toTypeId() accepted every type name; otherwise the metadata are left untouched
 *  - TRUE: like AUTO, but an exception thrown by writer->toTypeId() propagates to the caller
 *    and RelpipeWriterException is thrown if some attribute name lacks the type
 *  - other values: the metadata are left untouched
 *
 * TODO: share this code through relpipe-lib-infertypes (when available)
 */
void tryParseTypes(vector<AttributeMetadata>& metadata, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
	std::wregex nameAndType(L"(.*)::(.*)");
	std::wsmatch parts;

	const bool typesRequired = configuration.readTypes == Configuration::ReadTypes::TRUE;
	const bool typesOptional = configuration.readTypes == Configuration::ReadTypes::AUTO;
	if (!typesRequired && !typesOptional) return;

	bool allTyped = true;
	std::vector<TypeId> parsedTypes;
	std::vector<string_t> parsedNames;

	for (AttributeMetadata& attribute : metadata) {
		if (!std::regex_match(attribute.attributeName, parts, nameAndType)) {
			allTyped = false;
			continue;
		}

		parsedNames.push_back(parts[1]);

		if (typesRequired) {
			// must be valid type name otherwise exception is thrown
			parsedTypes.push_back(writer->toTypeId(parts[2]));
			continue;
		}

		try {
			parsedTypes.push_back(writer->toTypeId(parts[2]));
		} catch (...) {
			// ignore exception and keep original names and default type (string)
			allTyped = false;
		}
	}

	if (!allTyped) {
		if (typesRequired) throw RelpipeWriterException(L"Types were expected in the CSV header, but not found.");
		return;
	}

	// every attribute was parsed, so both vectors have the same size as the metadata
	for (size_t index = 0; index < metadata.size(); index++) {
		AttributeMetadata& attribute = metadata[index];
		attribute.attributeName = parsedNames[index];
		attribute.typeId = parsedTypes[index];
	}
}

/**
 * Reads CSV data from the input and writes them as a single relation to the writer.
 *
 * Meaning of the first line:
 *  - if configuration.attributes is empty, the first line contains the attribute names
 *    (and maybe also types, see tryParseTypes()),
 *  - if configuration.attributes has as many items as the first line has values,
 *    names and types are taken from the configuration and the first line is written as an ordinary record.
 *
 * NOTE: a value is processed only if readValue() found its terminator (comma or line break),
 * so a last record that is not terminated by a line break is not completely written:
 * either the EOF check at the end throws, or (if only one value of the record is pending) the value is dropped.
 *
 * @param input stream with the CSV data (UTF-8)
 * @param writer receives the relation metadata and the attribute values
 * @param configuration relation name, optional attribute declarations and the types reading mode
 * @throws RelpipeWriterException if the declared attribute count is non-zero and differs from the number of values on the first line,
 *  or if a record has a different number of values than there are attributes
 */
void CSVCommand::process(std::istream& input, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
	wstring_convert < codecvt_utf8<wchar_t>> convertor; // UTF-8 is required for CSV
	vector<AttributeMetadata> metadata;
	bool headerDone = false;
	bool lastInRecord = false;
	size_t valueCount = 0; // number of values read so far in the current record (unsigned: compared with metadata.size())
	stringstream currentValue;


	while (readValue(input, currentValue, lastInRecord) && input.good()) {
		if (headerDone) {
			// Check the number of values first, so a surplus value is never written to the output.
			valueCount++;
			if (valueCount > metadata.size()) throw RelpipeWriterException(L"The number of values " + std::to_wstring(valueCount) + L" exceeds declared column count: " + std::to_wstring(metadata.size()));
			if (lastInRecord && valueCount != metadata.size()) throw RelpipeWriterException(L"The number of values " + std::to_wstring(valueCount) + L" is lower than declared column count: " + std::to_wstring(metadata.size()));

			writer->writeAttribute(convertor.from_bytes(currentValue.str()));
			if (lastInRecord) valueCount = 0;
		} else {
			// still on the first line: collect its values as attribute names (type: string)
			AttributeMetadata am;
			am.attributeName = convertor.from_bytes(currentValue.str());
			am.typeId = TypeId::STRING;
			metadata.push_back(am);
			if (lastInRecord) {

				vector<string_t> firstLine;

				if (metadata.size() == configuration.attributes.size()) {
					// attributes were declared in the configuration: first line contains data
					for (size_t i = 0; i < metadata.size(); i++) {
						firstLine.push_back(metadata[i].attributeName);
						metadata[i].attributeName = configuration.attributes[i].name;
						metadata[i].typeId = configuration.attributes[i].type;
					}
				} else if (configuration.attributes.size() == 0) {
					// first line contains attribute names and maybe also types
					tryParseTypes(metadata, writer, configuration);
				} else {
					throw RelpipeWriterException(L"Declared attribute count (" + std::to_wstring(configuration.attributes.size()) + L") does not match with number of columns of the first line (" + std::to_wstring(metadata.size()) + L")");
				}

				headerDone = true;
				writer->startRelation(configuration.relation, metadata, true);
				// empty unless the first line contains data
				for (const string_t& value : firstLine) writer->writeAttribute(value);
			}
		}

		// reuse the buffer for the next value
		currentValue.str("");
		currentValue.clear();
	}

	/**
	 * RFC 4180:
	 *  - Each line should contain the same number of fields throughout the file.
	 *  - The last field in the record must not be followed by a comma.
	 */
	if (valueCount && valueCount != metadata.size()) throw RelpipeWriterException(L"Unexpected EOF: The number of values " + std::to_wstring(valueCount) + L" is lower than declared column count: " + std::to_wstring(metadata.size()));
}

}
}
}