src/RecfileHandler.h
author František Kučera <franta-hg@frantovo.cz>
Sat, 18 Feb 2023 22:57:22 +0100
branch v_0
changeset 16 1731e8dff446
parent 15 e5421eea0583
child 17 f67047a1e19e
permissions -rw-r--r--
attribute name escaping: first version

/**
 * Relational pipes
 * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#pragma once

#include <string>
#include <vector>
#include <iostream>
#include <sstream>
#include <locale>
#include <codecvt>
#include <regex>
#include <cassert>

#include <relpipe/reader/typedefs.h>
#include <relpipe/reader/TypeId.h>
#include <relpipe/reader/RelpipeReaderException.h>
#include <relpipe/reader/handlers/RelationalReaderStringHandler.h>
#include <relpipe/reader/handlers/AttributeMetadata.h>

namespace relpipe {
namespace out {
namespace recfile {

using namespace relpipe::reader;

class RecfileHandler : public handlers::RelationalReaderStringHandler {
private:
	std::ostream& output;
	wstring_convert<codecvt_utf8<wchar_t>> convertor; // XML output will be always in UTF-8
	std::vector<TypeId> attributeTypes;
	std::vector<string_t> attributeTypeCodes;
	std::vector<string_t> attributeNames;
	integer_t valueCount = 0;
	integer_t attributeCount = 0;
	integer_t relationCount = 0;

	void writeRelationName(const string_t& name) {
		// FIXME: escaping/filtering
		output << "%rec: " << convertor.to_bytes(name) << std::endl;
	}

	const std::string toRecfileType(const TypeId& type) {
		switch (type) {
			case TypeId::BOOLEAN: return "bool";
			case TypeId::INTEGER: return "int";
			case TypeId::STRING: return "";
			default: throw RelpipeReaderException(L"Unsupported type – unable to convert to a Recfile type");
		}
	}

	void writeAttributeMetadata(const handlers::AttributeMetadata& attribute) {
		// FIXME: escaping/filtering
		std::string recfileType = toRecfileType(attribute.getTypeId());
		if (recfileType.size()) output << "%type: " << convertor.to_bytes(attribute.getAttributeName()) << " " << recfileType << std::endl;
	}

	void writeSeparator() {
		output << std::endl;
	}

	bool between(wchar_t ch, wchar_t start, wchar_t end) {
		return ch >= start && ch <= end;
	}

	bool isValidNameCharacter(wchar_t ch, bool first) {
		if (first) {
			// also '%' is technically valid here, but it is used for special
			// purposes like the relation name or attribute types
			return /**/between(ch, L'a', L'z')
					|| between(ch, L'A', L'Z');
		} else {
			return ch == L'_'
					|| between(ch, L'a', L'z')
					|| between(ch, L'A', L'Z')
					|| between(ch, L'0', L'9');
		}
	}

	void writeAttribute(const string_t& name, const TypeId& type, const string_t& value) {
		// TODO: multiple escapting mode - including one that is not lossless
		// but allows writing a single '_' inside the name
		for (size_t i = 0, limit = name.size(); i < limit; i++) {
			wchar_t ch = name[i];
			bool valid = isValidNameCharacter(ch, i == 0);

			// Not a lossless round-trip
			// (maybe we could sacrifice some reserved prefix):
			if (i == 0 && !valid) output << 'x';

			if (ch == '_') output << "__";
			else if (valid) output << convertor.to_bytes(ch);
			else output << '_' << ((uint32_t) ch) << '_';
		}
		output << ": ";

		for (char ch : convertor.to_bytes(value)) {
			output << ch;
			if (ch == '\n') output << "+ ";
		}

		output << std::endl;
	}

	void writeRecordCount() {
		if (attributeCount) {
			output << std::endl << "# Record count: " << (valueCount / attributeCount) << std::endl;
		}
	}

public:

	RecfileHandler(std::ostream& output) : output(output) {
	}

	void startRelation(string_t name, std::vector<handlers::AttributeMetadata> attributes) override {
		writeRecordCount();

		valueCount = 0;
		attributeCount = 0;

		if (relationCount) writeSeparator();

		relationCount++;
		writeRelationName(name);

		attributeCount = attributes.size();
		attributeTypes.resize(attributeCount);
		attributeTypeCodes.resize(attributeCount);
		attributeNames.resize(attributeCount);
		for (int i = 0; i < attributes.size(); i++) {
			attributeNames[i] = attributes[i].getAttributeName();
			attributeTypes[i] = attributes[i].getTypeId();
			attributeTypeCodes[i] = attributes[i].getTypeName();
			writeAttributeMetadata(attributes[i]);
		}
	}

	void attribute(const string_t& value) override {
		integer_t i = valueCount % attributeCount;
		if (i == 0) writeSeparator();
		valueCount++;
		writeAttribute(attributeNames[i], attributeTypes[i], value);
	}

	void endOfPipe() {
		writeRecordCount();
		if (valueCount) writeSeparator();
	}

};

}
}
}