src/RecfileHandler.h
author František Kučera <franta-hg@frantovo.cz>
Sun, 19 Feb 2023 02:26:23 +0100
branch v_0
changeset 18 002077ecb17a
parent 17 f67047a1e19e
permissions -rw-r--r--
long line wrapping: first version

/**
 * Relational pipes
 * Copyright © 2019 František Kučera (Frantovo.cz, GlobalCode.info)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#pragma once

#include <wchar.h>

#include <algorithm>
#include <cassert>
#include <codecvt>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <locale>
#include <regex>
#include <sstream>
#include <string>
#include <vector>

#include <relpipe/reader/typedefs.h>
#include <relpipe/reader/TypeId.h>
#include <relpipe/reader/RelpipeReaderException.h>
#include <relpipe/reader/handlers/RelationalReaderStringHandler.h>
#include <relpipe/reader/handlers/AttributeMetadata.h>

namespace relpipe {
namespace out {
namespace recfile {

using namespace relpipe::reader;

class RecfileHandler : public handlers::RelationalReaderStringHandler {
private:
	std::ostream& output;
	wstring_convert<codecvt_utf8<wchar_t>> convertor; // XML output will be always in UTF-8
	std::vector<TypeId> attributeTypes;
	std::vector<string_t> attributeTypeCodes;
	std::vector<string_t> attributeNamesIn;
	std::vector<string_t> attributeNamesOut;
	integer_t valueCount = 0;
	integer_t attributeCount = 0;
	integer_t relationCount = 0;

	void writeRelationName(const string_t& name) {
		// FIXME: escaping/filtering
		output << "%rec: " << convertor.to_bytes(name) << std::endl;
	}

	const std::string toRecfileType(const TypeId& type) {
		switch (type) {
			case TypeId::BOOLEAN: return "bool";
			case TypeId::INTEGER: return "int";
			case TypeId::STRING: return "";
			default: throw RelpipeReaderException(L"Unsupported type – unable to convert to a Recfile type");
		}
	}

	void writeAttributeMetadata(size_t i) {
		std::string recfileType = toRecfileType(attributeTypes[i]);
		if (recfileType.size()) output << "%type: " << convertor.to_bytes(attributeNamesOut[i]) << " " << recfileType << std::endl;
	}

	void writeSeparator() {
		output << std::endl;
	}

	bool between(wchar_t ch, wchar_t start, wchar_t end) {
		return ch >= start && ch <= end;
	}

	bool isValidNameCharacter(wchar_t ch, bool first) {
		if (first) {
			// also '%' is technically valid here, but it is used for special
			// purposes like the relation name or attribute types
			return /**/between(ch, L'a', L'z')
					|| between(ch, L'A', L'Z');
		} else {
			return ch == L'_'
					|| between(ch, L'a', L'z')
					|| between(ch, L'A', L'Z')
					|| between(ch, L'0', L'9');
		}
	}

	const string_t escapeAttributeName(const string_t& name) {
		std::wstringstream escaped;

		// TODO: multiple escapting mode - including one that is not lossless
		// but allows writing a single '_' inside the name
		for (size_t i = 0, limit = name.size(); i < limit; i++) {
			wchar_t ch = name[i];
			bool valid = isValidNameCharacter(ch, i == 0);

			// Not a lossless round-trip
			// (maybe we could sacrifice some reserved prefix):
			if (i == 0 && !valid) escaped << 'x';

			if (ch == '_') escaped << "__";
			else if (valid) escaped << ch;
			else escaped << '_' << ((uint32_t) ch) << '_';
		}

		return escaped.str();
	}

	integer_t computeWidth(const wchar_t ch) {
		switch (ch) {
			case L'\t':
				// TODO: tabulator width?
				return 4;
			case L' ':
				return 1;
			default:
				return std::max(0, wcwidth(ch));
		}
	}

	/**
	 * @param stringValue
	 * @return the width that would the string occupy on the display (particular characters might be wider than 1 column)
	 */
	integer_t computeWidth(const string_t & stringValue) {
		integer_t width = 0;
		for (wchar_t ch : stringValue) width += computeWidth(ch);
		return width;
	}

	void writeAttribute(const string_t& escapedName, const TypeId& type, const string_t& value) {
		output << convertor.to_bytes(escapedName) << ": ";

		static const integer_t MAX_LINE_WIDTH = 80; // TODO: configuration
		static const boolean_t unlimited = MAX_LINE_WIDTH == -1;

		integer_t currentWidth = computeWidth(escapedName) + 2; // 2 = ": " separator
		integer_t valueWidth = computeWidth(value);
		integer_t remainingWidth = valueWidth;


		for (size_t i = 0, limit = value.size(); i < limit; i++) {
			wchar_t ch = value[i];
			integer_t characterWidth = computeWidth(ch);

			if (ch == '\n') {
				output << std::endl << "+ ";
				currentWidth = 2; // 2 = "+ " prefix
			} else {
				if (unlimited || (currentWidth + remainingWidth) <= MAX_LINE_WIDTH) {
					// all remaining characters fit the in the limit
					currentWidth += characterWidth;
					output << convertor.to_bytes(ch);
				} else if ((currentWidth + characterWidth + 1) <= MAX_LINE_WIDTH) { // 1 = "\"
					// we will wrap the line, but not yet
					currentWidth += characterWidth;
					output << convertor.to_bytes(ch);
				} else {
					output << "\\" << std::endl;
					currentWidth = 0;
					currentWidth += characterWidth;
					output << convertor.to_bytes(ch);
				}
			}

			remainingWidth -= characterWidth;
		}

		output << std::endl;
	}

	void writeRecordCount() {
		if (attributeCount) {
			output << std::endl << "# Record count: " << (valueCount / attributeCount) << std::endl;
		}
	}

public:

	RecfileHandler(std::ostream& output) : output(output) {
	}

	void startRelation(string_t name, std::vector<handlers::AttributeMetadata> attributes) override {
		writeRecordCount();

		valueCount = 0;
		attributeCount = 0;

		if (relationCount) writeSeparator();

		relationCount++;
		writeRelationName(name);

		attributeCount = attributes.size();
		attributeTypes.resize(attributeCount);
		attributeTypeCodes.resize(attributeCount);
		attributeNamesIn.resize(attributeCount);
		attributeNamesOut.resize(attributeCount);
		for (int i = 0; i < attributes.size(); i++) {
			attributeNamesIn[i] = attributes[i].getAttributeName();
			attributeNamesOut[i] = escapeAttributeName(attributeNamesIn[i]);
			attributeTypes[i] = attributes[i].getTypeId();
			attributeTypeCodes[i] = attributes[i].getTypeName();
			writeAttributeMetadata(i);
		}
	}

	void attribute(const string_t& value) override {
		integer_t i = valueCount % attributeCount;
		if (i == 0) writeSeparator();
		valueCount++;
		writeAttribute(attributeNamesOut[i], attributeTypes[i], value);
	}

	void endOfPipe() {
		writeRecordCount();
		if (valueCount) writeSeparator();
	}

};

}
}
}