src/XMLDocumentConstructor.h
author František Kučera <franta-hg@frantovo.cz>
Sun, 07 Feb 2021 12:15:56 +0100
branchv_0
changeset 8 04aa5591eee3
parent 7 15d9b0ca161a
permissions -rw-r--r--
add @length attribute: number of bytes of a binary part or a text part encoded in UTF-8 (i.e. not the original length in the MIME message)

/**
 * Relational pipes
 * Copyright © 2021 František Kučera (Frantovo.cz, GlobalCode.info)
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
#pragma once

#include <algorithm>
#include <cctype>
#include <codecvt>
#include <cstdlib>
#include <iomanip>
#include <istream>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <libxml++-2.6/libxml++/libxml++.h>

#include <vmime/vmime.hpp>

#include "XMLNameCodec.h"

namespace relpipe {
namespace in {
namespace xmltable {

class XMLDocumentConstructor {
private:
	std::istream* input = nullptr;
	xmlpp::DomParser* parser = nullptr;
	XMLNameCodec nameCodec;

	std::string rootName = "mime-message";

	/**
	 * Renders a timestamp as text in the form Y-MM-DDTHH:MM:SS+HH:MM (or -HH:MM).
	 *
	 * The year is written as-is (no padding); every other component is zero-padded to two digits.
	 * The zone offset is taken from getZone() in minutes and split into hours and minutes.
	 *
	 * @param value timestamp to be formatted; must not be null (it is dereferenced without a check)
	 * @return the formatted timestamp
	 */
	std::string format(std::shared_ptr<vmime::datetime> value) {
		const int zoneMinutes = value->getZone();

		std::stringstream out;
		// The fill character is sticky, so it is set once; the width has to be repeated for each number.
		out << std::setfill('0');
		out << value->getYear();
		out << "-" << std::setw(2) << value->getMonth();
		out << "-" << std::setw(2) << value->getDay();
		out << "T" << std::setw(2) << value->getHour();
		out << ":" << std::setw(2) << value->getMinute();
		out << ":" << std::setw(2) << value->getSecond();

		// The sign is printed separately; both offset parts are then written as absolute values.
		if (zoneMinutes >= 0) out << "+";
		else out << "-";
		out << std::setw(2) << std::abs(zoneMinutes / 60);
		out << ":" << std::setw(2) << std::abs(zoneMinutes % 60);

		return out.str();
	}

	/**
	 * Renders a media type as "type/subtype".
	 *
	 * @param contentType media type whose getType() and getSubType() are joined
	 * @return both parts separated by a slash
	 */
	std::string format(const vmime::mediaType& contentType) {
		std::string formatted = contentType.getType();
		formatted += "/";
		formatted += contentType.getSubType();
		return formatted;
	}

	/**
	 * Returns a copy of the given string with each byte converted by std::tolower().
	 *
	 * Each byte is passed to std::tolower() as unsigned char: calling tolower() with a negative
	 * value other than EOF is undefined behaviour, and bytes >= 0x80 are negative where plain
	 * char is signed.
	 *
	 * @param value text to be converted
	 * @return converted copy; the input is left untouched
	 */
	std::string toLowerCase(const std::string& value) {
		std::string result = value;
		std::transform(result.begin(), result.end(), result.begin(), [](unsigned char ch) {
			return static_cast<char> (std::tolower(ch));
		});
		return result;
	}

	/**
	 * Formats raw bytes as lower-case hexadecimal octets separated by single spaces,
	 * e.g. "AB" becomes "41 42".
	 *
	 * @param value bytes to be formatted; may be empty
	 * @return two hex digits per input byte, space-separated; an empty string for empty input
	 */
	std::string toHex(const std::string& value) {
		static const char* const hexSymbols = "0123456789abcdef";
		const size_t length = value.length();

		std::string result;
		// Empty input must be handled before the size computation:
		// 3 * 0 - 1 wraps around in unsigned arithmetic and reserve() would throw std::length_error.
		if (length == 0) return result;

		// Two digits per byte plus one separator between neighbours.
		result.reserve(3 * length - 1);
		for (size_t i = 0; i < length; i++) {
			if (i > 0) result.push_back(' ');
			const unsigned char ch = value[i];
			result.push_back(hexSymbols[ch >> 4]);
			result.push_back(hexSymbols[ch & 15]);
		}
		return result;
	}

	/**
	 * Extracts the content of a body and converts it from the body's charset to UTF-8.
	 *
	 * @param body body whose contents are extracted; must not be null (it is dereferenced without a check)
	 * @param bodyLength output: write position of the result stream after extraction,
	 *        i.e. the number of bytes of the converted text
	 * @return the converted text
	 */
	std::string fetchBodyText(std::shared_ptr<vmime::body> body, size_t& bodyLength) {
		std::stringstream buffer;
		vmime::utility::outputStreamAdapter bufferAdapter(buffer);

		const vmime::charset fromCharset = body->getCharset();
		const vmime::charset toCharset = vmime::charset("utf-8");

		// Chain: body contents -> charset-converting stream -> adapter -> string buffer
		vmime::shared_ptr <vmime::charsetConverter> converter = vmime::charsetConverter::create(fromCharset, toCharset);
		vmime::shared_ptr <vmime::utility::charsetFilteredOutputStream> convertingStream = converter->getFilteredOutputStream(bufferAdapter);

		body->getContents()->extract(*convertingStream);
		// Flush the converting stream before the length is measured.
		convertingStream->flush();

		bodyLength = buffer.tellp();
		return buffer.str();
	}

	/**
	 * Extracts the content of a body without any charset conversion and formats it as hexadecimal octets.
	 *
	 * @param body body whose contents are extracted; must not be null (it is dereferenced without a check)
	 * @param bodyLength output: write position of the result stream after extraction,
	 *        i.e. the number of extracted bytes (not the length of the hex text)
	 * @return extracted bytes formatted by toHex()
	 */
	std::string fetchBodyBinary(std::shared_ptr<vmime::body> body, size_t& bodyLength) {
		std::stringstream rawBuffer;
		vmime::utility::outputStreamAdapter rawAdapter(rawBuffer);

		body->getContents()->extract(rawAdapter);
		rawAdapter.flush();

		// Measure the raw data, before it is expanded to the hex form.
		bodyLength = rawBuffer.tellp();
		const std::string rawBytes = rawBuffer.str();
		return toHex(rawBytes);
	}

	/**
	 * Writes a MIME body into the given XML element and recurses into its parts.
	 *
	 * The element always gets a "content-type" attribute.
	 * A body with parts gets one "part" child element per part.
	 * A body without parts gets its content as a child node (CDATA for the "text" media type,
	 * hex-formatted text otherwise) and a "length" attribute with the number of bytes
	 * reported by the fetch method.
	 *
	 * @param element target XML element
	 * @param body MIME body to be written; must not be null (it is dereferenced without a check)
	 */
	void appendBody(xmlpp::Element* element, std::shared_ptr<vmime::body> body) {
		element->set_attribute("content-type", format(body->getContentType()));
		// element->set_attribute("content-type-charset", body->getCharset().getName());
		// element->set_attribute("content-transfer-encoding", body->getEncoding().getName());
		// TODO: size of raw data

		if (body->getPartCount() != 0) {
			// Multipart: each part becomes a nested element processed the same way.
			for (const auto& part : body->getPartList()) {
				xmlpp::Element* partElement = element->add_child("part");
				appendBody(partElement, part->getBody());
			}
			return;
		}

		// Leaf: paste the content itself.
		size_t contentLength = 0;
		const bool isText = body->getContentType().getType() == "text";
		if (isText) {
			element->add_child_cdata(fetchBodyText(body, contentLength));
		} else {
			element->add_child_text(fetchBodyBinary(body, contentLength));
		}
		element->set_attribute("length", std::to_string(contentLength));

		// TODO: if content is valid XML, import it in the DOM tree instead of pasting as a nested text/cdata
		// TODO: optional trim of long data
	}

public:

	/**
	 * Stores both pointers for later use in process(); nothing is read or parsed here.
	 * The pointers are kept as-is and are not deleted by this class (its destructor is empty).
	 *
	 * @param input stream the MIME message is read from; dereferenced in process() without a null check
	 * @param parser DOM parser whose document receives the generated tree; dereferenced in process() without a null check
	 */
	XMLDocumentConstructor(std::istream* input, xmlpp::DomParser* parser) : input(input), parser(parser) {
	}

	/**
	 * Virtual destructor with nothing to release: the input stream and the parser are not deleted here.
	 */
	virtual ~XMLDocumentConstructor() = default;

	/**
	 * Sets a named option of this constructor.
	 *
	 * The only supported option is "root-name", which replaces the name of the root element.
	 *
	 * @param uri name of the option
	 * @param value new value of the option
	 * @throws std::invalid_argument if the option name is not supported
	 */
	void setOption(const std::string& uri, const std::string& value) {
		if (uri != "root-name") {
			throw std::invalid_argument(std::string("Invalid parser option: „") + uri + "“ with value: „" + value + "“");
		}
		rootName = value;
	}

	void process() {
		vmime::utility::inputStreamAdapter is(*input);
		vmime::string data;
		vmime::utility::outputStreamStringAdapter os(data);
		vmime::utility::bufferedStreamCopy(is, os);

		vmime::message m;
		m.parse(data);

		// vmime::shared_ptr<vmime::utility::inputStreamAdapter> is = vmime::make_shared<vmime::utility::inputStreamAdapter>(*input);
		// m.parse(is, 0);

		vmime::charset ch(vmime::charsets::UTF_8);

		//std::cerr << "Subject:" << m.getHeader()->Subject()->getValue<vmime::text>()->getConvertedText(ch) << std::endl;

		xmlpp::Element* root = parser->get_document()->create_root_node(rootName);

		xmlpp::Element* headers = root->add_child("headers");

		for (std::shared_ptr<vmime::headerField> mimeField : m.getHeader()->getFieldList()) {
			// TODO: Are names always ASCII and subset of UTF-8?
			xmlpp::Element* field = headers->add_child(toLowerCase(nameCodec.encode(mimeField->getName())));

			if (auto value = mimeField->getValue<vmime::text>()) {
				field->add_child_text(value->getConvertedText(ch));
			} else if (auto value = mimeField->getValue<vmime::mailbox>()) {
				std::string name = value->getName().getConvertedText(ch);
				std::string email = value->getEmail().toString();
				if (name.size()) field->set_attribute("name", name);
				if (email.size()) field->add_child_text(email);
			} else if (auto value = mimeField->getValue<vmime::addressList>()) {
				for (auto address : value->getAddressList()) {
					xmlpp::Element* addressField = field->add_child("address");
					if (std::shared_ptr<vmime::mailbox> mailbox = std::dynamic_pointer_cast<vmime::mailbox> (address)) {
						std::string name = mailbox->getName().getConvertedText(ch);
						std::string email = mailbox->getEmail().toString();
						if (name.size()) addressField->set_attribute("name", name);
						if (email.size()) addressField->add_child_text(email);
					} else if (std::shared_ptr<vmime::mailboxGroup> mailbox = std::dynamic_pointer_cast<vmime::mailboxGroup> (address)) {
						// TODO: mailboxGroup?
					}
				}
			} else if (auto value = mimeField->getValue<vmime::datetime>()) {
				// TODO: keep particular timestamp bits attributes or not?
				field->set_attribute("year", std::to_string(value->getYear()));
				field->set_attribute("month", std::to_string(value->getMonth()));
				field->set_attribute("day", std::to_string(value->getDay()));
				field->set_attribute("hour", std::to_string(value->getHour()));
				field->set_attribute("minute", std::to_string(value->getMinute()));
				field->set_attribute("second", std::to_string(value->getSecond()));
				field->set_attribute("zone", std::to_string(value->getZone())); // timezone is in minutes
				field->add_child_text(format(value));
			} else if (auto value = mimeField->getValue<vmime::mediaType>()) {
				if (value) field->add_child_text(format(*value));
				// TODO: encoding from the "Content-Type: text/plain; charset=us-ascii" type header?
			} else if (auto value = mimeField->getValue<vmime::messageId>()) {
				field->add_child_text(value->getId());
			} else if (auto value = mimeField->getValue<vmime::messageIdSequence>()) {
				for (auto messageId : value->getMessageIdList()) {
					xmlpp::Element* messageIdField = field->add_child("message-id");
					messageIdField->add_child_text(messageId->getId());
				}
			} else if (auto value = mimeField->getValue<vmime::contentDisposition>()) {
				field->add_child_text(value->getName());
			} else if (auto value = mimeField->getValue<vmime::relay>()) {
				field->set_attribute("from", value->getFrom());
				field->set_attribute("via", value->getVia());
				field->set_attribute("by", value->getBy());
				field->set_attribute("id", value->getId());
				field->set_attribute("for", value->getFor());
				// TODO: date of Received/relay
				// TODO: missing values or incomplete parsing of Received/relay in vmime
			} else if (auto value = mimeField->getValue<vmime::path>()) {
				std::string local = value->getLocalPart();
				std::string domain = value->getDomain();
				if (local.size() && domain.size()) field->add_child_text(local + "@" + domain);
				else field->add_child_text(local + domain);
			} else if (auto value = mimeField->getValue<vmime::encoding>()) {
				field->add_child_text(value->getName());
			} else {
				field->add_child_text("TODO: unknown header type"); // TODO: generic conversion as fallback?
			}
		}

		// TODO: check null pointers

		xmlpp::Element* body = root->add_child("body");
		appendBody(body, m.getBody());

	}
};

}
}
}