// Generic XML parser for arbitrary documents; it generates a relation containing the SAX events.
/**
* Relational pipes
* Copyright © 2018 František Kučera (Frantovo.cz, GlobalCode.info)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>
#include <algorithm>
#include <xercesc/sax2/SAX2XMLReader.hpp>
#include <xercesc/sax2/XMLReaderFactory.hpp>
#include <xercesc/sax2/DefaultHandler.hpp>
#include <xercesc/sax2/Attributes.hpp>
#include <xercesc/util/XMLString.hpp>
#include <relpipe/writer/typedefs.h>
#include "StreamInputSource.h"
namespace relpipe {
namespace in {
namespace xml {
using namespace relpipe::writer;
using namespace xercesc;
class XMLCommand {
private:
class RelpipeSaxHandler : public xercesc::DefaultHandler {
private:
unique_ptr<RelationalWriter> writer;
std::wstring_convert<std::codecvt_utf8<wchar_t>> convertor; // TODO: support also other encodings.
string_t toString(const XMLCh * const chars) {
// XMLCh = char16_t
// „All XML data is handled within Xerces-C++ as strings of XMLCh characters. Regardless of the size of the type chosen, the data stored in variables of type XMLCh will always be utf-16 encoded values.“
// see https://xerces.apache.org/xerces-c/program-others-3.html
// other solution (depends on boost): https://flylib.com/books/en/2.131.1/working_with_xerces_strings.html
// TODO: review this text conversion and test on various platforms
char* x = XMLString::transcode(chars);
string s = string(x);
XMLString::release(&x);
return convertor.from_bytes(s);
}
public:
RelpipeSaxHandler(std::ostream& output) : DefaultHandler(), writer(Factory::create(output)) {
}
void startDocument() override {
//XMLString::
// TODO: remove demo
writer->startRelation(L"xml",{
{L"event", TypeId::STRING},
{L"uri", TypeId::STRING},
{L"localname", TypeId::STRING},
{L"qname", TypeId::STRING},
{L"chars", TypeId::STRING}
}, true);
}
void startElement(const XMLCh * const uri, const XMLCh * const localname, const XMLCh * const qname, const Attributes& attrs) override {
writer->writeAttribute(L"startElement");
writer->writeAttribute(toString(uri));
writer->writeAttribute(toString(localname));
writer->writeAttribute(toString(qname));
writer->writeAttribute(L"");
for (int i = 0; i < attrs.getLength(); i++) {
writer->writeAttribute(L"attribute");
writer->writeAttribute(toString(attrs.getURI(i)));
writer->writeAttribute(toString(attrs.getLocalName(i)));
writer->writeAttribute(toString(attrs.getQName(i)));
writer->writeAttribute(toString(attrs.getValue(i)));
}
}
void endElement(const XMLCh * const uri, const XMLCh * const localname, const XMLCh * const qname) override {
writer->writeAttribute(L"endElement");
writer->writeAttribute(toString(uri));
writer->writeAttribute(toString(localname));
writer->writeAttribute(toString(qname));
writer->writeAttribute(L"");
}
void characters(const XMLCh * const chars, const XMLSize_t length) override {
writer->writeAttribute(L"characters");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
writer->writeAttribute(toString(chars));
}
void comment(const XMLCh * const chars, const XMLSize_t length) override {
writer->writeAttribute(L"comment");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
writer->writeAttribute(toString(chars));
}
void startCDATA() override {
writer->writeAttribute(L"startCDATA");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
}
void endCDATA() override {
writer->writeAttribute(L"endCDATA");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
}
void endDocument() override {
writer->writeAttribute(L"endDocument");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
writer->writeAttribute(L"");
}
};
public:
void process(std::istream& input, std::ostream& output) {
XMLPlatformUtils::Initialize();
unique_ptr<SAX2XMLReader> parser(XMLReaderFactory::createXMLReader());
parser->setFeature(XMLUni::fgSAX2CoreValidation, true);
parser->setFeature(XMLUni::fgSAX2CoreNameSpaces, true);
// TODO: optional unbuffered mode for more fluent output?
// http://xerces.apache.org/xerces-c/program-sax2-3.html#SAX2Features
// parser->setProperty(XMLUni::fgXercesLowWaterMark, ...);
// parser->setInputBufferSize(...);
RelpipeSaxHandler saxHandler(output);
parser->setContentHandler(&saxHandler);
parser->setLexicalHandler(&saxHandler); // TODO: remove – needed only for comments
parser->setErrorHandler(&saxHandler);
StreamInputSource inputSource(input);
parser->parse(inputSource);
}
};
}
}
}