generic XML parser for arbitrary documents that generates a relation containing the SAX events
--- a/src/XMLCommand.h Fri Jan 11 12:04:45 2019 +0100
+++ b/src/XMLCommand.h Fri Jan 11 15:01:02 2019 +0100
@@ -26,6 +26,7 @@
#include <xercesc/sax2/SAX2XMLReader.hpp>
#include <xercesc/sax2/XMLReaderFactory.hpp>
#include <xercesc/sax2/DefaultHandler.hpp>
+#include <xercesc/sax2/Attributes.hpp>
#include <xercesc/util/XMLString.hpp>
#include <relpipe/writer/typedefs.h>
@@ -46,6 +47,21 @@
private:
unique_ptr<RelationalWriter> writer;
+ std::wstring_convert<std::codecvt_utf8<wchar_t>> convertor; // TODO: support also other encodings.
+
+ // Converts a null-terminated Xerces string to string_t; a null pointer yields an empty string.
+ string_t toString(const XMLCh * const chars) {
+ // XMLCh = char16_t; „All XML data is handled within Xerces-C++ as strings of XMLCh characters. Regardless of the size of the type chosen, the data stored in variables of type XMLCh will always be utf-16 encoded values.“
+ // see https://xerces.apache.org/xerces-c/program-others-3.html
+ // other solution (depends on boost): https://flylib.com/books/en/2.131.1/working_with_xerces_strings.html
+ // TODO: review this text conversion and test on various platforms – NOTE(review): transcode() presumably targets the native code page while convertor decodes UTF-8; confirm for non-UTF-8 locales
+ if (chars == nullptr) return string_t(); // constructing std::string from a null char* is undefined behavior
+ char* x = XMLString::transcode(chars);
+ string s = string(x);
+ XMLString::release(&x);
+ return convertor.from_bytes(s);
+ }
+
public:
RelpipeSaxHandler(std::ostream& output) : DefaultHandler(), writer(Factory::create(output)) {
@@ -56,28 +72,75 @@
// TODO: remove demo
writer->startRelation(L"xml",{
{L"event", TypeId::STRING},
- {L"data", TypeId::STRING},
+ {L"uri", TypeId::STRING},
+ {L"localname", TypeId::STRING},
+ {L"qname", TypeId::STRING},
+ {L"chars", TypeId::STRING}
}, true);
}
void startElement(const XMLCh * const uri, const XMLCh * const localname, const XMLCh * const qname, const Attributes& attrs) override {
writer->writeAttribute(L"startElement");
+ writer->writeAttribute(toString(uri)); // "uri" column of the startElement record
+ writer->writeAttribute(toString(localname)); // "localname" column
+ writer->writeAttribute(toString(qname)); // "qname" column
writer->writeAttribute(L"");
+ // Each XML attribute follows as its own five-value record; the index is XMLSize_t to avoid a signed/unsigned comparison with attrs.getLength().
+ for (XMLSize_t i = 0; i < attrs.getLength(); i++) {
+ writer->writeAttribute(L"attribute");
+ writer->writeAttribute(toString(attrs.getURI(i)));
+ writer->writeAttribute(toString(attrs.getLocalName(i)));
+ writer->writeAttribute(toString(attrs.getQName(i)));
+ writer->writeAttribute(toString(attrs.getValue(i))); // the attribute value goes into the "chars" column
+ }
}
void endElement(const XMLCh * const uri, const XMLCh * const localname, const XMLCh * const qname) override {
writer->writeAttribute(L"endElement");
+ writer->writeAttribute(toString(uri)); // "uri" column of the endElement record
+ writer->writeAttribute(toString(localname)); // "localname" column
+ writer->writeAttribute(toString(qname)); // "qname" column
writer->writeAttribute(L"");
}
void characters(const XMLCh * const chars, const XMLSize_t length) override {
writer->writeAttribute(L"characters");
- writer->writeAttribute(to_wstring(length));
+ writer->writeAttribute(L""); // "uri" column: left empty for this event
+ writer->writeAttribute(L""); // "localname" column: left empty
+ writer->writeAttribute(L""); // "qname" column: left empty
+ writer->writeAttribute(toString(chars)); // "chars" column; NOTE(review): length is ignored and toString() reads up to the terminating null – confirm chars is always null-terminated
+ }
+
+ void comment(const XMLCh * const chars, const XMLSize_t length) override {
+ // Record layout is event, uri, localname, qname, chars – a comment fills only the first and the last value.
+ writer->writeAttribute(L"comment");
+ for (int blank = 0; blank < 3; blank++) writer->writeAttribute(L"");
+ // NOTE(review): length is unused; toString() reads up to the terminating null – confirm the buffer is null-terminated
+ writer->writeAttribute(toString(chars));
+ }
+
+ void startCDATA() override {
+ // One record per event: the event name, then the uri, localname, qname and chars values left empty.
+ writer->writeAttribute(L"startCDATA");
+ for (int emptyColumn = 0; emptyColumn < 4; emptyColumn++) {
+ writer->writeAttribute(L"");
+ }
+ }
+
+ void endCDATA() override {
+ // One record per event: the event name, then the uri, localname, qname and chars values left empty.
+ writer->writeAttribute(L"endCDATA");
+ for (int emptyColumn = 0; emptyColumn < 4; emptyColumn++) {
+ writer->writeAttribute(L"");
+ }
}
void endDocument() override {
writer->writeAttribute(L"endDocument");
writer->writeAttribute(L"");
+ writer->writeAttribute(L""); // the added empty values pad the endDocument record to the five declared columns
+ writer->writeAttribute(L"");
+ writer->writeAttribute(L"");
}
};
@@ -89,9 +152,14 @@
unique_ptr<SAX2XMLReader> parser(XMLReaderFactory::createXMLReader());
parser->setFeature(XMLUni::fgSAX2CoreValidation, true);
parser->setFeature(XMLUni::fgSAX2CoreNameSpaces, true);
+ // TODO: optional unbuffered mode for more fluent output?
+ // http://xerces.apache.org/xerces-c/program-sax2-3.html#SAX2Features
+ // parser->setProperty(XMLUni::fgXercesLowWaterMark, ...);
+ // parser->setInputBufferSize(...);
RelpipeSaxHandler saxHandler(output);
parser->setContentHandler(&saxHandler);
+ parser->setLexicalHandler(&saxHandler); // TODO: remove – needed only for comments
parser->setErrorHandler(&saxHandler);
StreamInputSource inputSource(input);