generic XML parser for arbitrary documents that generates a relation containing the SAX events v_0
author František Kučera <franta-hg@frantovo.cz>
Fri, 11 Jan 2019 15:01:02 +0100
branch v_0
changeset 5 e5cf88ce91ac
parent 4 1363ec0879ca
child 6 be83e0f457a8
generic XML parser for arbitrary documents that generates a relation containing the SAX events
src/XMLCommand.h
--- a/src/XMLCommand.h	Fri Jan 11 12:04:45 2019 +0100
+++ b/src/XMLCommand.h	Fri Jan 11 15:01:02 2019 +0100
@@ -26,6 +26,7 @@
 #include <xercesc/sax2/SAX2XMLReader.hpp>
 #include <xercesc/sax2/XMLReaderFactory.hpp>
 #include <xercesc/sax2/DefaultHandler.hpp>
+#include <xercesc/sax2/Attributes.hpp>
 #include <xercesc/util/XMLString.hpp>
 
 #include <relpipe/writer/typedefs.h>
@@ -46,6 +47,21 @@
 	private:
 		unique_ptr<RelationalWriter> writer;
 
+		std::wstring_convert<std::codecvt_utf8<wchar_t>> convertor; // UTF-8 bytes -> wide string conversion used by toString(). TODO: also support other encodings.
+
+		string_t toString(const XMLCh * const chars) { // converts a Xerces string to string_t; a null pointer yields an empty string
+			// XMLCh = char16_t
+			// „All XML data is handled within Xerces-C++ as strings of XMLCh characters. Regardless of the size of the type chosen, the data stored in variables of type XMLCh will always be utf-16 encoded values.“
+			// see https://xerces.apache.org/xerces-c/program-others-3.html
+			// other solution (depends on boost): https://flylib.com/books/en/2.131.1/working_with_xerces_strings.html
+			// TODO: review this text conversion and test on various platforms – transcode() targets the local code page while the convertor expects UTF-8 bytes
+			if (chars == nullptr) return string_t(); // transcode() gives back null for null input and std::string must not be constructed from a null char*
+			char* x = XMLString::transcode(chars);
+			string s = string(x);
+			XMLString::release(&x);
+			return convertor.from_bytes(s);
+		}
+
 	public:
 
 		RelpipeSaxHandler(std::ostream& output) : DefaultHandler(), writer(Factory::create(output)) {
@@ -56,28 +72,75 @@
 			// TODO: remove demo
 			writer->startRelation(L"xml",{
 				{L"event", TypeId::STRING},
-				{L"data", TypeId::STRING},
+				{L"uri", TypeId::STRING},
+				{L"localname", TypeId::STRING},
+				{L"qname", TypeId::STRING},
+				{L"chars", TypeId::STRING}
 			}, true);
 		}
 
 		void startElement(const XMLCh * const uri, const XMLCh * const localname, const XMLCh * const qname, const Attributes& attrs) override {
 			writer->writeAttribute(L"startElement");
+			writer->writeAttribute(toString(uri)); // "uri" column
+			writer->writeAttribute(toString(localname)); // "localname" column
+			writer->writeAttribute(toString(qname)); // "qname" column
 			writer->writeAttribute(L"");
+			// After the element record, every XML attribute gets its own "attribute" record: uri, localname, qname and its value in the "chars" column.
+			for (XMLSize_t i = 0; i < attrs.getLength(); i++) { // unsigned index type, same as used by the Attributes accessors – no signed/unsigned comparison
+				writer->writeAttribute(L"attribute");
+				writer->writeAttribute(toString(attrs.getURI(i)));
+				writer->writeAttribute(toString(attrs.getLocalName(i)));
+				writer->writeAttribute(toString(attrs.getQName(i)));
+				writer->writeAttribute(toString(attrs.getValue(i)));
+			}
 		}
 
 		void endElement(const XMLCh * const uri, const XMLCh * const localname, const XMLCh * const qname) override {
 			writer->writeAttribute(L"endElement");
+			writer->writeAttribute(toString(uri)); // "uri" column
+			writer->writeAttribute(toString(localname)); // "localname" column
+			writer->writeAttribute(toString(qname)); // "qname" column
 			writer->writeAttribute(L"");
 		}
 
 		void characters(const XMLCh * const chars, const XMLSize_t length) override {
 			writer->writeAttribute(L"characters");
-			writer->writeAttribute(to_wstring(length));
+			writer->writeAttribute(L""); // "uri" column: left empty for text events
+			writer->writeAttribute(L""); // "localname" column: left empty
+			writer->writeAttribute(L""); // "qname" column: left empty
+			writer->writeAttribute(toString(chars)); // "chars" column: the text itself. NOTE(review): length is ignored, toString() is not given a length – confirm the buffer is always terminated
+		}
+
+		void comment(const XMLCh * const chars, const XMLSize_t length) override { // writes one "comment" record; the comment text goes to the "chars" column
+			writer->writeAttribute(L"comment");
+			writer->writeAttribute(L""); // "uri" column: left empty
+			writer->writeAttribute(L""); // "localname" column: left empty
+			writer->writeAttribute(L""); // "qname" column: left empty
+			writer->writeAttribute(toString(chars)); // NOTE(review): length is ignored, toString() is not given a length – confirm the buffer is always terminated
+		}
+
+		void startCDATA() override { // writes one "startCDATA" record; the event carries no data, so the remaining four columns stay empty
+			writer->writeAttribute(L"startCDATA");
+			writer->writeAttribute(L""); // "uri"
+			writer->writeAttribute(L""); // "localname"
+			writer->writeAttribute(L""); // "qname"
+			writer->writeAttribute(L""); // "chars"
+		}
+
+		void endCDATA() override { // writes one "endCDATA" record; the event carries no data, so the remaining four columns stay empty
+			writer->writeAttribute(L"endCDATA");
+			writer->writeAttribute(L""); // "uri"
+			writer->writeAttribute(L""); // "localname"
+			writer->writeAttribute(L""); // "qname"
+			writer->writeAttribute(L""); // "chars"
 		}
 
 		void endDocument() override {
 			writer->writeAttribute(L"endDocument");
 			writer->writeAttribute(L"");
+			writer->writeAttribute(L""); // "localname" column: left empty
+			writer->writeAttribute(L""); // "qname" column: left empty
+			writer->writeAttribute(L""); // "chars" column: left empty, completing the five-column record
 		}
 
 	};
@@ -89,9 +152,14 @@
 		unique_ptr<SAX2XMLReader> parser(XMLReaderFactory::createXMLReader());
 		parser->setFeature(XMLUni::fgSAX2CoreValidation, true);
 		parser->setFeature(XMLUni::fgSAX2CoreNameSpaces, true);
+		// TODO: optional unbuffered mode for more fluent output?
+		// http://xerces.apache.org/xerces-c/program-sax2-3.html#SAX2Features
+		// parser->setProperty(XMLUni::fgXercesLowWaterMark, ...);
+		// parser->setInputBufferSize(...);
 
 		RelpipeSaxHandler saxHandler(output);
 		parser->setContentHandler(&saxHandler);
+		parser->setLexicalHandler(&saxHandler); // TODO: remove – needed only for comments
 		parser->setErrorHandler(&saxHandler);
 
 		StreamInputSource inputSource(input);