# HG changeset patch # User František Kučera # Date 1547215262 -3600 # Node ID e5cf88ce91ac00d0f149b2a932f801e4e04575b2 # Parent 1363ec0879caa2debcad429f0c8f93082a157d5f generic XML parser for arbitrary documents that generates a relation containing the SAX events diff -r 1363ec0879ca -r e5cf88ce91ac src/XMLCommand.h --- a/src/XMLCommand.h Fri Jan 11 12:04:45 2019 +0100 +++ b/src/XMLCommand.h Fri Jan 11 15:01:02 2019 +0100 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,21 @@ private: unique_ptr writer; + std::wstring_convert> convertor; // TODO: support also other encodings. + + string_t toString(const XMLCh * const chars) { + // XMLCh = char16_t + // „All XML data is handled within Xerces-C++ as strings of XMLCh characters. Regardless of the size of the type chosen, the data stored in variables of type XMLCh will always be utf-16 encoded values.“ + // see https://xerces.apache.org/xerces-c/program-others-3.html + // other solution (depends on boost): https://flylib.com/books/en/2.131.1/working_with_xerces_strings.html + + // TODO: review this text conversion and test on various platforms + char* x = XMLString::transcode(chars); + string s = string(x); + XMLString::release(&x); + return convertor.from_bytes(s); + } + public: RelpipeSaxHandler(std::ostream& output) : DefaultHandler(), writer(Factory::create(output)) { @@ -56,28 +72,75 @@ // TODO: remove demo writer->startRelation(L"xml",{ {L"event", TypeId::STRING}, - {L"data", TypeId::STRING}, + {L"uri", TypeId::STRING}, + {L"localname", TypeId::STRING}, + {L"qname", TypeId::STRING}, + {L"chars", TypeId::STRING} }, true); } void startElement(const XMLCh * const uri, const XMLCh * const localname, const XMLCh * const qname, const Attributes& attrs) override { writer->writeAttribute(L"startElement"); + writer->writeAttribute(toString(uri)); + writer->writeAttribute(toString(localname)); + writer->writeAttribute(toString(qname)); writer->writeAttribute(L""); + + for (int i = 0; i < attrs.getLength(); i++) { + writer->writeAttribute(L"attribute"); + writer->writeAttribute(toString(attrs.getURI(i))); + writer->writeAttribute(toString(attrs.getLocalName(i))); + writer->writeAttribute(toString(attrs.getQName(i))); + writer->writeAttribute(toString(attrs.getValue(i))); + } } void endElement(const XMLCh * const uri, const XMLCh * const localname, const XMLCh * const qname) override { writer->writeAttribute(L"endElement"); + writer->writeAttribute(toString(uri)); + writer->writeAttribute(toString(localname)); + writer->writeAttribute(toString(qname)); writer->writeAttribute(L""); } void characters(const XMLCh * const chars, const XMLSize_t length) override { writer->writeAttribute(L"characters"); - writer->writeAttribute(to_wstring(length)); + writer->writeAttribute(L""); + writer->writeAttribute(L""); + writer->writeAttribute(L""); + writer->writeAttribute(toString(chars)); + } + + void comment(const XMLCh * const chars, const XMLSize_t length) override { + writer->writeAttribute(L"comment"); + writer->writeAttribute(L""); + writer->writeAttribute(L""); + writer->writeAttribute(L""); + writer->writeAttribute(toString(chars)); + } + + void startCDATA() override { + writer->writeAttribute(L"startCDATA"); + writer->writeAttribute(L""); + writer->writeAttribute(L""); + writer->writeAttribute(L""); + writer->writeAttribute(L""); + } + + void endCDATA() override { + writer->writeAttribute(L"endCDATA"); + writer->writeAttribute(L""); + writer->writeAttribute(L""); + writer->writeAttribute(L""); + writer->writeAttribute(L""); } void endDocument() override { writer->writeAttribute(L"endDocument"); writer->writeAttribute(L""); + writer->writeAttribute(L""); + writer->writeAttribute(L""); + writer->writeAttribute(L""); } }; @@ -89,9 +152,14 @@ unique_ptr parser(XMLReaderFactory::createXMLReader()); parser->setFeature(XMLUni::fgSAX2CoreValidation, true); parser->setFeature(XMLUni::fgSAX2CoreNameSpaces, true); + // TODO: optional unbuffered mode for more fluent output? + // http://xerces.apache.org/xerces-c/program-sax2-3.html#SAX2Features + // parser->setProperty(XMLUni::fgXercesLowWaterMark, ...); + // parser->setInputBufferSize(...); RelpipeSaxHandler saxHandler(output); parser->setContentHandler(&saxHandler); + parser->setLexicalHandler(&saxHandler); // TODO: remove – needed only for comments parser->setErrorHandler(&saxHandler); StreamInputSource inputSource(input);