# HG changeset patch # User František Kučera # Date 1554564822 -7200 # Node ID 8dfb42e5c08881f2f8a9d572adaf97c7b7ea5c1e # Parent 515a697cc9cdb48f7927b19e7c4b0f5c7516bb9f parse recfile (logical lines) diff -r 515a697cc9cd -r 8dfb42e5c088 src/RecfileCommand.h --- a/src/RecfileCommand.h Fri Apr 05 18:02:19 2019 +0200 +++ b/src/RecfileCommand.h Sat Apr 06 17:33:42 2019 +0200 @@ -35,7 +35,9 @@ enum class RecfileLineType { METADATA, - DATA + DATA, + SEPARATOR, + COMMENT, }; class RecfileHandler { @@ -49,17 +51,38 @@ virtual ~RecfileHandler() { } - void logicalLine(const string_t& name, const string_t& value, RecfileLineType type) { - std::wcerr << L"logicalLine(" << name << L", " << value << L", " << (int) type << L");" << std::endl; // TODO: remove debug + void logicalLine(RecfileLineType type, const string_t& name = L"", const string_t& value = L"") { + std::wcerr << L"logicalLine(" << (int) type << L", " << name << L", " << value << L");" << std::endl; // TODO: remove debug // TODO: writer->startRelation() // TODO: writer->writeAttribute() } }; + enum class ParserState { + START, + NAME, + VALUE, + VALUE_CONTINUATION, + COMMENT, + END, + }; + class RecfileParser { private: + wstring_convert<codecvt_utf8<wchar_t>> convertor; // TODO: support also other encodings or are recfiles always in UTF-8? 
RecfileHandler& handler; + + void emitLogicalLine(RecfileLineType& type, std::stringstream& name, std::stringstream& value) { + handler.logicalLine(type, convertor.from_bytes(name.str()), convertor.from_bytes(value.str())); + + name.str(""); + name.clear(); + value.str(""); + value.clear(); + type = RecfileLineType::DATA; + } + public: RecfileParser(RecfileHandler& handler) : handler(handler) { @@ -69,15 +92,72 @@ } void parse(std::istream& input) { - // TODO: parse - handler.logicalLine(L"nnn", L"vvv", RecfileLineType::METADATA); // TODO: remove debug - handler.logicalLine(L"nnn", L"vvv", RecfileLineType::DATA); // TODO: remove debug + + ParserState state = ParserState::START; + RecfileLineType type = RecfileLineType::DATA; + std::stringstream name; + std::stringstream value; + char ch; + + while (state != ParserState::END && input.good()) { + ch = input.get(); + if (input.eof()) continue; + + switch (state) { + case ParserState::START: + if (ch == '%') { + type = RecfileLineType::METADATA; + break; + } else if (ch == ' ') { + break; + } else if (ch == '\n') { + handler.logicalLine(RecfileLineType::SEPARATOR); + break; + } else if (ch == '#') { + type = RecfileLineType::COMMENT; + state = ParserState::COMMENT; + if (input.get() != ' ') input.unget(); + break; + } // else → name + case ParserState::NAME: + if (ch == ':') { + state = ParserState::VALUE; + if (input.get() != ' ') input.unget(); + } else { + name << ch; + } + break; + case ParserState::VALUE: + if (ch == '\n') state = ParserState::VALUE_CONTINUATION; + else value << ch; + break; + case ParserState::VALUE_CONTINUATION: + if (ch == '+') { + state = ParserState::VALUE; + if (value.tellp()) value << '\n'; + if (input.get() != ' ') input.unget(); + } else { + input.unget(); + state = ParserState::START; + emitLogicalLine(type, name, value); + } + break; + case ParserState::COMMENT: + if (ch == '\n') { + state = ParserState::START; + emitLogicalLine(type, name, value); + } else { + value << ch; + } + break; 
+ default: + throw RelpipeWriterException(L"Unknown ParserState: " + std::to_wstring((int) state) + L" in RecfileParser."); // TODO: better exception + } + } + emitLogicalLine(type, name, value); } - }; - - public: void process(std::istream& input, std::ostream& output) {