src/lib/INIReader.cpp
branchv_0
changeset 26 80e129ec3408
parent 25 ee70b17950bd
child 28 0e7c57d48d1e
equal deleted inserted replaced
25:ee70b17950bd 26:80e129ec3408
    15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
    15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
    16  */
    16  */
    17 
    17 
    18 #include <vector>
    18 #include <vector>
    19 #include <regex>
    19 #include <regex>
       
    20 #include <sstream>
       
    21 #include <stdexcept>
    20 
    22 
    21 #include "INIReader.h"
    23 #include "INIReader.h"
       
    24 
       
    25 namespace relpipe {
       
    26 namespace in {
       
    27 namespace ini {
       
    28 namespace lib {
    22 
    29 
    23 class INIReaderImpl : public INIReader {
    30 class INIReaderImpl : public INIReader {
    24 private:
    31 private:
    25 	std::istream& input;
    32 	std::istream& input;
    26 	std::vector<INIContentHandler*> handlers;
    33 	std::vector<INIContentHandler*> handlers;
       
    34 
       
    35 	/** 
       
    36 	 * This might be configurable.
       
    37 	 * 
       
    38 	 * By default, we ignore all leading whitespace on continuing lines.
       
    39 	 * If there should be some spaces or tabs, they should be placed on the previous line before the „\“.
       
    40 	 * If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '.
       
    41 	 * 
       
    42 	 * Related specifications:
       
    43 	 *  - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html
       
    44 	 */
       
    45 	bool consumeLeadingSpacesOnContinuingLines = true;
       
    46 
       
    47 	/**
       
    48 	 * This might be configurable.
       
    49 	 * 
       
    50 	 * KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
       
    51 	 * Line „[section_1][$i]“ means that the „section_1“ is „locked“.
       
    52 	 * We may emit this information somehow later, but for now, it is just ignored.
       
    53 	 * 
       
    54 	 * TODO: Is „section tag“ right name?
       
    55 	 * 
       
    56 	 * Related specifications:
       
    57 	 *  - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Lock_Down
       
    58 	 */
       
    59 	bool allowSectionTags = true;
       
    60 
       
    61 	/**
       
    62 	 * This might be configurable.
       
    63 	 * 
       
    64 	 * If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key.
       
    65 	 * No \[ escaping is currently supported, so the key might not contain the bracket character.
       
    66 	 * 
       
    67 	 * Related specifications:
       
    68 	 *  - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Shell_Expansion
       
    69 	 *  - https://specifications.freedesktop.org/desktop-entry-spec/latest/ar01s05.html
       
    70 	 */
       
    71 	bool allowSubKeys = true;
       
    72 
       
    73 	int lineNumber = 1;
       
    74 	int eventNumber = 0;
       
    75 
       
    76 	/**
       
    77 	 * Should be always used instead of input.peek().
       
    78 	 * Skips \r.
       
    79 	 */
       
    80 	char peek() {
       
    81 		// In 2020 there is no need to manually return the carriage. However some legacy systems still do it.
       
    82 		char ch = input.peek();
       
    83 		if (ch == '\r') {
       
    84 			input.get();
       
    85 			ch = input.peek();
       
    86 		}
       
    87 		return ch;
       
    88 	}
       
    89 
       
    90 	/**
       
    91 	 * Should be always used instead of input.get().
       
    92 	 * Counts the lines and skips \r.
       
    93 	 */
       
    94 	char get() {
       
    95 		char ch = input.get();
       
    96 		if (ch == '\n') lineNumber++;
       
    97 		else if (ch == '\r') ch = get();
       
    98 		return ch;
       
    99 	}
       
   100 
       
   101 	std::string readSpacesAndTabs() {
       
   102 		std::stringstream result;
       
   103 		for (char ch = peek(); input.good() && (ch == ' ' || ch == '\t'); ch = peek()) result.put(get());
       
   104 		return result.str();
       
   105 	}
       
   106 
       
   107 	std::string readAllWhitespace() {
       
   108 		std::stringstream result;
       
   109 		for (char ch = peek(); input.good() && (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'); ch = peek()) result.put(get());
       
   110 		return result.str();
       
   111 	}
       
   112 
       
   113 	void processContinuingLine(std::stringstream& result) {
       
   114 		if (consumeLeadingSpacesOnContinuingLines) readSpacesAndTabs();
       
   115 		else result.put('\n');
       
   116 	}
       
   117 
       
   118 	std::string readUntil(char until, bool* found = nullptr) {
       
   119 		std::stringstream result;
       
   120 
       
   121 		for (char ch = peek(); input.good() && ch != until; ch = peek()) {
       
   122 			if (ch == '\\') {
       
   123 				get();
       
   124 				ch = get();
       
   125 				if (ch == until && ch == '\n') processContinuingLine(result);
       
   126 				else if (ch == until) result.put(ch);
       
   127 				else if (ch == std::istream::traits_type::eof()) break;
       
   128 				else result.put('\\').put(ch);
       
   129 				// TODO: two-stage and modular unescaping: here unescape only \+LF or more genereally: unescape only the until character and rest leave untouched
       
   130 				// second escaping stage move to separate class/wrapper (similar to hierarchical wrappers)
       
   131 			} else {
       
   132 				ch = get();
       
   133 				result.put(ch);
       
   134 			}
       
   135 		}
       
   136 
       
   137 		if (peek() == until) {
       
   138 			get();
       
   139 			if (found) *found = true;
       
   140 		} else {
       
   141 			if (found) *found = false;
       
   142 		}
       
   143 
       
   144 		return result.str();
       
   145 	}
       
   146 
       
   147 	std::string readToken(char until, char* quote = nullptr, bool* found = nullptr) {
       
   148 		std::string result;
       
   149 
       
   150 		char ch = peek();
       
   151 		if (isQuote(ch)) {
       
   152 			if (quote) *quote = ch;
       
   153 			result = readUntil(get(), found);
       
   154 		} else {
       
   155 			if (quote) *quote = 0;
       
   156 			result = readUntil(until, found);
       
   157 		}
       
   158 
       
   159 		return result;
       
   160 	}
       
   161 
       
   162 	std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) {
       
   163 		std::string result = readToken(until, quote, found);
       
   164 		if (*quote) {
       
   165 			readAllWhitespace();
       
   166 			if (get() != until) throw std::logic_error(std::string("missing „") + std::string(1, until) + "“ after quoted section name");
       
   167 		}
       
   168 		return result;
       
   169 	}
       
   170 
       
   171 	bool isComment(char ch) {
       
   172 		return ch == '#' || ch == ';';
       
   173 	}
       
   174 
       
   175 	bool isQuote(char ch) {
       
   176 		return ch == '"' || ch == '\'';
       
   177 	}
       
   178 
       
   179 	std::string trim(std::string s) {
       
   180 		return std::regex_replace(s, std::regex("^\\s+|\\s+$"), "");
       
   181 	}
       
   182 
    27 public:
   183 public:
    28 
   184 
    29 	INIReaderImpl(std::istream& input) : input(input) {
   185 	INIReaderImpl(std::istream& input) : input(input) {
    30 	}
   186 	}
    31 
   187 
    32 	void addHandler(INIContentHandler* handler) override {
   188 	void addHandler(INIContentHandler* handler) override {
    33 		handlers.push_back(handler);
   189 		handlers.push_back(handler);
    34 	}
   190 	}
    35 
   191 
    36 	void process() override {
   192 	void process() override {
    37 
       
    38 		for (INIContentHandler* handler : handlers) handler->startDocument();
   193 		for (INIContentHandler* handler : handlers) handler->startDocument();
    39 
   194 
    40 		std::regex whitespacePattrern("\\s*");
       
    41 		std::regex commentPattrern("\\s*(;|#)\\s*(.*)");
       
    42 		std::regex sectionPattrern("\\s*\\[\\s*([^\\]]+)\\s*\\]\\s*(\\[\\s*([^\\]]+)\\s*\\])?\\s*((;|#)\\s*(.*))?");
       
    43 		std::regex entryQuotedPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(\"|')((?:(?!\\5).)*)(\\5)?\\s*((;|#)\\s*(.*))?");
       
    44 		std::regex entryPlainPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(.*?)\\s*");
       
    45 
       
    46 		std::smatch match;
       
    47 		bool inSection = false;
   195 		bool inSection = false;
    48 		std::string line;
   196 
    49 		int lineNumber = 0;
   197 		while (input.good()) { // TODO: condition
    50 		int eventNumber = 0;
   198 			{
    51 
   199 				std::string whitespace = readAllWhitespace();
    52 
   200 				if (whitespace.size()) {
    53 		while (std::getline(input, line)) {
   201 					INIContentHandler::WhitespaceEvent event;
    54 			lineNumber++;
   202 					event.lineNumber = lineNumber;
    55 
   203 					event.eventNumber = ++eventNumber;
    56 			if (std::regex_match(line, match, whitespacePattrern)) {
   204 					event.whitespace = whitespace;
    57 				INIContentHandler::WhitespaceEvent event;
   205 					for (INIContentHandler* handler : handlers) handler->whitespace(event);
       
   206 				}
       
   207 			}
       
   208 
       
   209 			bool found;
       
   210 			char quote;
       
   211 
       
   212 			char ch = peek();
       
   213 
       
   214 			if (ch == std::istream::traits_type::eof()) {
       
   215 				break;
       
   216 			} else if (ch == '[') {
       
   217 				if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
       
   218 				inSection = true;
       
   219 				get();
       
   220 				readAllWhitespace();
       
   221 				INIContentHandler::SectionStartEvent event;
    58 				event.lineNumber = lineNumber;
   222 				event.lineNumber = lineNumber;
    59 				event.eventNumber = ++eventNumber;
   223 				event.eventNumber = ++eventNumber;
    60 				event.whitespace = match[0];
   224 				event.name = readTokenAndEatTerminator(']', &quote, &found);
    61 				for (INIContentHandler* handler : handlers) handler->whitespace(event);
   225 
    62 			} else if (std::regex_match(line, match, commentPattrern)) {
   226 				readSpacesAndTabs();
       
   227 				if (allowSectionTags && peek() == '[') {
       
   228 					get();
       
   229 					event.tag = readTokenAndEatTerminator(']', &quote, &found);
       
   230 				}
       
   231 
       
   232 				readSpacesAndTabs();
       
   233 				ch = peek();
       
   234 				if (isComment(ch)) {
       
   235 					get();
       
   236 					readSpacesAndTabs();
       
   237 					event.comment = readUntil('\n', &found);
       
   238 				} else if (ch == '\n') {
       
   239 					get();
       
   240 				} else {
       
   241 					throw std::logic_error(std::string("unexpected content after the section: '") + event.name + "'");
       
   242 				}
       
   243 
       
   244 				for (INIContentHandler* handler : handlers) handler->startSection(event);
       
   245 			} else if (isComment(ch)) {
       
   246 				get();
       
   247 				readSpacesAndTabs();
    63 				INIContentHandler::CommentEvent event;
   248 				INIContentHandler::CommentEvent event;
    64 				event.lineNumber = lineNumber;
   249 				event.lineNumber = lineNumber;
    65 				event.eventNumber = ++eventNumber;
   250 				event.eventNumber = ++eventNumber;
    66 				event.comment = match[2];
   251 				event.comment = readUntil('\n', &found);
    67 				for (INIContentHandler* handler : handlers) handler->comment(event);
   252 				for (INIContentHandler* handler : handlers) handler->comment(event);
    68 			} else if (std::regex_match(line, match, sectionPattrern)) {
   253 			} else {
    69 				if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
   254 				std::string fullKey = readToken('=', &quote, &found);
    70 				inSection = true;
   255 				if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'");
    71 				INIContentHandler::SectionStartEvent event;
   256 				if (!quote) fullKey = trim(fullKey);
    72 				event.lineNumber = lineNumber;
   257 				readSpacesAndTabs();
    73 				event.eventNumber = ++eventNumber;
   258 
    74 				event.name = match[1];
   259 				if (quote) {
    75 				event.comment = match[6];
   260 					ch = get();
    76 				// event.tag = match[3];
   261 					if (ch == '=') readSpacesAndTabs();
    77 				// KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section.
   262 					else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'");
    78 				// see <https://userbase.kde.org/KDE_System_Administration/Configuration_Files>, „[$i]“ means that the section is „locked“
   263 				}
    79 				// We may emit this information somehow later, but for now, it is just ignored.
   264 
    80 				for (INIContentHandler* handler : handlers) handler->startSection(event);
   265 				std::string value = readToken('\n', &quote, &found);
    81 			} else if (std::regex_match(line, match, entryQuotedPattrern)) {
   266 				if (!quote) value = trim(value);
       
   267 
    82 				INIContentHandler::EntryEvent event;
   268 				INIContentHandler::EntryEvent event;
    83 				event.lineNumber = lineNumber;
   269 				event.lineNumber = lineNumber;
    84 				event.eventNumber = ++eventNumber;
   270 				event.eventNumber = ++eventNumber;
    85 				event.key = match[2];
   271 				event.key = fullKey;
    86 				event.subKey = match[4];
   272 				event.fullKey = fullKey;
    87 				event.fullKey = match[1];
   273 				event.value = value;
    88 				event.value = match[6];
   274 
    89 				event.comment = match[10];
   275 				if (allowSubKeys) {
    90 
   276 					std::smatch match;
    91 				// the "/' at the end is missing → line continues
   277 					if (std::regex_match(fullKey, match, std::regex("([^\\[]+)\\[([^\\[]+)\\]"))) {
    92 				if (match.length(7) == 0) {
   278 						event.key = match[1];
    93 					std::regex endPattern(std::string("(.*?)") + (match[5] == "'" ? "'" : "\"") + "\\s*((;|#)\\s*(.*))?");
   279 						event.subKey = match[2];
    94 					while (std::getline(input, line)) {
   280 						event.fullKey = fullKey;
    95 						lineNumber++;
       
    96 						event.value += "\n";
       
    97 						if (std::regex_match(line, match, endPattern)) {
       
    98 							event.value += std::string(match[1]);
       
    99 							event.comment = match[4];
       
   100 							break;
       
   101 						} else {
       
   102 							event.value += line;
       
   103 						}
       
   104 					}
   281 					}
   105 				}
   282 				}
   106 
   283 
       
   284 				if (quote) {
       
   285 					readSpacesAndTabs();
       
   286 					ch = peek();
       
   287 					if (isComment(ch)) {
       
   288 						get();
       
   289 						readSpacesAndTabs();
       
   290 						event.comment = readUntil('\n', &found);
       
   291 					} else if (ch == '\n') {
       
   292 						get();
       
   293 					} else {
       
   294 						throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'");
       
   295 					}
       
   296 				}
       
   297 
   107 				for (INIContentHandler* handler : handlers) handler->entry(event);
   298 				for (INIContentHandler* handler : handlers) handler->entry(event);
   108 			} else if (std::regex_match(line, match, entryPlainPattrern)) {
       
   109 				INIContentHandler::EntryEvent event;
       
   110 				event.lineNumber = lineNumber;
       
   111 				event.eventNumber = ++eventNumber;
       
   112 				event.key = match[2];
       
   113 				event.subKey = match[4];
       
   114 				event.fullKey = match[1];
       
   115 				event.value = match[5];
       
   116 
       
   117 				// the \ at the end → line continues
       
   118 				while (line.back() == '\\' && std::getline(input, line)) {
       
   119 					lineNumber++;
       
   120 					line = std::regex_replace(line, std::regex("^\\s+|\\s+$"), ""); // trim the spaces: continuing lines might be aligned to the first line (desired spaces – if any – should be at the line end before the \ character)
       
   121 					event.value = event.value.substr(0, event.value.size() - 1); // cut the trailing \ backslash
       
   122 					event.value = event.value + line;
       
   123 				}
       
   124 
       
   125 				for (INIContentHandler* handler : handlers) handler->entry(event);
       
   126 			} else {
       
   127 				// TODO: warning, error, or support unknown content
       
   128 			}
   299 			}
   129 
   300 		}
   130 			// General feautres:
   301 		// TODO: error at the end, catch premature/unexpected EOF
   131 			// TODO: probably switch to state-machine approach instead of regular expressions or use an existing library
   302 		// TODO: unescape + trim values + ignore \r
   132 			// TODO: warning/error handler
   303 		// TODO: count lines
   133 			// TODO: support also quoted or multiline keys?
       
   134 			// TODO: support also escaped characters
       
   135 			// TODO: support also Java .properties and manifest.mf formats?
       
   136 			// TODO: support also quoted sections ["qoted section"] – useful for hierarchy (the path element may contain the separator character)
       
   137 			// TODO: support also nested sections – hierarchy
       
   138 			// TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections
       
   139 			// TODO: support also option for alternative key-value separator (: instead of =)
       
   140 			// TODO: support also other encodings (currently only UTF-8 is supported)
       
   141 
       
   142 			// Lossless conversions:
       
   143 			// TODO: emit also the quote style ('/"/)
       
   144 			// TODO: emit also the comment style (;/#) ?
       
   145 			// TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ?
       
   146 			// TODO: emit also the line-end type (LF/CRLF) ?
       
   147 		}
       
   148 
       
   149 		if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
   304 		if (inSection) for (INIContentHandler* handler : handlers) handler->endSection();
   150 
       
   151 		for (INIContentHandler* handler : handlers) handler->endDocument();
   305 		for (INIContentHandler* handler : handlers) handler->endDocument();
   152 	}
   306 	}
       
   307 
       
   308 	// General feautres:
       
   309 	// TODO: warning/error handler
       
   310 	// TODO: support also escaped characters
       
   311 	// TODO: support also Java .properties and manifest.mf formats?
       
   312 	// TODO: support also nested sections – hierarchy
       
   313 	// TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections
       
   314 	// TODO: support also option for alternative key-value separator (: instead of =)
       
   315 	// TODO: support also other encodings (currently only UTF-8 is supported)
       
   316 	// TODO: better exceptions
       
   317 
       
   318 	// Lossless conversions:
       
   319 	// TODO: emit also the quote style ('/"/)
       
   320 	// TODO: emit also the comment style (;/#) ?
       
   321 	// TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ?
       
   322 	// TODO: emit also the line-end type (LF/CRLF) ?
       
   323 
   153 };
   324 };
   154 
   325 
   155 INIReader* INIReader::create(std::istream& input) {
   326 INIReader* INIReader::create(std::istream& input) {
   156 	return new INIReaderImpl(input);
   327 	return new INIReaderImpl(input);
   157 }
   328 }
       
   329 
       
   330 }
       
   331 }
       
   332 }
       
   333 }