 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <istream>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include "INIReader.h"

25 namespace relpipe { |
|
26 namespace in { |
|
27 namespace ini { |
|
28 namespace lib { |
22 |
29 |
23 class INIReaderImpl : public INIReader { |
30 class INIReaderImpl : public INIReader { |
24 private: |
31 private: |
25 std::istream& input; |
32 std::istream& input; |
26 std::vector<INIContentHandler*> handlers; |
33 std::vector<INIContentHandler*> handlers; |
|
34 |
|
35 /** |
|
36 * This might be configurable. |
|
37 * |
|
38 * By default, we ignore all leading whitespace on continuing lines. |
|
39 * If there should be some spaces or tabs, they should be placed on the previous line before the „\“. |
|
40 * If a line break is desired, it should be written as \n (escaped) or the value should be quoted in " or '. |
|
41 * |
|
42 * Related specifications: |
|
43 * - https://docs.oracle.com/javase/8/docs/api/index.html?java/util/Properties.html |
|
44 */ |
|
45 bool consumeLeadingSpacesOnContinuingLines = true; |
|
46 |
|
47 /** |
|
48 * This might be configurable. |
|
49 * |
|
50 * KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section. |
|
51 * Line „[section_1][$i]“ means that the „section_1“ is „locked“. |
|
52 * We may emit this information somehow later, but for now, it is just ignored. |
|
53 * |
|
54 * TODO: Is „section tag“ right name? |
|
55 * |
|
56 * Related specifications: |
|
57 * - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Lock_Down |
|
58 */ |
|
59 bool allowSectionTags = true; |
|
60 |
|
61 /** |
|
62 * This might be configurable. |
|
63 * |
|
64 * If whole key is „aaa[bbb]“ then „aaa“ is considered to be the key and „bbb“ the sub-key. |
|
65 * No \[ escaping is currently supported, so the key might not contain the bracket character. |
|
66 * |
|
67 * Related specifications: |
|
68 * - https://userbase.kde.org/KDE_System_Administration/Configuration_Files#Shell_Expansion |
|
69 * - https://specifications.freedesktop.org/desktop-entry-spec/latest/ar01s05.html |
|
70 */ |
|
71 bool allowSubKeys = true; |
|
72 |
|
73 int lineNumber = 1; |
|
74 int eventNumber = 0; |
|
75 |
|
76 /** |
|
77 * Should be always used instead of input.peek(). |
|
78 * Skips \r. |
|
79 */ |
|
80 char peek() { |
|
81 // In 2020 there is no need to manually return the carriage. However some legacy systems still do it. |
|
82 char ch = input.peek(); |
|
83 if (ch == '\r') { |
|
84 input.get(); |
|
85 ch = input.peek(); |
|
86 } |
|
87 return ch; |
|
88 } |
|
89 |
|
90 /** |
|
91 * Should be always used instead of input.get(). |
|
92 * Counts the lines and skips \r. |
|
93 */ |
|
94 char get() { |
|
95 char ch = input.get(); |
|
96 if (ch == '\n') lineNumber++; |
|
97 else if (ch == '\r') ch = get(); |
|
98 return ch; |
|
99 } |
|
100 |
|
101 std::string readSpacesAndTabs() { |
|
102 std::stringstream result; |
|
103 for (char ch = peek(); input.good() && (ch == ' ' || ch == '\t'); ch = peek()) result.put(get()); |
|
104 return result.str(); |
|
105 } |
|
106 |
|
107 std::string readAllWhitespace() { |
|
108 std::stringstream result; |
|
109 for (char ch = peek(); input.good() && (ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'); ch = peek()) result.put(get()); |
|
110 return result.str(); |
|
111 } |
|
112 |
|
113 void processContinuingLine(std::stringstream& result) { |
|
114 if (consumeLeadingSpacesOnContinuingLines) readSpacesAndTabs(); |
|
115 else result.put('\n'); |
|
116 } |
|
117 |
|
118 std::string readUntil(char until, bool* found = nullptr) { |
|
119 std::stringstream result; |
|
120 |
|
121 for (char ch = peek(); input.good() && ch != until; ch = peek()) { |
|
122 if (ch == '\\') { |
|
123 get(); |
|
124 ch = get(); |
|
125 if (ch == until && ch == '\n') processContinuingLine(result); |
|
126 else if (ch == until) result.put(ch); |
|
127 else if (ch == std::istream::traits_type::eof()) break; |
|
128 else result.put('\\').put(ch); |
|
129 // TODO: two-stage and modular unescaping: here unescape only \+LF or more genereally: unescape only the until character and rest leave untouched |
|
130 // second escaping stage move to separate class/wrapper (similar to hierarchical wrappers) |
|
131 } else { |
|
132 ch = get(); |
|
133 result.put(ch); |
|
134 } |
|
135 } |
|
136 |
|
137 if (peek() == until) { |
|
138 get(); |
|
139 if (found) *found = true; |
|
140 } else { |
|
141 if (found) *found = false; |
|
142 } |
|
143 |
|
144 return result.str(); |
|
145 } |
|
146 |
|
147 std::string readToken(char until, char* quote = nullptr, bool* found = nullptr) { |
|
148 std::string result; |
|
149 |
|
150 char ch = peek(); |
|
151 if (isQuote(ch)) { |
|
152 if (quote) *quote = ch; |
|
153 result = readUntil(get(), found); |
|
154 } else { |
|
155 if (quote) *quote = 0; |
|
156 result = readUntil(until, found); |
|
157 } |
|
158 |
|
159 return result; |
|
160 } |
|
161 |
|
162 std::string readTokenAndEatTerminator(char until, char* quote = nullptr, bool* found = nullptr) { |
|
163 std::string result = readToken(until, quote, found); |
|
164 if (*quote) { |
|
165 readAllWhitespace(); |
|
166 if (get() != until) throw std::logic_error(std::string("missing „") + std::string(1, until) + "“ after quoted section name"); |
|
167 } |
|
168 return result; |
|
169 } |
|
170 |
|
171 bool isComment(char ch) { |
|
172 return ch == '#' || ch == ';'; |
|
173 } |
|
174 |
|
175 bool isQuote(char ch) { |
|
176 return ch == '"' || ch == '\''; |
|
177 } |
|
178 |
|
179 std::string trim(std::string s) { |
|
180 return std::regex_replace(s, std::regex("^\\s+|\\s+$"), ""); |
|
181 } |
|
182 |
27 public: |
183 public: |
28 |
184 |
29 INIReaderImpl(std::istream& input) : input(input) { |
185 INIReaderImpl(std::istream& input) : input(input) { |
30 } |
186 } |
31 |
187 |
32 void addHandler(INIContentHandler* handler) override { |
188 void addHandler(INIContentHandler* handler) override { |
33 handlers.push_back(handler); |
189 handlers.push_back(handler); |
34 } |
190 } |
35 |
191 |
36 void process() override { |
192 void process() override { |
37 |
|
38 for (INIContentHandler* handler : handlers) handler->startDocument(); |
193 for (INIContentHandler* handler : handlers) handler->startDocument(); |
39 |
194 |
40 std::regex whitespacePattrern("\\s*"); |
|
41 std::regex commentPattrern("\\s*(;|#)\\s*(.*)"); |
|
42 std::regex sectionPattrern("\\s*\\[\\s*([^\\]]+)\\s*\\]\\s*(\\[\\s*([^\\]]+)\\s*\\])?\\s*((;|#)\\s*(.*))?"); |
|
43 std::regex entryQuotedPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(\"|')((?:(?!\\5).)*)(\\5)?\\s*((;|#)\\s*(.*))?"); |
|
44 std::regex entryPlainPattrern("\\s*(([^=\\]]+?[^=\\s\\]]*)(\\[([^\\]]+)\\])?)\\s*=\\s*(.*?)\\s*"); |
|
45 |
|
46 std::smatch match; |
|
47 bool inSection = false; |
195 bool inSection = false; |
48 std::string line; |
196 |
49 int lineNumber = 0; |
197 while (input.good()) { // TODO: condition |
50 int eventNumber = 0; |
198 { |
51 |
199 std::string whitespace = readAllWhitespace(); |
52 |
200 if (whitespace.size()) { |
53 while (std::getline(input, line)) { |
201 INIContentHandler::WhitespaceEvent event; |
54 lineNumber++; |
202 event.lineNumber = lineNumber; |
55 |
203 event.eventNumber = ++eventNumber; |
56 if (std::regex_match(line, match, whitespacePattrern)) { |
204 event.whitespace = whitespace; |
57 INIContentHandler::WhitespaceEvent event; |
205 for (INIContentHandler* handler : handlers) handler->whitespace(event); |
|
206 } |
|
207 } |
|
208 |
|
209 bool found; |
|
210 char quote; |
|
211 |
|
212 char ch = peek(); |
|
213 |
|
214 if (ch == std::istream::traits_type::eof()) { |
|
215 break; |
|
216 } else if (ch == '[') { |
|
217 if (inSection) for (INIContentHandler* handler : handlers) handler->endSection(); |
|
218 inSection = true; |
|
219 get(); |
|
220 readAllWhitespace(); |
|
221 INIContentHandler::SectionStartEvent event; |
58 event.lineNumber = lineNumber; |
222 event.lineNumber = lineNumber; |
59 event.eventNumber = ++eventNumber; |
223 event.eventNumber = ++eventNumber; |
60 event.whitespace = match[0]; |
224 event.name = readTokenAndEatTerminator(']', "e, &found); |
61 for (INIContentHandler* handler : handlers) handler->whitespace(event); |
225 |
62 } else if (std::regex_match(line, match, commentPattrern)) { |
226 readSpacesAndTabs(); |
|
227 if (allowSectionTags && peek() == '[') { |
|
228 get(); |
|
229 event.tag = readTokenAndEatTerminator(']', "e, &found); |
|
230 } |
|
231 |
|
232 readSpacesAndTabs(); |
|
233 ch = peek(); |
|
234 if (isComment(ch)) { |
|
235 get(); |
|
236 readSpacesAndTabs(); |
|
237 event.comment = readUntil('\n', &found); |
|
238 } else if (ch == '\n') { |
|
239 get(); |
|
240 } else { |
|
241 throw std::logic_error(std::string("unexpected content after the section: '") + event.name + "'"); |
|
242 } |
|
243 |
|
244 for (INIContentHandler* handler : handlers) handler->startSection(event); |
|
245 } else if (isComment(ch)) { |
|
246 get(); |
|
247 readSpacesAndTabs(); |
63 INIContentHandler::CommentEvent event; |
248 INIContentHandler::CommentEvent event; |
64 event.lineNumber = lineNumber; |
249 event.lineNumber = lineNumber; |
65 event.eventNumber = ++eventNumber; |
250 event.eventNumber = ++eventNumber; |
66 event.comment = match[2]; |
251 event.comment = readUntil('\n', &found); |
67 for (INIContentHandler* handler : handlers) handler->comment(event); |
252 for (INIContentHandler* handler : handlers) handler->comment(event); |
68 } else if (std::regex_match(line, match, sectionPattrern)) { |
253 } else { |
69 if (inSection) for (INIContentHandler* handler : handlers) handler->endSection(); |
254 std::string fullKey = readToken('=', "e, &found); |
70 inSection = true; |
255 if (!found) throw std::logic_error(std::string("missing = after key: '") + fullKey + "'"); |
71 INIContentHandler::SectionStartEvent event; |
256 if (!quote) fullKey = trim(fullKey); |
72 event.lineNumber = lineNumber; |
257 readSpacesAndTabs(); |
73 event.eventNumber = ++eventNumber; |
258 |
74 event.name = match[1]; |
259 if (quote) { |
75 event.comment = match[6]; |
260 ch = get(); |
76 // event.tag = match[3]; |
261 if (ch == '=') readSpacesAndTabs(); |
77 // KDE uses some weird INI dialect that allows [section][x] syntax where „x“ is kind of „tag“ that signalizes some properties of given section. |
262 else throw std::logic_error(std::string("missing = after quoted key: '") + fullKey + "'"); |
78 // see <https://userbase.kde.org/KDE_System_Administration/Configuration_Files>, „[$i]“ means that the section is „locked“ |
263 } |
79 // We may emit this information somehow later, but for now, it is just ignored. |
264 |
80 for (INIContentHandler* handler : handlers) handler->startSection(event); |
265 std::string value = readToken('\n', "e, &found); |
81 } else if (std::regex_match(line, match, entryQuotedPattrern)) { |
266 if (!quote) value = trim(value); |
|
267 |
82 INIContentHandler::EntryEvent event; |
268 INIContentHandler::EntryEvent event; |
83 event.lineNumber = lineNumber; |
269 event.lineNumber = lineNumber; |
84 event.eventNumber = ++eventNumber; |
270 event.eventNumber = ++eventNumber; |
85 event.key = match[2]; |
271 event.key = fullKey; |
86 event.subKey = match[4]; |
272 event.fullKey = fullKey; |
87 event.fullKey = match[1]; |
273 event.value = value; |
88 event.value = match[6]; |
274 |
89 event.comment = match[10]; |
275 if (allowSubKeys) { |
90 |
276 std::smatch match; |
91 // the "/' at the end is missing → line continues |
277 if (std::regex_match(fullKey, match, std::regex("([^\\[]+)\\[([^\\[]+)\\]"))) { |
92 if (match.length(7) == 0) { |
278 event.key = match[1]; |
93 std::regex endPattern(std::string("(.*?)") + (match[5] == "'" ? "'" : "\"") + "\\s*((;|#)\\s*(.*))?"); |
279 event.subKey = match[2]; |
94 while (std::getline(input, line)) { |
280 event.fullKey = fullKey; |
95 lineNumber++; |
|
96 event.value += "\n"; |
|
97 if (std::regex_match(line, match, endPattern)) { |
|
98 event.value += std::string(match[1]); |
|
99 event.comment = match[4]; |
|
100 break; |
|
101 } else { |
|
102 event.value += line; |
|
103 } |
|
104 } |
281 } |
105 } |
282 } |
106 |
283 |
|
284 if (quote) { |
|
285 readSpacesAndTabs(); |
|
286 ch = peek(); |
|
287 if (isComment(ch)) { |
|
288 get(); |
|
289 readSpacesAndTabs(); |
|
290 event.comment = readUntil('\n', &found); |
|
291 } else if (ch == '\n') { |
|
292 get(); |
|
293 } else { |
|
294 throw std::logic_error(std::string("unexpected content after the quoted value: key='") + fullKey + "' value='" + event.value + "'"); |
|
295 } |
|
296 } |
|
297 |
107 for (INIContentHandler* handler : handlers) handler->entry(event); |
298 for (INIContentHandler* handler : handlers) handler->entry(event); |
108 } else if (std::regex_match(line, match, entryPlainPattrern)) { |
|
109 INIContentHandler::EntryEvent event; |
|
110 event.lineNumber = lineNumber; |
|
111 event.eventNumber = ++eventNumber; |
|
112 event.key = match[2]; |
|
113 event.subKey = match[4]; |
|
114 event.fullKey = match[1]; |
|
115 event.value = match[5]; |
|
116 |
|
117 // the \ at the end → line continues |
|
118 while (line.back() == '\\' && std::getline(input, line)) { |
|
119 lineNumber++; |
|
120 line = std::regex_replace(line, std::regex("^\\s+|\\s+$"), ""); // trim the spaces: continuing lines might be aligned to the first line (desired spaces – if any – should be at the line end before the \ character) |
|
121 event.value = event.value.substr(0, event.value.size() - 1); // cut the trailing \ backslash |
|
122 event.value = event.value + line; |
|
123 } |
|
124 |
|
125 for (INIContentHandler* handler : handlers) handler->entry(event); |
|
126 } else { |
|
127 // TODO: warning, error, or support unknown content |
|
128 } |
299 } |
129 |
300 } |
130 // General feautres: |
301 // TODO: error at the end, catch premature/unexpected EOF |
131 // TODO: probably switch to state-machine approach instead of regular expressions or use an existing library |
302 // TODO: unescape + trim values + ignore \r |
132 // TODO: warning/error handler |
303 // TODO: count lines |
133 // TODO: support also quoted or multiline keys? |
|
134 // TODO: support also escaped characters |
|
135 // TODO: support also Java .properties and manifest.mf formats? |
|
136 // TODO: support also quoted sections ["qoted section"] – useful for hierarchy (the path element may contain the separator character) |
|
137 // TODO: support also nested sections – hierarchy |
|
138 // TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections |
|
139 // TODO: support also option for alternative key-value separator (: instead of =) |
|
140 // TODO: support also other encodings (currently only UTF-8 is supported) |
|
141 |
|
142 // Lossless conversions: |
|
143 // TODO: emit also the quote style ('/"/) |
|
144 // TODO: emit also the comment style (;/#) ? |
|
145 // TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ? |
|
146 // TODO: emit also the line-end type (LF/CRLF) ? |
|
147 } |
|
148 |
|
149 if (inSection) for (INIContentHandler* handler : handlers) handler->endSection(); |
304 if (inSection) for (INIContentHandler* handler : handlers) handler->endSection(); |
150 |
|
151 for (INIContentHandler* handler : handlers) handler->endDocument(); |
305 for (INIContentHandler* handler : handlers) handler->endDocument(); |
152 } |
306 } |
|
307 |
|
308 // General feautres: |
|
309 // TODO: warning/error handler |
|
310 // TODO: support also escaped characters |
|
311 // TODO: support also Java .properties and manifest.mf formats? |
|
312 // TODO: support also nested sections – hierarchy |
|
313 // TODO: support also nested keys e.g. key.sub.subsub.subsubsub=value – translate them to nested sections |
|
314 // TODO: support also option for alternative key-value separator (: instead of =) |
|
315 // TODO: support also other encodings (currently only UTF-8 is supported) |
|
316 // TODO: better exceptions |
|
317 |
|
318 // Lossless conversions: |
|
319 // TODO: emit also the quote style ('/"/) |
|
320 // TODO: emit also the comment style (;/#) ? |
|
321 // TODO: emit also the whitespace before key name, around =, after "values"/'values', around [sections] ? |
|
322 // TODO: emit also the line-end type (LF/CRLF) ? |
|
323 |
153 }; |
324 }; |
154 |
325 |
155 INIReader* INIReader::create(std::istream& input) { |
326 INIReader* INIReader::create(std::istream& input) { |
156 return new INIReaderImpl(input); |
327 return new INIReaderImpl(input); |
157 } |
328 } |
|
329 |
|
330 } |
|
331 } |
|
332 } |
|
333 } |