src/CSVCommand.cpp
branch v_0
changeset 20 90ae67de2f68
parent 16 15ee963675af
child 21 22eb4838e8d0
equal deleted inserted replaced
19:0d858e0eedf8 20:90ae67de2f68
    15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
    15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
    16  */
    16  */
    17 #include <cstdlib>
    17 #include <cstdlib>
    18 #include <vector>
    18 #include <vector>
    19 #include <memory>
    19 #include <memory>
       
    20 #include <locale>
    20 #include <regex>
    21 #include <regex>
    21 #include <algorithm>
    22 #include <algorithm>
    22 #include <unistd.h>
    23 #include <unistd.h>
    23 
    24 
    24 #include <relpipe/writer/RelationalWriter.h>
    25 #include <relpipe/writer/RelationalWriter.h>
    85 		}
    86 		}
    86 	}
    87 	}
    87 	return false;
    88 	return false;
    88 }
    89 }
    89 
    90 
       
    91 /**
       
    92  * Data types might be encoded in the attribute names: name::type e.g. some_attribute::integer
       
    93  * 
       
    94  * TODO: share this code through relpipe-lib-infertypes (when available)
       
    95  */
       
    96 void tryParseTypes(vector<AttributeMetadata>& metadata, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
       
    97 	std::wregex pattern(L"(.*)::(.*)");
       
    98 	std::wsmatch match;
       
    99 
       
   100 	if (configuration.readTypes == Configuration::ReadTypes::AUTO || configuration.readTypes == Configuration::ReadTypes::TRUE) {
       
   101 		bool hasTypes = true;
       
   102 		std::vector<TypeId> types;
       
   103 		std::vector<string_t> names;
       
   104 		for (AttributeMetadata& am : metadata) {
       
   105 			if (std::regex_match(am.attributeName, match, pattern)) {
       
   106 				names.push_back(match[1]);
       
   107 				if (configuration.readTypes == Configuration::ReadTypes::TRUE) {
       
   108 					types.push_back(writer->toTypeId(match[2])); // must be valid type name otherwise exception is thrown
       
   109 				} else {
       
   110 					try {
       
   111 						types.push_back(writer->toTypeId(match[2]));
       
   112 					} catch (...) {
       
   113 						hasTypes = false; // ignore exception and keep original names and default type (string)
       
   114 					}
       
   115 				}
       
   116 			} else {
       
   117 				hasTypes = false;
       
   118 			}
       
   119 		}
       
   120 
       
   121 		if (hasTypes) {
       
   122 			for (int i = 0, count = metadata.size(); i < count; i++) {
       
   123 				metadata[i].attributeName = names[i];
       
   124 				metadata[i].typeId = types[i];
       
   125 			}
       
   126 		} else if (configuration.readTypes == Configuration::ReadTypes::TRUE) {
       
   127 			throw RelpipeWriterException(L"Types were expected in the CSV header, but not found.");
       
   128 		}
       
   129 	}
       
   130 }
       
   131 
    90 void CSVCommand::process(std::istream& input, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
   132 void CSVCommand::process(std::istream& input, std::shared_ptr<writer::RelationalWriter> writer, Configuration& configuration) {
    91 	wstring_convert < codecvt_utf8<wchar_t>> convertor; // UTF-8 is required for CSV
   133 	wstring_convert < codecvt_utf8<wchar_t>> convertor; // UTF-8 is required for CSV
    92 	vector<AttributeMetadata> metadata;
   134 	vector<AttributeMetadata> metadata;
    93 	bool headerDone = false;
   135 	bool headerDone = false;
    94 	bool lastInRecord = false;
   136 	bool lastInRecord = false;
   103 			am.attributeName = convertor.from_bytes(currentValue.str());
   145 			am.attributeName = convertor.from_bytes(currentValue.str());
   104 			am.typeId = TypeId::STRING;
   146 			am.typeId = TypeId::STRING;
   105 			metadata.push_back(am);
   147 			metadata.push_back(am);
   106 			if (lastInRecord) {
   148 			if (lastInRecord) {
   107 
   149 
   108 				// TODO: allow types on CLI and names from CSV?
       
   109 				// TODO: allow types on the second line of the CSV?
       
   110 				// TODO: allow regex pattern+replacement for extracting name and type from the first line of the CSV?
       
   111 				// TODO: allow attribute filtering, subset, like relpipe-tr-cur?
       
   112 				// TODO: allow skipping lines, like tail -n +2 ?
       
   113 				
       
   114 				vector<string_t> firstLine;
   150 				vector<string_t> firstLine;
   115 
   151 
   116 				if (metadata.size() == configuration.attributes.size()) {
   152 				if (metadata.size() == configuration.attributes.size()) {
   117 					for (int i = 0; i < metadata.size(); i++) {
   153 					for (int i = 0; i < metadata.size(); i++) {
   118 						firstLine.push_back(metadata[i].attributeName);
   154 						firstLine.push_back(metadata[i].attributeName);
   119 						metadata[i].attributeName = configuration.attributes[i].name;
   155 						metadata[i].attributeName = configuration.attributes[i].name;
   120 						metadata[i].typeId = configuration.attributes[i].type;
   156 						metadata[i].typeId = configuration.attributes[i].type;
   121 					}
   157 					}
   122 				} else if (configuration.attributes.size() == 0) {
   158 				} else if (configuration.attributes.size() == 0) {
   123 					// first line contains attribute names and type is always string
   159 					// first line contains attribute names and maybe also types
       
   160 					tryParseTypes(metadata, writer, configuration);
   124 				} else {
   161 				} else {
   125 					throw RelpipeWriterException(L"Declared attribute count (" + std::to_wstring(configuration.attributes.size()) + L") does not match with number of columns of the first line (" + std::to_wstring(metadata.size()) + L")");
   162 					throw RelpipeWriterException(L"Declared attribute count (" + std::to_wstring(configuration.attributes.size()) + L") does not match with number of columns of the first line (" + std::to_wstring(metadata.size()) + L")");
   126 				}
   163 				}
   127 
   164 
   128 				headerDone = true;
   165 				headerDone = true;