#include "stdafx.h"
#include "DatasetImport.h"
#include <iostream>
#include <fstream>
#include <regex>

using namespace std;

// Splits `str` into tokens and appends them to `*v`.
//
// Every character contained in `delim` is treated as an individual separator.
// Runs of consecutive separators never produce empty tokens, and a final token
// that is not followed by a separator is still emitted.
//
// v:     destination vector; tokens are appended after any existing elements.
//        Must not be null (it is dereferenced unconditionally).
// str:   text to tokenize.
// delim: set of separator characters; an empty set yields `str` as one token.
void split(std::vector<std::string> *v, std::string str, std::string delim)
{
	std::string word;
	// Range-for instead of an `int` index: no signed/unsigned comparison
	// against size() and no overflow on very long strings.
	for (const char ch : str)
	{
		if (delim.find(ch) == std::string::npos)
		{
			word += ch;
		}
		else if (!word.empty())
		{
			// A separator ends the current token; clear() keeps the buffer for reuse.
			v->push_back(word);
			word.clear();
		}
	}
	// Flush the token that runs up to the end of the input.
	if (!word.empty())
		v->push_back(word);
}

// Reads the file `name` line by line and tokenizes every line with split(),
// using each character of " .,<>@-=():_';\"" as a separator.
//
// name: path of the dataset file to read.
// Returns a heap-allocated vector holding all tokens in file order. The caller
// owns the vector and must `delete` it. If the file cannot be opened, the
// returned vector is empty (never null).
std::vector<std::string> *parseDataset(char* name)
{
	// ifstream, not fstream: fstream's default mode is in|out, which fails to
	// open read-only files even though this function only reads.
	std::ifstream in(name);

	// Collect into a local vector first so nothing leaks if tokenizing throws.
	std::vector<std::string> tokens;
	std::string line;
	// getline() fails immediately on a stream that did not open, so no
	// separate is_open() check is needed.
	while (std::getline(in, line))
		split(&tokens, line, " .,<>@-=():_';\"");

	std::vector<std::string> *result = new std::vector<std::string>;
	result->swap(tokens);
	return result; // `in` is closed by its destructor
}

// Regex-based tokenizer: reads the file `name` line by line and extracts every
// maximal run of ASCII letters and digits as one token.
//
// NOTE: the original author observed this variant to run extremely slowly.
//
// name: path of the dataset file to read.
// Returns a heap-allocated vector holding all tokens in file order. The caller
// owns the vector and must `delete` it. If the file cannot be opened, the
// returned vector is empty (never null).
std::vector<std::string> *parseDatasetRX(char* name)
{
	// ifstream, not fstream: fstream's default mode is in|out, which fails to
	// open read-only files even though this function only reads.
	std::ifstream in(name);

	// One character class matches the same text as "(([0-9]|[A-Z]|[a-z])+)"
	// without per-character alternation or capture-group bookkeeping.
	const std::regex reg("[0-9A-Za-z]+", std::regex::ECMAScript | std::regex::optimize);
	const std::sregex_iterator end;

	// Collect into a local vector first so nothing leaks if matching throws.
	std::vector<std::string> tokens;
	std::string line;
	// getline() fails immediately on a stream that did not open, so no
	// separate is_open() check is needed.
	while (std::getline(in, line))
		for (std::sregex_iterator it(line.cbegin(), line.cend(), reg); it != end; ++it)
			tokens.push_back(it->str());

	std::vector<std::string> *result = new std::vector<std::string>;
	result->swap(tokens);
	return result; // `in` is closed by its destructor
}

