I am parsing a string in C++ using the following:
using namespace std;
string parsed,input=\"text to be parsed\";
stringstream input_stringstream(input);
i
An answer already exists, but the selected answer uses the erase function, which is very costly: think of a very big string (several MB). Therefore I use the function below, which never modifies the input string.
/**
 * Splits i_str into the pieces separated by i_delim without modifying i_str.
 *
 * Empty tokens before a leading delimiter and between adjacent delimiters are
 * kept. An empty token after a trailing delimiter is not produced, and an
 * empty i_str yields an empty vector.
 *
 * An empty i_delim is treated as "no delimiter present": the whole of i_str is
 * returned as a single token (or nothing when i_str is empty).
 *
 * @param i_str   text to split
 * @param i_delim delimiter, matched as a whole string
 * @return the tokens in order of appearance
 */
std::vector<std::string> split(const std::string& i_str, const std::string& i_delim)
{
    std::vector<std::string> result;
    std::string::size_type startIndex = 0;

    // find("") matches at the search position itself, so startIndex would never
    // advance and the loop would append empty tokens forever.
    if (!i_delim.empty())
    {
        std::string::size_type found = i_str.find(i_delim);
        while (found != std::string::npos)
        {
            result.push_back(i_str.substr(startIndex, found - startIndex));
            startIndex = found + i_delim.size();
            found = i_str.find(i_delim, startIndex);
        }
    }

    // Whatever follows the last delimiter is the final token; nothing is
    // appended when the input ends exactly on a delimiter.
    if (startIndex != i_str.size())
        result.push_back(i_str.substr(startIndex));
    return result;
}
This method uses std::string::find
without mutating the original string by remembering the beginning and end of the previous substring token.
#include <iostream>
#include <string>
/**
 * Demo: prints the tokens of "scott>=tiger" split on ">=", one per line
 * (the last token is written without a trailing newline).
 */
int main()
{
    const std::string s = "scott>=tiger";
    const std::string delim = ">=";

    // Positions use the string's own size_type; an `unsigned int` start would
    // silently narrow the size_t values assigned to it below.
    std::string::size_type start = 0;
    std::string::size_type end = s.find(delim);
    while (end != std::string::npos)
    {
        std::cout << s.substr(start, end - start) << std::endl;
        start = end + delim.length();
        end = s.find(delim, start);
    }

    // No delimiter is left, so everything from start onwards is the last token.
    std::cout << s.substr(start);
}
Here's my take on this. It handles the edge cases and takes an optional parameter to remove empty entries from the results.
/**
 * Tells whether s finishes with suffix.
 *
 * @param s      string to inspect
 * @param suffix expected ending; an empty suffix always matches
 * @return true when the last suffix.size() characters of s equal suffix
 */
bool endsWith(const std::string& s, const std::string& suffix)
{
    // A suffix longer than the string can never match.
    if (suffix.size() > s.size())
    {
        return false;
    }

    const std::string::size_type tailStart = s.size() - suffix.size();
    return s.compare(tailStart, std::string::npos, suffix) == 0;
}
/**
 * Splits s on every occurrence of delimiter.
 *
 * Delimiters are matched left to right without overlapping. Empty tokens
 * (before a leading delimiter, between adjacent delimiters, after a trailing
 * delimiter, or for an empty s) are kept unless removeEmptyEntries is true.
 *
 * An empty delimiter is treated as "never found": s is returned as one token.
 *
 * @param s                  text to split
 * @param delimiter          delimiter, matched as a whole string
 * @param removeEmptyEntries when true, empty tokens are left out of the result
 * @return the tokens in order of appearance
 */
std::vector<std::string> split(const std::string& s, const std::string& delimiter, const bool& removeEmptyEntries = false)
{
    std::vector<std::string> tokens;
    std::string::size_type start = 0;

    while (true)
    {
        // An empty delimiter would match at `start` forever, so it never matches.
        const std::string::size_type position =
            delimiter.empty() ? std::string::npos : s.find(delimiter, start);
        const std::string::size_type end =
            position != std::string::npos ? position : s.length();

        std::string token = s.substr(start, end - start);
        if (!removeEmptyEntries || !token.empty())
        {
            tokens.push_back(token);
        }

        // The text after the last delimiter (possibly empty) has just been
        // emitted. Stopping here, rather than testing the string's suffix
        // afterwards, avoids a spurious extra "" when the ending merely looks
        // like a delimiter, e.g. split("aaa", "aa") is ["", "a"].
        if (position == std::string::npos)
        {
            break;
        }
        start = position + delimiter.length();
    }

    return tokens;
}
Examples
// Each trailing comment shows the result as [token count](tokens...);
// the optional third argument is removeEmptyEntries.
split("a-b-c", "-"); // [3]("a","b","c")
split("a--c", "-"); // [3]("a","","c")
split("-b-", "-"); // [3]("","b","")
split("--c--", "-"); // [5]("","","c","","")
split("--c--", "-", true); // [1]("c")
split("a", "-"); // [1]("a")
split("", "-"); // [1]("")
split("", "-", true); // [0]()
Function:
/**
 * Splits sWhat on every occurrence of sDelim.
 *
 * Empty tokens are kept, including the one after a trailing delimiter.
 * An empty sWhat yields an empty vector. An empty sDelim is treated as
 * "no delimiter": a non-empty sWhat is returned as a single token.
 *
 * @param sWhat  text to split
 * @param sDelim delimiter, matched as a whole string
 * @return the tokens in order of appearance
 */
std::vector<std::string> WSJCppCore::split(const std::string& sWhat, const std::string& sDelim) {
    std::vector<std::string> vRet;
    const size_t nLen = sWhat.length();
    const size_t nDelimLen = sDelim.length();

    if (nLen == 0) {
        return vRet;
    }
    if (nDelimLen == 0) {
        // find("") matches at the search position itself, so the scan below
        // would never advance and would append empty tokens forever.
        vRet.push_back(sWhat);
        return vRet;
    }

    size_t nPos = 0;
    while (true) {
        const std::size_t nFoundPos = sWhat.find(sDelim, nPos);
        if (nFoundPos == std::string::npos) {
            // Remainder after the last delimiter; it is "" when the input
            // ends exactly on a delimiter.
            vRet.push_back(sWhat.substr(nPos));
            break;
        }
        vRet.push_back(sWhat.substr(nPos, nFoundPos - nPos));
        nPos = nFoundPos + nDelimLen;
    }
    return vRet;
}
Unit-tests:
/**
 * Table-driven test for WSJCppCore::split.
 *
 * For every case the size of the produced vector is compared with the expected
 * one, then the elements are compared pairwise up to the shorter length.
 *
 * @return the value of bTestSuccess after all comparisons.
 *         NOTE(review): compareN/compareS are defined elsewhere and presumably
 *         clear the flag on a mismatch - confirm against their definitions.
 */
bool UnitTestSplit::run() {
    bool bTestSuccess = true;

    // One case: the input, the delimiter and the tokens expected back.
    struct LTest {
        LTest(
            const std::string &sStr,
            const std::string &sDelim,
            const std::vector<std::string> &vExpectedVector
        ) : sStr(sStr), sDelim(sDelim), vExpectedVector(vExpectedVector) {
        }
        std::string sStr;
        std::string sDelim;
        std::vector<std::string> vExpectedVector;
    };

    std::vector<LTest> tests;
    tests.push_back(LTest("1 2 3 4 5", " ", {"1", "2", "3", "4", "5"}));
    tests.push_back(LTest("|1f|2п|3%^|44354|5kdasjfdre|2", "|", {"", "1f", "2п", "3%^", "44354", "5kdasjfdre", "2"}));
    tests.push_back(LTest("|1f|2п|3%^|44354|5kdasjfdre|", "|", {"", "1f", "2п", "3%^", "44354", "5kdasjfdre", ""}));
    tests.push_back(LTest("some1 => some2 => some3", "=>", {"some1 ", " some2 ", " some3"}));
    tests.push_back(LTest("some1 => some2 => some3 =>", "=>", {"some1 ", " some2 ", " some3 ", ""}));

    // size_t indices: avoids signed/unsigned comparison against size().
    for (size_t i = 0; i < tests.size(); i++) {
        const LTest &test = tests[i];
        const std::string sPrefix = "test" + std::to_string(i) + "(\"" + test.sStr + "\")";
        const std::vector<std::string> vSplitted = WSJCppCore::split(test.sStr, test.sDelim);
        compareN(bTestSuccess, sPrefix + ": size", vSplitted.size(), test.vExpectedVector.size());

        // Only indices valid in both vectors are compared; a length mismatch
        // has already been handed to compareN above.
        const size_t nMin = std::min(vSplitted.size(), test.vExpectedVector.size());
        for (size_t n = 0; n < nMin; n++) {
            compareS(bTestSuccess, sPrefix + ", element: " + std::to_string(n), vSplitted[n], test.vExpectedVector[n]);
        }
    }
    return bTestSuccess;
}
You can also use regex for this:
/**
 * Splits str at every match of the regular expression regex_str.
 *
 * @param str       text to split
 * @param regex_str delimiter, interpreted as a regular expression
 * @return the pieces of str found between the delimiter matches
 */
std::vector<std::string> split(const std::string str, const std::string regex_str)
{
    const std::regex separator(regex_str);

    // Sub-match index -1 selects the text between matches rather than the
    // matches themselves.
    const std::sregex_token_iterator first(str.begin(), str.end(), separator, -1);
    const std::sregex_token_iterator last;

    return std::vector<std::string>(first, last);
}
which is equivalent to :
/**
 * Splits str at every match of the regular expression regex_str, collecting
 * the pieces one at a time.
 *
 * @param str       text to split
 * @param regex_str delimiter, interpreted as a regular expression
 * @return the pieces of str found between the delimiter matches
 */
std::vector<std::string> split(const std::string str, const std::string regex_str)
{
    // The regex is built from the parameter and kept in a named object that
    // stays alive for as long as the token iterator below is in use.
    const std::regex regexz(regex_str);

    std::vector<std::string> list;
    const std::sregex_token_iterator end;
    // Sub-match index -1 yields the text between matches.
    for (std::sregex_token_iterator token_iter(str.begin(), str.end(), regexz, -1);
         token_iter != end; ++token_iter)
    {
        list.push_back(token_iter->str());
    }
    return list;
}
and use it like this :
#include <iostream>
#include <string>
#include <regex>
/**
 * Splits str at every match of the regular expression regex_str.
 *
 * @param str       text to split
 * @param regex_str delimiter, interpreted as a regular expression
 * @return the pieces of str found between the delimiter matches
 */
std::vector<std::string> split(const std::string str, const std::string regex_str)
{ // still concise, but with a named regex
    // std::sregex_token_iterator cannot be constructed from a temporary
    // std::regex (that constructor overload is deleted), so the pattern gets a
    // named object that outlives both iterators.
    const std::regex delimiter(regex_str);
    return std::vector<std::string>(
        std::sregex_token_iterator(str.begin(), str.end(), delimiter, -1),
        std::sregex_token_iterator());
}
/**
 * Demo: splits "lets split this" on a single space and prints one token per line.
 */
int main()
{
    const std::string text = "lets split this";
    const std::string pattern = " ";

    auto pieces = split(text, pattern);
    for (auto it = pieces.begin(); it != pieces.end(); ++it)
    {
        std::cout << *it << std::endl;
    }
}
play with it online! http://cpp.sh/9sumb
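You can simply use substrings, characters, etc. as the delimiter like normal, or use actual regular expressions to do the splitting. Note that the delimiter is always interpreted as a regular expression, so characters such as `.`, `|` or `+` must be escaped to be matched literally.
It's also concise and C++11!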
A very simple/naive approach:
// Splits s at every single space character.
// Consecutive, leading and trailing spaces produce empty words, and the
// result always holds at least one entry (an empty s yields one empty word).
vector<string> words_seperate(string s){
    vector<string> words;
    string current;
    for (string::size_type pos = 0; pos < s.size(); ++pos) {
        const char ch = s[pos];
        if (ch != ' ') {
            current.push_back(ch);
            continue;
        }
        // A space closes the word collected so far.
        words.push_back(current);
        current.clear();
    }
    // The text after the last space (possibly empty) is the final word.
    words.push_back(current);
    return words;
}
Or you can use boost library split function:
// Splits `input` (defined elsewhere) at every tab character and stores the
// pieces in `result`. Requires the Boost String Algorithms library
// (<boost/algorithm/string.hpp>).
vector<string> result;
boost::split(result, input, boost::is_any_of("\t"));
Or You can try TOKEN or strtok:
// strtok() tokenizes in place: it overwrites the delimiters it finds with
// '\0', so the input has to be a writable char array, not a string literal.
char str[] = "DELIMIT-ME-C++";
char *token = strtok(str, "-");
while (token != nullptr)
{
    // The tokens are printed back to back, with nothing between them.
    cout<<token;
    // A null first argument makes strtok() continue in the same buffer.
    token = strtok(nullptr, "-");
}
Or You can do this:
// Splits our_string (defined elsewhere) on a single delimiter character by
// reading it through a stringstream.
char split_with=' ';
vector<string> words;
string token;
stringstream ss(our_string);
// Each getline() call reads up to the next split_with character into token;
// the loop stops once the stream has nothing left to read.
while(getline(ss , token , split_with)) words.push_back(token);