A lot of questions have been already asked about this topic on SO.
(and many others).
Among the numerous answers, none of them was really helpful to me so far. If I missed one, please point me to it.
I have a different take on the solution. Let pandas take care of creating the table and deleting None values and let us take care of writing a proper tokenizer.
def tokenize(line):
    """Split a CSV line on commas, keeping quoted fields (quotes included) intact.

    Double-quote characters are paired up left to right; commas that fall
    inside a paired-quote span are not split on.  A dangling, unpaired final
    quote is ignored, so text after it is split normally (matches the last
    documented example).

    Fixes over the placeholder-substitution version:
    - no longer shadows the builtin ``str``;
    - works for any number of quoted fields (the old ``_...i`` placeholder
      only preserved string length for i < 10, corrupting the precomputed
      quote positions from the 7th quoted field onward);
    - input text that happens to look like a placeholder (e.g. ``____0``)
      can no longer be mis-mapped back to a quoted value, because the input
      string is never mutated.

    :param line: one raw CSV line (without the trailing newline).
    :return: list of field strings; quoted fields keep their quotes.
    """
    quote_at = [pos for pos, ch in enumerate(line) if ch == '"']
    if len(quote_at) % 2 != 0:
        # Unpaired trailing quote: treat it as ordinary text.
        quote_at = quote_at[:-1]
    # Inclusive (open, close) index pairs of each quoted span.
    spans = list(zip(quote_at[0::2], quote_at[1::2]))

    tokens = []
    start = 0
    for pos, ch in enumerate(line):
        # Split only on commas that are outside every quoted span.
        if ch == ',' and not any(lo <= pos <= hi for lo, hi in spans):
            tokens.append(line[start:pos])
            start = pos + 1
    tokens.append(line[start:])  # final field (possibly empty)
    return tokens
# Demonstrate the tokenizer on the documented sample lines.
samples = [
    "1,2,3,4,5",
    ",,3,\"Hello, World!\",5,6",
    ",,3,\"Hello,,,, World!\",5,6",
    ",,3,\"Hello, World!\",5,6,,3,\"Hello, World!\",5,6",
    ",,3,\"Hello, World!\",5,6,,3,\"Hello,,5,6",
]
for sample in samples:
    print(tokenize(sample))
Output
['1', '2', '3', '4', '5']
['', '', '3', '"Hello, World!"', '5', '6']
['', '', '3', '"Hello,,,, World!"', '5', '6']
['', '', '3', '"Hello, World!"', '5', '6', '', '3', '"Hello, World!"', '5', '6']
['', '', '3', '"Hello, World!"', '5', '6', '', '3', '"Hello', '', '5', '6']
# Tokenize every line of the CSV and let pandas build the table;
# missing trailing cells (NaN) become empty strings.
with open("test1.csv", "r") as fp:
    rows = [tokenize(raw.strip()) for raw in fp]
df = pd.DataFrame(rows).replace(np.nan, '')
Now we can tweak the tokenizer function to suit our needs.