I have a txt file which has the following structure:
start
id=1
date=21.05.2018
summ=500
end
start
id=7
date=23.05.2018
summ=500
owner=guest
end
You can simply parse the text file, provided you keep some context: start a new dictionary on each `start` line and add it to the list on each `end` line.
Code could be:
def parse(fd):
    """Parse ``start``/``end`` delimited records into a list of dicts.

    Each record opens with a ``start`` line, closes with an ``end`` line and
    holds one ``key=value`` pair per line in between. Surrounding whitespace
    on every line is ignored and blank lines are skipped.

    Args:
        fd: An iterable of text lines, typically an open file object.

    Returns:
        A list with one dict (string keys, string values) per completed
        record, in input order. A record still open at the end of the input
        (no closing ``end``) is not included, and an ``end`` with no matching
        ``start`` is ignored.

    Raises:
        ValueError: If a data line appears outside a ``start``/``end`` block,
            or if a data line inside a block contains no ``=``.
    """
    resul = []  # the list of dictionaries to return
    d = None  # record being filled; None while outside a start/end block
    for linenum, line in enumerate(fd, 1):
        line = line.strip()
        if not line:
            continue
        # Compare whole lines rather than prefixes, so that keys such as
        # "startdate" or "endpoint" are not mistaken for block delimiters.
        if line == 'end':
            if d is not None:
                resul.append(d)
            d = None
        elif line == 'start':
            d = {}
        else:
            if d is None:
                raise ValueError(
                    f"line {linenum}: {line!r} is outside a start/end block"
                )
            # maxsplit=1 keeps any further '=' characters inside the value;
            # a line with no '=' fails the unpacking with ValueError.
            key, val = line.split('=', 1)
            d[key] = val
    return resul
Syntax errors in the file (missing `start` or `end` lines, other incorrect lines) are not handled here: an incorrect line (one with no `=` sign) should cause an exception `ValueError: not enough values to unpack (expected 2, got 1)`.
Using Regex.
import re
# Read the whole file so the regex can match across line boundaries.
with open(filename, "r") as infile:
    data = infile.read()

# Capture the text between each "start" ... "end" pair; DOTALL lets "." span
# newlines and the lazy ".*?" stops at the nearest "end".
data = re.findall("(?<=\\bstart\\b).*?(?=\\bend\\b)", data, flags=re.DOTALL) #Find the required data from text

r = []
for record in data:
    d = {}
    # filter(None, ...) drops the empty strings produced by the newlines that
    # surround each record.
    for entry in filter(None, record.split("\n")):
        # Split on the first "=" only, so a value that itself contains "="
        # is kept whole instead of being cut at the second "=".
        pair = entry.split("=", 1) #Split by "=" to form key-value pair
        d[pair[0]] = pair[1]
    r.append(d) #Append to list
print(r)
Output:
[{'date': '21.05.2018', 'summ': '500', 'id': '1'}, {'date': '23.05.2018', 'owner': 'guest', 'summ': '500', 'id': '7'}]
Easiest algorithm I could think of, if I get your question right.
# NOTE: plines must hold lines without trailing newlines or padding, because
# they are compared to 'start' and 'end' exactly.
arr = []
d = {}
for line in plines:
    if line == 'start':
        # Give every record its own dict. Reusing a single dict for the whole
        # input makes each entry of arr alias the same object, so every
        # record ends up showing the values of the last one.
        d = {}
    elif line == 'end':
        arr.append(d)
    else:
        # Split on the first '=' only so values containing '=' stay intact.
        list_key_value = line.split('=', 1)
        # Values stay strings: str.split() only ever yields str, so there is
        # no numeric type to detect here. Convert explicitly if numbers are
        # needed.
        d[list_key_value[0]] = list_key_value[1]
print(arr)
Output:
[{'id': '7', 'date': '23.05.2018', 'summ': '500', 'owner': 'guest'},
{'id': '7', 'date': '23.05.2018', 'summ': '500', 'owner': 'guest'}]
You could also try something like this:
from itertools import takewhile
with open('data.txt') as in_file:
    # Split on line boundaries rather than on arbitrary whitespace, so a value
    # that contains spaces stays in one item; blank lines are dropped.
    items = [item for item in map(str.strip, in_file) if item]
    # ['start', 'id=1', 'date=21.05.2018', 'summ=500', 'end', 'start', 'id=7', 'date=23.05.2018', 'summ=500', 'owner=guest', 'end']

    # Index of every 'start' marker.
    pos = [i for i, item in enumerate(items) if item == 'start']
    # [0, 5]

    # For each marker, take the following items up to (not including) 'end'.
    blocks = [list(takewhile(lambda x: x != 'end', items[i+1:])) for i in pos]
    # [['id=1', 'date=21.05.2018', 'summ=500'], ['id=7', 'date=23.05.2018', 'summ=500', 'owner=guest']]

    # maxsplit=1 keeps values that contain '=' as a single value.
    print([dict(x.split('=', 1) for x in block) for block in blocks])
Which Outputs:
[{'id': '1', 'date': '21.05.2018', 'summ': '500'}, {'id': '7', 'date': '23.05.2018', 'summ': '500', 'owner': 'guest'}]
You can build a simple parser with recursion that attempts to find data between `start` and `end` blocks:
import re
class Parser:
    """Collect ``start``/``end`` delimited ``key = value`` records from text.

    Parsing happens on construction. Afterwards ``results`` holds one dict
    (string keys, string values) per record, in input order. Lines outside
    a ``start``/``end`` block are ignored.

    Raises:
        ValueError: If a ``start`` line has no matching ``end`` line, or if a
            line inside a block contains no ``=``.
    """

    def __init__(self, source: str):
        # Blank lines are dropped up front. A single iterator is kept so the
        # outer and inner loops in parse() consume the same stream of lines.
        self.source = iter(filter(None, source.split('\n')))
        self.results = []
        self.parse()

    @staticmethod
    def to_dict(between_blocks):
        """Turn an iterable of ``key = value`` lines into a dict.

        Whitespace around the first ``=`` is discarded; maxsplit=1 keeps any
        later ``=`` characters as part of the value.
        """
        return dict(re.split(r'\s*=\s*', i, maxsplit=1) for i in between_blocks)

    def parse(self):
        """Consume ``self.source`` and append one dict per block to ``results``.

        Implemented with loops rather than recursion so that large inputs
        cannot exhaust the interpreter's recursion limit.
        """
        for _line in self.source:
            if _line != 'start':
                continue
            scope = []
            for _temp in self.source:
                if _temp == 'end':
                    break
                scope.append(_temp)
            else:
                # The input ran out before the closing 'end' was seen.
                raise ValueError("Missing 'end' tag")
            self.results.append(Parser.to_dict(scope))

    def __repr__(self):
        return f'{type(self).__name__}({self.results})'
# Read the file inside a context manager so the handle is closed promptly,
# then print the parsed records.
with open('filename.txt') as source_file:
    print(Parser(source_file.read()).results)
Output:
[{'id': '1', 'date': '21.05.2018', 'summ': '500'}, {'id': '7', 'date': '23.05.2018', 'summ': '500', 'owner': 'guest'}]
Tests:
# Each case pairs an input text with the outcome expected from Parser:
# Exception when construction must fail, None when it must succeed.
tests = [
    [
        """
start
id=1
date=21.05.2018
summ=500
""",
        Exception,
    ],
    [
        """
start
name = someone
age = 18
id = 23
end
start
name = someoneelse
age = 45
id = 55
end
start
name = lastname
age = 34
id = 5
end
""",
        None,
    ],
]
for text, is_error in tests:
    try:
        _ = Parser(text)
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt and SystemExit are not counted as a parse failure.
        assert is_error is Exception
    else:
        assert is_error is None
print('all tests passed')
Output:
all tests passed