I have a file that stores data in the format below:
TIME[04.26_12:30:30:853664]ID[ROLL:201987623]MARKS[PHY:100|MATH:200|CHEM:400]
TIME[03.27_12:29:30.553669
Let's try to sort the whole file in one pass:
import re
from itertools import groupby
from typing import Tuple
# Matches one record line.  Arbitrary text may precede ``TIME``; group 1
# captures the text inside ``TIME[...]`` and group 2 the text that follows
# ``ROLL:`` inside ``ID[...]``.  The trailing ``.+`` requires at least one
# more character after the ID section.
regex = re.compile(r"^.*TIME\[([^]]+)\]ID\[ROLL:([^]]+)\].+$")
def func1(arg) -> bool:
    """Return True when *arg* is a record line the module-level ``regex`` matches.

    ``regex.match`` yields a match object or ``None``; the result is
    converted explicitly so the function honours its declared ``bool``
    return type while remaining usable as a ``filter`` predicate.
    """
    return regex.match(arg) is not None
def func2(arg) -> Tuple[str, int]:
    """Build the ``(time, roll)`` sort key for a record line.

    Returns the text captured from ``TIME[...]`` and the ``ROLL`` value
    converted to ``int``.  Lines the regex does not match yield ``("", 0)``.

    Raises:
        ValueError: if the captured ROLL text is not a valid integer.
    """
    found = regex.match(arg)
    if not found:
        return "", 0
    timestamp, roll = found.groups()
    return timestamp, int(roll)
def func3(arg) -> int:
    """Return the ROLL number of a record line, or 0 when the line does not match.

    Raises:
        ValueError: if the captured ROLL text is not a valid integer.
    """
    found = regex.match(arg)
    return int(found.group(2)) if found else 0
# Split b.txt into one ROLL_<n> file per roll number, each ordered by time.
with open('b.txt') as fr:
    # Keep only well-formed records; sorted() drains the lazy filter while
    # the input file is still open.
    #
    # Sort by roll number FIRST, then by func2's (time, roll) key.
    # itertools.groupby only merges *adjacent* items, so sorting by time
    # first would split one roll into several groups, and every re-open of
    # its output file in "w" mode would discard the lines already written.
    collection = sorted(
        filter(func1, fr),
        key=lambda line: (func3(line), func2(line)),
    )

for key, group in groupby(collection, key=func3):
    with open(f"ROLL_{key}", mode="w") as fw:
        fw.writelines(group)
And if you want to read the file in chunks, use this:
import re
from functools import partial
from itertools import groupby
from typing import Tuple
# Matches one record line.  Arbitrary text may precede ``TIME``; group 1
# captures the text inside ``TIME[...]`` and group 2 the text that follows
# ``ROLL:`` inside ``ID[...]``.  The trailing ``.+`` requires at least one
# more character after the ID section.
regex = re.compile(r"^.*TIME\[([^]]+)\]ID\[ROLL:([^]]+)\].+$")
def func1(arg) -> bool:
    """Return True when *arg* is a record line the module-level ``regex`` matches.

    ``regex.match`` yields a match object or ``None``; the result is
    converted explicitly so the function honours its declared ``bool``
    return type while remaining usable as a ``filter`` predicate.
    """
    return regex.match(arg) is not None
def func2(arg) -> Tuple[str, int]:
    """Build the ``(time, roll)`` sort key for a record line.

    Returns the text captured from ``TIME[...]`` and the ``ROLL`` value
    converted to ``int``.  Lines the regex does not match yield ``("", 0)``.

    Raises:
        ValueError: if the captured ROLL text is not a valid integer.
    """
    found = regex.match(arg)
    if not found:
        return "", 0
    timestamp, roll = found.groups()
    return timestamp, int(roll)
def func3(arg) -> int:
    """Return the ROLL number of a record line, or 0 when the line does not match.

    Raises:
        ValueError: if the captured ROLL text is not a valid integer.
    """
    found = regex.match(arg)
    return int(found.group(2)) if found else 0
def read_in_chunks(file_object, chunk_size=1024*1024):
    """Lazily yield successive pieces of *file_object*.

    Each piece is whatever ``file_object.read(chunk_size)`` returns; the
    generator finishes as soon as a read returns an empty (falsy) value.
    """
    piece = file_object.read(chunk_size)
    while piece:
        yield piece
        piece = file_object.read(chunk_size)
def _append_sorted_by_roll(lines):
    """Append the well-formed records in *lines* to their ``ROLL_<n>`` files.

    Records are sorted by roll number first and then by func2's
    (time, roll) key: itertools.groupby only merges adjacent items, so
    this ordering yields exactly one group -- and one file open -- per roll.
    """
    records = sorted(
        filter(func1, lines),
        key=lambda line: (func3(line), func2(line)),
    )
    for key, group in groupby(records, key=func3):
        # "a" appends across chunks ("wa" is not a valid mode and raises
        # ValueError).  Output files left over from an earlier run are
        # appended to as well, so remove them before re-running.
        with open(f"ROLL_{key}", mode="a") as fw:
            fw.writelines(group)


# Split b.txt into ROLL_<n> files while holding only one chunk in memory.
# Records are time-ordered within each chunk, not across the whole file.
with open('b.txt') as fr:
    leftover = ""  # partial record cut off at the end of the previous chunk
    for chunk in read_in_chunks(fr):
        # A fixed-size read can end mid-record: process only the part up to
        # the last newline and carry the remainder into the next chunk.
        complete, newline, leftover = (leftover + chunk).rpartition("\n")
        # keepends=True preserves the line terminators so that writelines()
        # does not glue consecutive records together.
        _append_sorted_by_roll((complete + newline).splitlines(keepends=True))
    if leftover:
        # Final record of a file that does not end with a newline.
        _append_sorted_by_roll([leftover])