Python MemoryError when reading large files: need ideas for applying multiprocessing in the case below

前端 未结 2 630
伪装坚强ぢ
伪装坚强ぢ 2021-01-26 12:47

I have a file that stores data in the following format:

TIME[04.26_12:30:30:853664]ID[ROLL:201987623]MARKS[PHY:100|MATH:200|CHEM:400]
TIME[03.27_12:29:30.553669         


        
相关标签:
2条回答
  • 2021-01-26 13:00

    Let's try sorting everything in a single pass.

    import re
    from itertools import groupby
    from typing import Tuple
    
    # Group 1 is the text inside TIME[...], group 2 the value after ROLL:.
    # At least one more character must follow the ID[...] section.
    regex = re.compile(r"^.*TIME\[([^]]+)\]ID\[ROLL:([^]]+)\].+$")
    def func1(arg) -> bool:
        """Return True when *arg* is a line the record pattern recognises.

        Used as the ``filter`` predicate that drops malformed or truncated
        lines. The result is a real ``bool`` (as annotated) rather than the
        underlying match object, so callers never hold on to match state.
        """
        return regex.match(arg) is not None
    
    
    def func2(arg) -> Tuple[str, int]:
        """Build the ``(time text, roll number)`` pair for one line.

        A line the module-level pattern does not recognise yields
        ``("", 0)``.
        """
        found = regex.match(arg)
        if found is None:
            return "", 0
        time_text, roll_text = found.group(1, 2)
        return time_text, int(roll_text)
    
    def func3(arg) -> int:
        """Return the roll number found in *arg*, or 0 when nothing matches."""
        found = regex.match(arg)
        return int(found.group(2)) if found else 0
    
    # Split b.txt into one ROLL_<number> file per roll number, with each
    # output file ordered by the TIME field.
    with open('b.txt') as fr:
        collection = filter(func1, fr)
        # groupby only merges *adjacent* items, so the roll number has to be
        # the primary sort key. func2 returns (time, roll); reversing it gives
        # (roll, time). Sorting by time first would scatter a roll across
        # several groups, and each mode="w" open would overwrite the last one.
        # NOTE: sorted() holds every matching line in memory at once.
        collection = sorted(collection, key=lambda line: func2(line)[::-1])
        for key, group in groupby(collection, key=func3):
            with open(f"ROLL_{key}", mode="w") as fw:
                fw.writelines(group)
    
    0 讨论(0)
  • 2021-01-26 13:17

    And if you want to read the file in chunks, use this:

    import re
    from functools import partial
    from itertools import groupby
    from typing import Tuple
    
    # Group 1 is the text inside TIME[...], group 2 the value after ROLL:.
    # At least one more character must follow the ID[...] section.
    regex = re.compile(r"^.*TIME\[([^]]+)\]ID\[ROLL:([^]]+)\].+$")
    def func1(arg) -> bool:
        """Return True when *arg* is a line the record pattern recognises.

        Used as the ``filter`` predicate that drops malformed or truncated
        lines. The result is a real ``bool`` (as annotated) rather than the
        underlying match object, so callers never hold on to match state.
        """
        return regex.match(arg) is not None
    
    
    def func2(arg) -> Tuple[str, int]:
        """Build the ``(time text, roll number)`` pair for one line.

        A line the module-level pattern does not recognise yields
        ``("", 0)``.
        """
        found = regex.match(arg)
        if found is None:
            return "", 0
        time_text, roll_text = found.group(1, 2)
        return time_text, int(roll_text)
    
    def func3(arg) -> int:
        """Return the roll number found in *arg*, or 0 when nothing matches."""
        found = regex.match(arg)
        return int(found.group(2)) if found else 0
    
    def read_in_chunks(file_object, chunk_size=1024*1024):
        """Lazily yield successive ``read(chunk_size)`` results.

        Iteration stops at the first falsy result (an empty read), which is
        itself never yielded.
        """
        piece = file_object.read(chunk_size)
        while piece:
            yield piece
            piece = file_object.read(chunk_size)
    
    def _iter_line_batches(chunks):
        """Regroup raw text chunks into lists of complete, newline-ended lines."""
        tail = ""
        for chunk in chunks:
            pieces = (tail + chunk).split("\n")
            # A chunk rarely ends exactly on a line break: the last piece is
            # the start of a line the next chunk will finish ("" if it did).
            tail = pieces.pop()
            if pieces:
                yield [piece + "\n" for piece in pieces]
        if tail:
            # Input did not end with a newline; still emit its final line.
            yield [tail + "\n"]

    # Split b.txt into one ROLL_<number> file per roll number while reading
    # the input in bounded chunks instead of loading it all at once.
    with open('b.txt') as fr:
        started = set()  # roll numbers whose output file this run has created
        for lines in _iter_line_batches(read_in_chunks(fr)):
            collection = filter(func1, lines)
            # groupby only merges adjacent items, so the roll number must be
            # the primary sort key. func2 returns (time, roll); reversed it is
            # (roll, time). Time ordering is per batch, not across the file.
            collection = sorted(collection, key=lambda line: func2(line)[::-1])
            for key, group in groupby(collection, key=func3):
                # "wa" is not a valid mode. Truncate the first time a roll is
                # seen in this run, then append for every later batch.
                mode = "a" if key in started else "w"
                started.add(key)
                with open(f"ROLL_{key}", mode=mode) as fw:
                    fw.writelines(group)
    
    0 讨论(0)
提交回复
热议问题