Stripping everything but alphanumeric chars from a string in Python

前端 未结 11 1298
不思量自难忘°
不思量自难忘° 2020-11-22 10:52

What is the best way to strip all non alphanumeric characters from a string, using Python?

The solutions presented in the PHP variant of this question will probably

11条回答
  •  情歌与酒
    2020-11-22 11:30

    Timing with random strings of ASCII printables:

    from inspect import getsource
    from random import sample
    import re
    from string import printable
    from timeit import timeit
    
    pattern_single = re.compile(r'[\W]')
    pattern_repeat = re.compile(r'[\W]+')
    translation_tb = str.maketrans('', '', ''.join(c for c in map(chr, range(256)) if not c.isalnum()))
    
    
    def generate_test_string(length):
        return ''.join(sample(printable, length))
    
    
    def main():
        for i in range(0, 60, 10):
            for test in [
                lambda: ''.join(c for c in generate_test_string(i) if c.isalnum()),
                lambda: ''.join(filter(str.isalnum, generate_test_string(i))),
                lambda: re.sub(r'[\W]', '', generate_test_string(i)),
                lambda: re.sub(r'[\W]+', '', generate_test_string(i)),
                lambda: pattern_single.sub('', generate_test_string(i)),
                lambda: pattern_repeat.sub('', generate_test_string(i)),
                lambda: generate_test_string(i).translate(translation_tb),
    
            ]:
                print(timeit(test), i, getsource(test).lstrip('            lambda: ').rstrip(',\n'), sep='\t')
    
    
    if __name__ == '__main__':
        main()
    

    Result (Python 3.7):

           Time       Length                           Code                           
    6.3716264850008880  00  ''.join(c for c in generate_test_string(i) if c.isalnum())
    5.7285426190064750  00  ''.join(filter(str.isalnum, generate_test_string(i)))
    8.1875841680011940  00  re.sub(r'[\W]', '', generate_test_string(i))
    8.0002205439959650  00  re.sub(r'[\W]+', '', generate_test_string(i))
    5.5290945199958510  00  pattern_single.sub('', generate_test_string(i))
    5.4417179649972240  00  pattern_repeat.sub('', generate_test_string(i))
    4.6772285089973590  00  generate_test_string(i).translate(translation_tb)
    23.574712151996210  10  ''.join(c for c in generate_test_string(i) if c.isalnum())
    22.829975890002970  10  ''.join(filter(str.isalnum, generate_test_string(i)))
    27.210196289997840  10  re.sub(r'[\W]', '', generate_test_string(i))
    27.203713296003116  10  re.sub(r'[\W]+', '', generate_test_string(i))
    24.008979928999906  10  pattern_single.sub('', generate_test_string(i))
    23.945240008994006  10  pattern_repeat.sub('', generate_test_string(i))
    21.830899796994345  10  generate_test_string(i).translate(translation_tb)
    38.731336012999236  20  ''.join(c for c in generate_test_string(i) if c.isalnum())
    37.942474347000825  20  ''.join(filter(str.isalnum, generate_test_string(i)))
    42.169366310001350  20  re.sub(r'[\W]', '', generate_test_string(i))
    41.933375883003464  20  re.sub(r'[\W]+', '', generate_test_string(i))
    38.899814646996674  20  pattern_single.sub('', generate_test_string(i))
    38.636144253003295  20  pattern_repeat.sub('', generate_test_string(i))
    36.201238164998360  20  generate_test_string(i).translate(translation_tb)
    49.377356811004574  30  ''.join(c for c in generate_test_string(i) if c.isalnum())
    48.408927293996385  30  ''.join(filter(str.isalnum, generate_test_string(i)))
    53.901889764994850  30  re.sub(r'[\W]', '', generate_test_string(i))
    52.130339455994545  30  re.sub(r'[\W]+', '', generate_test_string(i))
    50.061149017004940  30  pattern_single.sub('', generate_test_string(i))
    49.366573111998150  30  pattern_repeat.sub('', generate_test_string(i))
    46.649754120997386  30  generate_test_string(i).translate(translation_tb)
    63.107938601999194  40  ''.join(c for c in generate_test_string(i) if c.isalnum())
    65.116287978999030  40  ''.join(filter(str.isalnum, generate_test_string(i)))
    71.477421126997800  40  re.sub(r'[\W]', '', generate_test_string(i))
    66.027950693998720  40  re.sub(r'[\W]+', '', generate_test_string(i))
    63.315361931003280  40  pattern_single.sub('', generate_test_string(i))
    62.342320287003530  40  pattern_repeat.sub('', generate_test_string(i))
    58.249303059004890  40  generate_test_string(i).translate(translation_tb)
    73.810345625002810  50  ''.join(c for c in generate_test_string(i) if c.isalnum())
    72.593953348005020  50  ''.join(filter(str.isalnum, generate_test_string(i)))
    76.048324580995540  50  re.sub(r'[\W]', '', generate_test_string(i))
    75.106637657001560  50  re.sub(r'[\W]+', '', generate_test_string(i))
    74.681338128997600  50  pattern_single.sub('', generate_test_string(i))
    72.430461594005460  50  pattern_repeat.sub('', generate_test_string(i))
    69.394243567003290  50  generate_test_string(i).translate(translation_tb)
    

    str.maketrans & str.translate is fastest, but includes all non-ASCII characters. re.compile & pattern.sub is slower, but is somehow faster than ''.join & filter.

提交回复
热议问题