How do I find the duplicates in a list and create another list with them?

前端 未结 30 1633
梦谈多话
梦谈多话 2020-11-22 00:56

How can I find the duplicates in a Python list and create another list of the duplicates? The list only contains integers.

30条回答
  •  长发绾君心
    2020-11-22 01:00

    Here are some further tests. Of course, doing...

    # Quadratic: l.count(x) rescans the whole list once for every element x.
    set([x for x in l if l.count(x) > 1])
    

    ...is too costly. It is about 500 times faster (and the longer the list, the bigger the gain) to use the following method:

    def dups_count_dict(l):
        """Return the distinct items that occur more than once in ``l``.

        Occurrences are tallied in a plain dict during one pass over ``l``;
        a second pass over the tally keeps the items seen at least twice.

        Args:
            l: Iterable of hashable items.

        Returns:
            A list containing each duplicated item exactly once. The order
            of the items should not be relied upon.
        """
        counts = {}
        for item in l:
            counts[item] = counts.get(item, 0) + 1

        # dict.items() exists on both Python 2 and Python 3, whereas
        # iteritems() is Python 2 only and raises AttributeError on Python 3.
        return [item for item, count in counts.items() if count > 1]
    

    It needs only two loops and no costly l.count() calls.

    Here is some code that compares the methods. The code is below; this is its output:

    dups_count: 13.368s # this is a function which uses l.count()
    dups_count_dict: 0.014s # this is a final best function (of the 3 functions)
    dups_count_counter: 0.024s # collections.Counter
    

    The testing code:

    import numpy as np
    from time import time
    from collections import Counter
    
    class TimerCounter(object):
        """Stopwatch that accumulates elapsed wall-clock time over many runs.

        Call ``start()`` and then ``stop()`` around each measured section;
        every ``stop()`` adds the interval since the latest ``start()`` to
        a running total.
        """

        def __init__(self):
            # Running total of all measured intervals, in seconds.
            self._time_sum = 0

        def start(self):
            """Record the current time as the beginning of an interval."""
            self.time = time()

        def stop(self):
            """Add the time elapsed since the latest ``start()`` to the total."""
            elapsed = time() - self.time
            self._time_sum = self._time_sum + elapsed

        def get_time_sum(self):
            """Return the accumulated time of all completed intervals."""
            return self._time_sum
    
    
    def dups_count(l):
        """Return the set of items that appear more than once in ``l``.

        This is the naive baseline: ``l.count`` rescans the whole list for
        every element.
        """
        return {item for item in l if l.count(item) > 1}
    
    
    def dups_count_dict(l):
        """Return the distinct items that occur more than once in ``l``.

        Occurrences are tallied in a plain dict during one pass over ``l``;
        a second pass over the tally keeps the items seen at least twice.

        Args:
            l: Iterable of hashable items.

        Returns:
            A list containing each duplicated item exactly once. The order
            of the items should not be relied upon.
        """
        counts = {}
        for item in l:
            counts[item] = counts.get(item, 0) + 1

        # dict.items() exists on both Python 2 and Python 3, whereas
        # iteritems() is Python 2 only and raises AttributeError on Python 3.
        return [item for item, count in counts.items() if count > 1]
    
    
    def dups_counter(l):
        """Return the distinct items that occur more than once in ``l``.

        Uses ``collections.Counter`` to tally the occurrences.

        Args:
            l: Iterable of hashable items.

        Returns:
            A list containing each duplicated item exactly once. The order
            of the items should not be relied upon.
        """
        # Counter.items() works on Python 2.7 and Python 3; iteritems() is
        # Python 2 only and raises AttributeError on Python 3.
        return [item for item, count in Counter(l).items() if count > 1]
    
    
    
    def gen_array():
        """Build the benchmark input: 10000 random integers in [0, 5000).

        NumPy's global random generator is seeded with a fixed value first,
        so every call returns the same sequence.
        """
        np.random.seed(17)
        values = np.random.randint(0, 5000, 10000)
        return list(values)
    
    
    def assert_equal_results(*results):
        """Assert that every result matches the first one.

        Each later result must contain the same set of members as the first
        result and have the same length; otherwise AssertionError is raised.
        """
        reference = results[0]

        for candidate in results[1:]:
            assert (
                set(reference) == set(candidate)
                and len(reference) == len(candidate)
            )
    
    
    if __name__ == '__main__':
        # Benchmark driver: time the three duplicate-finding implementations
        # on the same input and print the accumulated time of each.
        dups_count_time = TimerCounter()
        dups_count_dict_time = TimerCounter()
        dups_count_counter_time = TimerCounter()

        data = gen_array()

        # Three rounds; each timer accumulates its implementation's total.
        for _ in range(3):
            dups_count_time.start()
            result1 = dups_count(data)
            dups_count_time.stop()

            dups_count_dict_time.start()
            result2 = dups_count_dict(data)
            dups_count_dict_time.stop()

            dups_count_counter_time.start()
            result3 = dups_counter(data)
            dups_count_counter_time.stop()

            # All implementations must report the same duplicates.
            assert_equal_results(result1, result2, result3)

        # A parenthesised single argument prints identically whether print
        # is a statement (Python 2) or a function (Python 3).
        print('dups_count: %.3f' % dups_count_time.get_time_sum())
        print('dups_count_dict: %.3f' % dups_count_dict_time.get_time_sum())
        print('dups_count_counter: %.3f' % dups_count_counter_time.get_time_sum())
    

提交回复
热议问题