I have some data that looks something like this:
ID1 ID2 ID3
ID1 ID4 ID5
ID3 ID5 ID7 ID6
...
...
where each row is a group.
My goal is to map each ID to the set of other IDs that appear in at least one group with it.
So, I timed a few different options, and after a few iterations, came up with the following strategies. I thought that sets2 would be the winner, but listToSet2 was faster for every single type of group.
All of the functions except for listFilter were in the same ballpark - listFilter was much slower.
import random
import collections
# Benchmark fixtures: each one is a list of groups, and each group is a list
# of integer IDs drawn with random.randint (bounds inclusive, drawn with
# replacement, so a group may contain the same ID more than once).
# NOTE(review): no random.seed() call is visible here, so the data presumably
# differs from run to run -- confirm if reproducible timings are needed.

# 100 groups of 5 IDs, IDs in 1..25.
small = [[random.randint(1,25) for _ in range(5)] for i in range(100)]
# 1000 groups of 5 IDs, IDs in 1..250.
medium = [[random.randint(1,250) for _ in range(5)] for i in range(1000)]
# 1000 groups of 5 IDs, but only 25 possible IDs, so IDs repeat across groups.
mediumLotsReps = [[random.randint(1,25) for _ in range(5)] for i in range(1000)]
# 100 groups of 75 IDs each, IDs in 1..250.
bigGroups = [[random.randint(1,250) for _ in range(75)] for i in range(100)]
# 10000 groups of 5 IDs, IDs in 1..2500.
huge = [[random.randint(1,2500) for _ in range(5)] for i in range(10000)]
def sets(groups):
    """Map each ID to the set of other IDs it shares a group with.

    Strategy: for every ordered pair of distinct IDs inside a group, add the
    second ID to the first ID's set.

    Args:
        groups: iterable of groups; each group is a re-iterable collection
            (e.g. a list) of hashable IDs.

    Returns:
        A ``collections.defaultdict(set)`` keyed by ID. An ID that never
        appears alongside a different ID gets no entry of its own.
    """
    results = collections.defaultdict(set)
    for group in groups:
        for member in group:
            for other in group:
                # Compare by value, not identity: equal IDs can be distinct
                # objects (large ints, strings read from a file), and `is`
                # would then wrongly add an ID to its own set.
                if member != other:
                    results[member].add(other)
    return results
def listToSet(groups):
    """Map each ID to the set of other IDs it shares a group with.

    Strategy: accumulate, per ID, a plain list of its group-mates (every
    element of the group except the one at the ID's own position), then
    convert each list to a set once at the end.

    Args:
        groups: iterable of groups; each group is a list of hashable IDs.

    Returns:
        A dict mapping every ID seen to a set of the other IDs found in its
        groups. An ID never maps to itself.
    """
    results = collections.defaultdict(list)
    for group in groups:
        for i, member in enumerate(group):
            # group[i + 1:], not group[i:], so position i itself is left out.
            results[member] += group[:i] + group[i + 1:]
    # A group may hold the same ID twice; the positional slice only removes
    # one occurrence, so drop the key from its own set explicitly.
    # .items() (rather than .iteritems()) works on both Python 2 and 3.
    return {k: set(v) - {k} for k, v in results.items()}
def listToSet2(groups):
    """Map each ID to the set of other IDs it shares a group with.

    Strategy: append the whole group to a per-ID list for every member, then
    convert each list to a set and remove the ID itself in one final pass.

    Args:
        groups: iterable of groups; each group is an iterable of hashable IDs
            that can be traversed more than once (e.g. a list).

    Returns:
        A dict mapping every ID seen to a set of the other IDs found in its
        groups. An ID never maps to itself.
    """
    results = collections.defaultdict(list)
    for group in groups:
        for member in group:
            results[member] += group
    # .items() (rather than .iteritems()) works on both Python 2 and 3.
    return {k: set(v) - {k} for k, v in results.items()}
def sets2(groups):
    """Map each ID to the set of other IDs it shares a group with.

    Strategy: union the whole group into each member's set, then subtract
    the ID itself from its own set in one final pass.

    Args:
        groups: iterable of groups; each group is an iterable of hashable IDs
            that can be traversed more than once (e.g. a list).

    Returns:
        A dict mapping every ID seen to a set of the other IDs found in its
        groups. An ID never maps to itself.
    """
    results = collections.defaultdict(set)
    for group in groups:
        # Build the set once per group instead of once per member.
        members = set(group)
        for member in group:
            results[member] |= members
    # .items() (rather than .iteritems()) works on both Python 2 and 3.
    return {k: v - {k} for k, v in results.items()}
def listFilter(groups):
    """Map each ID to a duplicate-free list of other IDs sharing a group.

    Strategy: keep a plain list per ID and only append a group-mate when it
    is not already present. Membership tests on a list are linear scans,
    which is what makes this variant slow compared with the set-based ones.

    Args:
        groups: iterable of groups; each group is an iterable of IDs that can
            be traversed more than once (e.g. a list).

    Returns:
        A ``collections.defaultdict(list)`` keyed by every ID seen. Each
        value lists the other IDs in first-seen order, without repeats and
        without the key itself.
    """
    results = collections.defaultdict(list)
    for group in groups:
        for member in group:
            neighbours = results[member]
            for other in group:
                # Append one at a time so the membership test also sees IDs
                # added earlier from this same group; skip the ID itself.
                if other != member and other not in neighbours:
                    neighbours.append(other)
    return results