问题
Say I have a list like so:
[1, 4, None, 6, 9, None, 3, 9, 4 ]
I decide to split this into nested lists on None
, to get this:
[ [ 1, 4 ], [ 6, 9 ], [ 3, 9, 4 ] ]
Of course, I could have wanted to do this on (9, None)
in which case, we would have got:
[ [ 1, 4 ], [ 6 ], [ 3 ], [ 4 ] ]
This is trivial to do using list append through iteration ( in a for loop )
I am interested to know whether this can be done in something faster - like a list comprehension?
If not, why not ( for example, a list comprehension cannot return more than one list element per iteration? )
回答1:
>>> def isplit(iterable,splitters):
return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k]
>>> isplit(L,(None,))
[[1, 4], [6, 9], [3, 9, 4]]
>>> isplit(L,(None,9))
[[1, 4], [6], [3], [4]]
benchmark code:
import timeit
kabie=("isplit_kabie",
"""
import itertools
def isplit_kabie(iterable,splitters):
return [list(g) for k,g in itertools.groupby(iterable,lambda x:x in splitters) if not k]
""" )
ssplit=("ssplit",
"""
def ssplit(seq,splitters):
seq=list(seq)
if splitters and seq:
result=[]
begin=0
for end in range(len(seq)):
if seq[end] in splitters:
if end > begin:
result.append(seq[begin:end])
begin=end+1
if begin<len(seq):
result.append(seq[begin:])
return result
return [seq]
""" )
ssplit2=("ssplit2",
"""
def ssplit2(seq,splitters):
seq=list(seq)
if splitters and seq:
splitters=set(splitters).intersection(seq)
if splitters:
result=[]
begin=0
for end in range(len(seq)):
if seq[end] in splitters:
if end > begin:
result.append(seq[begin:end])
begin=end+1
if begin<len(seq):
result.append(seq[begin:])
return result
return [seq]
""" )
emile=("magicsplit",
"""
def _itersplit(l, *splitters):
current = []
for item in l:
if item in splitters:
yield current
current = []
else:
current.append(item)
yield current
def magicsplit(l, splitters):
return [subl for subl in _itersplit(l, *splitters) if subl]
""" )
emile_improved=("magicsplit2",
"""
def _itersplit(l, *splitters):
current = []
for item in l:
if item in splitters:
if current:
yield current
current = []
else:
current.append(item)
if current:
yield current
def magicsplit2(l, splitters):
if splitters and l:
return [i for i in _itersplit(l, *splitters)]
return [list(l)]
""" )
karl=("ssplit_karl",
"""
def ssplit_karl(original,splitters):
indices = [i for (i, x) in enumerate(original) if x in splitters]
ends = indices + [len(original)]
begins = [0] + [x + 1 for x in indices]
return [original[begin:end] for (begin, end) in zip(begins, ends)]
""" )
ryan=("split_on",
"""
from functools import reduce
def split_on (seq, delims, remove_empty=True):
'''Split seq into lists using delims as a delimiting elements.
For example, split_on(delims=2, list=xrange(0,5)) yields [ [0,1], [3,4] ].
delims can be either a single delimiting element or a list or
tuple of multiple delimiting elements. If you wish to use a list
or tuple as a delimiter, you must enclose it in another list or
tuple.
If remove_empty is False, then consecutive delimiter elements or delimiter elements at the beginning or end of the longlist'''
delims=set(delims)
def reduce_fun(lists, elem):
if elem in delims:
if remove_empty and lists[-1] == []:
# Avoid adding multiple empty lists
pass
else:
lists.append([])
else:
lists[-1].append(elem)
return lists
result_list = reduce(reduce_fun, seq, [ [], ])
# Maybe remove trailing empty list
if remove_empty and result_list[-1] == []:
result_list.pop()
return result_list
""" )
cases=(kabie, emile, emile_improved, ssplit ,ssplit2 ,ryan)
data=(
([1, 4, None, 6, 9, None, 3, 9, 4 ],(None,)),
([1, 4, None, 6, 9, None, 3, 9, 4 ]*5,{None,9,7}),
((),()),
(range(1000),()),
("Split me",('','')),
("split me "*100,' '),
("split me,"*100,' ,'*20),
("split me, please!"*100,' ,!'),
(range(100),range(100)),
(range(100),range(101,1000)),
(range(100),range(50,150)),
(list(range(100))*30,(99,)),
)
params="seq,splitters"
def benchmark(func,code,data,params='',times=10000,rounds=3,debug=''):
assert(func.isidentifier())
tester = timeit.Timer(stmt='{func}({params})'.format(
func=func,params=params),
setup="{code}\n".format(code=code)+
(params and "{params}={data}\n".format(params=params,data=data)) +
(debug and """ret=repr({func}({params}))
print({func}.__name__.rjust(16),":",ret[:30]+"..."+ret[-15:] if len(ret)>50 else ret)
""".format(func=func,params=params)))
results = [tester.timeit(times) for i in range(rounds)]
if not debug:
print("{:>16s} takes:{:6.4f},avg:{:.2e},best:{:.4f},worst:{:.4f}".format(
func,sum(results),sum(results)/times/rounds,min(results),max(results)))
def testAll(cases,data,params='',times=10000,rounds=3,debug=''):
if debug:
times,rounds = 1,1
for dat in data:
sdat = tuple(map(repr,dat))
print("{}x{} times:".format(times,rounds),
','.join("{}".format(d[:8]+"..."+d[-5:] if len(d)>16 else d)for d in map(repr,dat)))
for func,code in cases:
benchmark(func,code,dat,params,times,rounds,debug)
if __name__=='__main__':
testAll(cases,data,params,500,10)#,debug=True)
Output on i3-530, Windows7, Python 3.1.2:
500x10 times: [1, 4, N...9, 4],(None,)
isplit_kabie takes:0.0605,avg:1.21e-05,best:0.0032,worst:0.0074
magicsplit takes:0.0287,avg:5.74e-06,best:0.0016,worst:0.0036
magicsplit2 takes:0.0174,avg:3.49e-06,best:0.0017,worst:0.0018
ssplit takes:0.0149,avg:2.99e-06,best:0.0015,worst:0.0016
ssplit2 takes:0.0198,avg:3.96e-06,best:0.0019,worst:0.0021
split_on takes:0.0229,avg:4.59e-06,best:0.0023,worst:0.0024
500x10 times: [1, 4, N...9, 4],{9, None, 7}
isplit_kabie takes:0.1448,avg:2.90e-05,best:0.0144,worst:0.0146
magicsplit takes:0.0636,avg:1.27e-05,best:0.0063,worst:0.0065
magicsplit2 takes:0.0891,avg:1.78e-05,best:0.0064,worst:0.0162
ssplit takes:0.0593,avg:1.19e-05,best:0.0058,worst:0.0061
ssplit2 takes:0.1004,avg:2.01e-05,best:0.0069,worst:0.0142
split_on takes:0.0929,avg:1.86e-05,best:0.0090,worst:0.0096
500x10 times: (),()
isplit_kabie takes:0.0041,avg:8.14e-07,best:0.0004,worst:0.0004
magicsplit takes:0.0040,avg:8.04e-07,best:0.0004,worst:0.0004
magicsplit2 takes:0.0022,avg:4.35e-07,best:0.0002,worst:0.0002
ssplit takes:0.0023,avg:4.59e-07,best:0.0002,worst:0.0003
ssplit2 takes:0.0023,avg:4.53e-07,best:0.0002,worst:0.0002
split_on takes:0.0072,avg:1.45e-06,best:0.0007,worst:0.0009
500x10 times: range(0, 1000),()
isplit_kabie takes:0.8892,avg:1.78e-04,best:0.0881,worst:0.0895
magicsplit takes:0.6614,avg:1.32e-04,best:0.0654,worst:0.0673
magicsplit2 takes:0.0958,avg:1.92e-05,best:0.0094,worst:0.0099
ssplit takes:0.0943,avg:1.89e-05,best:0.0093,worst:0.0095
ssplit2 takes:0.0943,avg:1.89e-05,best:0.0093,worst:0.0096
split_on takes:1.3348,avg:2.67e-04,best:0.1328,worst:0.1340
500x10 times: 'Split me',('', '')
isplit_kabie takes:0.0234,avg:4.68e-06,best:0.0023,worst:0.0024
magicsplit takes:0.0126,avg:2.52e-06,best:0.0012,worst:0.0013
magicsplit2 takes:0.0138,avg:2.76e-06,best:0.0013,worst:0.0015
ssplit takes:0.0119,avg:2.39e-06,best:0.0012,worst:0.0012
ssplit2 takes:0.0075,avg:1.50e-06,best:0.0007,worst:0.0008
split_on takes:0.0191,avg:3.83e-06,best:0.0018,worst:0.0023
500x10 times: 'split m... me ',' '
isplit_kabie takes:2.0803,avg:4.16e-04,best:0.2060,worst:0.2098
magicsplit takes:0.9219,avg:1.84e-04,best:0.0920,worst:0.0925
magicsplit2 takes:1.0221,avg:2.04e-04,best:0.1018,worst:0.1034
ssplit takes:0.8294,avg:1.66e-04,best:0.0818,worst:0.0834
ssplit2 takes:0.9911,avg:1.98e-04,best:0.0983,worst:0.1014
split_on takes:1.5672,avg:3.13e-04,best:0.1543,worst:0.1694
500x10 times: 'split m... me,',' , , , ... , ,'
isplit_kabie takes:2.1847,avg:4.37e-04,best:0.2164,worst:0.2275
magicsplit takes:3.7135,avg:7.43e-04,best:0.3693,worst:0.3783
magicsplit2 takes:3.8104,avg:7.62e-04,best:0.3795,worst:0.3884
ssplit takes:0.9522,avg:1.90e-04,best:0.0939,worst:0.0956
ssplit2 takes:1.0140,avg:2.03e-04,best:0.1009,worst:0.1023
split_on takes:1.5747,avg:3.15e-04,best:0.1563,worst:0.1615
500x10 times: 'split m...ase!',' ,!'
isplit_kabie takes:3.3443,avg:6.69e-04,best:0.3324,worst:0.3380
magicsplit takes:2.0594,avg:4.12e-04,best:0.2054,worst:0.2076
magicsplit2 takes:2.1850,avg:4.37e-04,best:0.2180,worst:0.2191
ssplit takes:1.4881,avg:2.98e-04,best:0.1484,worst:0.1493
ssplit2 takes:1.8779,avg:3.76e-04,best:0.1868,worst:0.1920
split_on takes:2.9596,avg:5.92e-04,best:0.2946,worst:0.2980
500x10 times: range(0, 100),range(0, 100)
isplit_kabie takes:0.9445,avg:1.89e-04,best:0.0933,worst:0.1023
magicsplit takes:0.5878,avg:1.18e-04,best:0.0583,worst:0.0593
magicsplit2 takes:0.5597,avg:1.12e-04,best:0.0554,worst:0.0588
ssplit takes:0.8568,avg:1.71e-04,best:0.0852,worst:0.0874
ssplit2 takes:0.1399,avg:2.80e-05,best:0.0121,worst:0.0242
split_on takes:0.1462,avg:2.92e-05,best:0.0145,worst:0.0148
500x10 times: range(0, 100),range(101, 1000)
isplit_kabie takes:19.9749,avg:3.99e-03,best:1.9789,worst:2.0330
magicsplit takes:9.4997,avg:1.90e-03,best:0.9369,worst:0.9640
magicsplit2 takes:9.4394,avg:1.89e-03,best:0.9267,worst:0.9665
ssplit takes:19.2363,avg:3.85e-03,best:1.8936,worst:1.9516
ssplit2 takes:0.2032,avg:4.06e-05,best:0.0201,worst:0.0205
split_on takes:0.3329,avg:6.66e-05,best:0.0323,worst:0.0344
500x10 times: range(0, 100),range(50, 150)
isplit_kabie takes:1.1394,avg:2.28e-04,best:0.1130,worst:0.1153
magicsplit takes:0.7288,avg:1.46e-04,best:0.0721,worst:0.0760
magicsplit2 takes:0.7220,avg:1.44e-04,best:0.0705,worst:0.0774
ssplit takes:1.0835,avg:2.17e-04,best:0.1059,worst:0.1116
ssplit2 takes:0.1092,avg:2.18e-05,best:0.0105,worst:0.0116
split_on takes:0.1639,avg:3.28e-05,best:0.0162,worst:0.0168
500x10 times: [0, 1, 2..., 99],(99,)
isplit_kabie takes:3.2579,avg:6.52e-04,best:0.3225,worst:0.3360
magicsplit takes:2.2937,avg:4.59e-04,best:0.2274,worst:0.2344
magicsplit2 takes:2.6054,avg:5.21e-04,best:0.2587,worst:0.2642
ssplit takes:1.5251,avg:3.05e-04,best:0.1495,worst:0.1729
ssplit2 takes:1.7298,avg:3.46e-04,best:0.1696,worst:0.1858
split_on takes:4.1041,avg:8.21e-04,best:0.4033,worst:0.4291
Slightly modified Ryan's code, hope you don't mind. ssplit was based on the idea of Karl. Added statements handling some special cases to became ssplit2 which is the best solution I may provide.
回答2:
Something like this:
def _itersplit(l, splitters):
current = []
for item in l:
if item in splitters:
yield current
current = []
else:
current.append(item)
yield current
def magicsplit(l, *splitters):
return [subl for subl in _itersplit(l, splitters) if subl]
gets me:
>>> l = [1, 4, None, 6, 9, None, 3, 9, 4 ]
>>> magicsplit(l, None)
[[1, 4], [6, 9], [3, 9, 4]]
>>> magicsplit(l, None, 9)
[[1, 4], [6], [3], [4]]
回答3:
You could find the indices of the "delimiter" elements. For example, the indices of None are 2 and 5 (zero-based).
You could use these, along with the length of the list, to construct a list of tuples [ (0,1), (3,4), (6,8) ]
representing the start indices and end indices of the sublists. Then you can use a list comprehension over this list of tuples to extract sublists.
回答4:
The problem with trying to use a list comprehension for this is that the comprehension is inherently stateless, and you need state to perform the splitting operation. In particular, you need to remember, from one element to the next, what elements were found after the previous 'split' marker.
However, you could use one list comprehension to extract the indices of the split elements, and then another to use those indices to chop up the list. We need to translate the split indices into (begin, end) indices for the necessary 'pieces'. What we'll do is transform the list of split indices into two separate lists of 'begins' and 'ends', and then zip them together.
The whole thing looks like:
splitters = (9, None) # for example
indices = [i for (i, x) in enumerate(original) if x in splitters]
ends = indices + [len(original)]
begins = [0] + [x + 1 for x in indices]
result = [original[begin:end] for (begin, end) in zip(begins, ends)]
回答5:
Here's an implementation using reduce, with a few bells and whistles:
#!/usr/bin/env python
def split_on (delims, seq, remove_empty=True):
'''Split seq into lists using delims as a delimiting elements.
For example, split_on(delims=2, list=xrange(0,5)) yields [ [0,1], [3,4] ].
delims can be either a single delimiting element or a list or
tuple of multiple delimiting elements. If you wish to use a list
or tuple as a delimiter, you must enclose it in another list or
tuple.
If remove_empty is False, then consecutive delimiter elements or delimiter elements at the beginning or end of the longlist'''
if type(delims) not in (type(list()), type(tuple())):
delims = ( delims, )
def reduce_fun(lists, elem):
if elem in delims:
if remove_empty and lists[-1] == []:
# Avoid adding multiple empty lists
pass
else:
lists.append([])
else:
lists[-1].append(elem)
return lists
result_list = reduce(reduce_fun, seq, [ [], ])
# Maybe remove trailing empty list
if remove_empty and result_list[-1] == []:
result_list.pop()
return result_list
mylist = [1, 4, None, 6, 9, None, 3, 9, 4 ]
print(split_on(None, mylist))
print(split_on((9, None), mylist))
来源:https://stackoverflow.com/questions/4322705/split-a-list-into-nested-lists-on-a-value