So you have an array
1
2
3
60
70
80
100
220
230
250
For a better understanding:
This is a simple algorithm, implemented in Python, that checks whether a value is too far (measured in standard deviations) from the mean of the current cluster:
from math import sqrt
def stat(lst):
    """Calculate the mean and population standard deviation of a sequence.

    The deviation is computed in two passes (mean first, then the average
    of the squared distances to that mean).  The one-pass shortcut
    ``E[x^2] - mean^2`` subtracts two nearly equal numbers; rounding can
    push the result below zero, which makes ``sqrt`` raise ``ValueError``
    for near-constant data and loses precision for large-offset data.

    Args:
        lst: a non-empty sized iterable of numbers (it is traversed twice).

    Returns:
        A ``(mean, stdev)`` tuple of floats; ``stdev`` divides by ``len(lst)``
        (population form), so a single-element input gives ``0.0``.

    Raises:
        ZeroDivisionError: if ``lst`` is empty.
    """
    n = float(len(lst))
    mean = sum(lst) / n
    # A sum of squares is never negative, so sqrt() is always defined here.
    variance = sum((x - mean) ** 2 for x in lst) / n
    return mean, sqrt(variance)
def parse(lst, n):
    """Split a sequence of numbers into consecutive clusters.

    Values are consumed in order.  The first two values of every cluster
    are accepted unconditionally; each later value starts a new cluster
    when its distance from the current cluster's mean exceeds ``n`` times
    the cluster's standard deviation (both obtained from ``stat``).

    Args:
        lst: iterable of numbers, taken in the given order.
        n: threshold multiplier applied to the standard deviation.

    Yields:
        One list per cluster.  Every yielded list is a distinct object, so
        the results can safely be collected (e.g. ``list(parse(...))``).
        Nothing is yielded for an empty input.
    """
    cluster = []
    for i in lst:
        if len(cluster) <= 1:  # the first two values go directly in
            cluster.append(i)
            continue

        mean, stdev = stat(cluster)
        if abs(mean - i) > n * stdev:  # check the "distance"
            yield cluster
            # Rebind to a new list instead of clearing in place: the list
            # just yielded may still be held by the consumer.
            cluster = []
        cluster.append(i)

    if cluster:  # yield the last cluster, unless the input was empty
        yield cluster
This returns the clusters you expect for your example when 5 < n < 9:
>>> array = [1, 2, 3, 60, 70, 80, 100, 220, 230, 250]
>>> for cluster in parse(array, 7):
... print(cluster)
[1, 2, 3]
[60, 70, 80, 100]
[220, 230, 250]