I found a function to detect outliers in columns, but I do not know how to remove the outliers.
Is there a function for excluding or removing outliers from a column?
Here are two methods for one-dimensional datasets.
import numpy as np
# Function to detect outliers in one-dimensional datasets.
def find_anomalies(data, n_std=3):
    """Return the values of ``data`` that lie far from its mean.

    A value counts as an anomaly when it is strictly more than ``n_std``
    standard deviations above or below the mean of ``data``.  The standard
    deviation is computed with ``np.std`` using its default arguments.

    Args:
        data: One-dimensional iterable of numbers (list, tuple, array, ...).
        n_std: Width of the accepted band, in standard deviations.
            Defaults to 3, the original hard-coded cutoff.

    Returns:
        A new list with the anomalous values, in their original order.
        The list is empty when no value falls outside the band.
    """
    # Set upper and lower limits n_std standard deviations around the mean.
    data_mean = np.mean(data)
    anomaly_cut_off = np.std(data) * n_std
    lower_limit = data_mean - anomaly_cut_off
    upper_limit = data_mean + anomaly_cut_off

    # Collect into a fresh list on every call: a shared module-level list
    # would keep the outliers of earlier calls and leak them into later
    # results.
    return [
        value
        for value in data
        if value > upper_limit or value < lower_limit
    ]
# Interquartile-range (IQR) rule: values further than 1.5 * IQR outside the
# quartiles are treated as outliers.
q1 = np.percentile(data, 25)  # first quartile (25th percentile)
q3 = np.percentile(data, 75)  # third quartile (75th percentile)
iqr = q3 - q1  # spread of the middle half of the data
lower_bound = q1 - 1.5 * iqr  # anything below this is a low outlier
upper_bound = q3 + 1.5 * iqr  # anything above this is a high outlier
(data > upper_bound).sum()  # number of data points above the upper bound