问题
I'm trying to calculate some summary information to help me check for outliers in different groups in a dataset. I can get the sort of output I want using dplyr::group_by() and dplyr::summarise(): a data frame with summary information for each group for a given variable. Something like this:
# Per-species outlier screen for Sepal.Length: range, median, MAD, the
# median +/- 3 * MAD limits, and whether any value falls outside them.
Sepal.Length_outlier_check <- dplyr::summarise(
  dplyr::group_by(iris, Species),
  min = min(Sepal.Length, na.rm = TRUE),
  max = max(Sepal.Length, na.rm = TRUE),
  median = median(Sepal.Length, na.rm = TRUE),
  MAD = mad(Sepal.Length, na.rm = TRUE),
  # The remaining arguments reuse the summaries computed just above.
  MAD_lowlim = median - 3 * MAD,
  MAD_highlim = median + 3 * MAD,
  Outliers_low = any(Sepal.Length < MAD_lowlim, na.rm = TRUE),
  Outliers_high = any(Sepal.Length > MAD_highlim, na.rm = TRUE)
)
Sepal.Length_outlier_check
However, I'd like to be able to put this in a for loop to produce similar summary data frames for each of the different variables in the dataset. I'm new to using loops, but I was thinking it might need to look something like this:
# Asker's first attempt at looping over the columns (does not work as posted).
# list(colnames(iris)) is a one-element list holding the whole vector of
# column names, so the loop body runs once with `i` set to that vector.
vars <- list(colnames(iris))
for (i in vars) {
# Inside summarise(), `i` is a character value rather than a column of iris,
# so the statistics below are not computed on the data.
x <- iris %>%
dplyr::group_by(Species) %>%
dplyr::summarise(min = min(i, na.rm = TRUE),
max = max(i, na.rm = TRUE),
median = median(i, na.rm = TRUE),
MAD = mad(i, na.rm = TRUE),
MAD_lowlim = median - (3 * MAD),
MAD_highlim = median + (3 * MAD),
Outliers_low = any(i < MAD_lowlim, na.rm = TRUE),
Outliers_high = any(i > MAD_highlim, na.rm = TRUE)
)
# Builds an object name of the form "<i>_Outlier_check" and assigns x to it.
assign(paste(i, "Outlier_check", sep = "_"), x)
}
I know that doesn't work, though, because in the summary functions i isn't actually referencing any data. I'm not sure what I need to do to make it work! I'd be very grateful for your help, or any suggestions for how to accomplish all of this more elegantly.
I'm reluctant to use dplyr::summarise_all() because it outputs one summary table for all the variables, and as the real dataset I'm working on has many variables this summary table would become too large to be able to easily review it.
Thanks.
回答1:
You could also create these per-variable/species summaries without loops or separate functions, simply by gathering the non-Species columns into long format with gather(), grouping, and summarizing:
library(tidyverse)
# Reshape iris to long form (one row per Species / variable / value), then
# compute the outlier summary once per variable-and-species combination.
iris.summary <- iris %>%
  gather(key = "variable", value = "value", -Species) %>%
  group_by(variable, Species) %>%
  summarise(
    min = min(value, na.rm = TRUE),
    max = max(value, na.rm = TRUE),
    median = median(value, na.rm = TRUE),
    MAD = mad(value, na.rm = TRUE),
    # Limits are the group median plus or minus three MADs.
    MAD_lowlim = median - 3 * MAD,
    MAD_highlim = median + 3 * MAD,
    Outliers_low = any(value < MAD_lowlim, na.rm = TRUE),
    Outliers_high = any(value > MAD_highlim, na.rm = TRUE)
  )
variable Species min max median MAD MAD_lowlim MAD_highlim Outliers_low Outliers_high
<chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl>
1 Petal.Length setosa 1 1.9 1.5 0.148 1.06 1.94 TRUE FALSE
2 Petal.Length versicolor 3 5.1 4.35 0.519 2.79 5.91 FALSE FALSE
3 Petal.Length virginica 4.5 6.9 5.55 0.667 3.55 7.55 FALSE FALSE
4 Petal.Width setosa 0.1 0.6 0.2 0 0.2 0.2 TRUE TRUE
5 Petal.Width versicolor 1 1.8 1.3 0.222 0.633 1.97 FALSE FALSE
6 Petal.Width virginica 1.4 2.5 2 0.297 1.11 2.89 FALSE FALSE
7 Sepal.Length setosa 4.3 5.8 5 0.297 4.11 5.89 FALSE FALSE
8 Sepal.Length versicolor 4.9 7 5.9 0.519 4.34 7.46 FALSE FALSE
9 Sepal.Length virginica 4.9 7.9 6.5 0.593 4.72 8.28 FALSE FALSE
10 Sepal.Width setosa 2.3 4.4 3.4 0.371 2.29 4.51 FALSE FALSE
11 Sepal.Width versicolor 2 3.4 2.8 0.297 1.91 3.69 FALSE FALSE
12 Sepal.Width virginica 2.2 3.8 3 0.297 2.11 3.89 FALSE FALSE
回答2:
This is actually pretty tricky and I wondered the same myself when I asked this question.
Here is one way to do it
# Summarise every numeric column of iris by Species and keep each result.
#
# Each column is copied in turn into a helper column (`artificialcolumn`) so
# that summarise() can refer to it by a fixed name.
numeric_cols <- names(iris)[vapply(iris, is.numeric, logical(1))]
outlier_checks <- vector("list", length(numeric_cols))
names(outlier_checks) <- numeric_cols
for (i in numeric_cols) {
  # Add the helper column to a copy so the original iris is left untouched.
  iris_tmp <- iris
  iris_tmp$artificialcolumn <- iris[[i]]
  print(i)
  x <- iris_tmp %>%
    dplyr::group_by(Species) %>%
    dplyr::summarise(
      min = min(artificialcolumn, na.rm = TRUE),
      max = max(artificialcolumn, na.rm = TRUE),
      median = median(artificialcolumn, na.rm = TRUE),
      MAD = mad(artificialcolumn, na.rm = TRUE),
      MAD_lowlim = median - (3 * MAD),
      MAD_highlim = median + (3 * MAD),
      Outliers_low = any(artificialcolumn < MAD_lowlim, na.rm = TRUE),
      Outliers_high = any(artificialcolumn > MAD_highlim, na.rm = TRUE)
    )
  # Store every summary; `x` alone only retains the last iteration.
  outlier_checks[[i]] <- x
}
# `x` is the summary for the last column processed; all summaries are in
# `outlier_checks`, e.g. outlier_checks$Sepal.Length.
x
and the result (x holds only the summary from the last iteration, Petal.Width):
> x
# A tibble: 3 x 9
Species min max median MAD MAD_lowlim MAD_highlim Outliers_low Outliers_high
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl>
1 setosa 0.1 0.6 0.2 0 0.2 0.2 TRUE TRUE
2 versicolor 1 1.8 1.3 0.222 0.633 1.97 FALSE FALSE
3 virginica 1.4 2.5 2 0.297 1.11 2.89 FALSE FALSE
The loop is restricted to the four numeric columns because the fifth column (Species) is a factor; including it would return an error.
回答3:
The main problem can be solved by using get(i).
As for the results, it's better to save them in a list instead of having several (in this case 4) unrelated objects in the global environment.
library(dplyr)
# Build one outlier summary per measurement column, stored in a named list.
# setdiff() drops "Species" and, unlike vars[-which(...)], cannot empty the
# vector when there is no match.
vars <- setdiff(colnames(iris), "Species")
# Name the preallocated slots so that assigning by name fills them instead of
# appending new elements after the empty ones.
Outlier_check <- setNames(vector("list", length(vars)), vars)
for (i in vars) {
  Outlier_check[[i]] <- iris %>%
    group_by(Species) %>%
    summarise(
      # get(i) looks up the column whose name is stored in `i`.
      min = min(get(i), na.rm = TRUE),
      max = max(get(i), na.rm = TRUE),
      median = median(get(i), na.rm = TRUE),
      MAD = mad(get(i), na.rm = TRUE),
      MAD_lowlim = median - (3 * MAD),
      MAD_highlim = median + (3 * MAD),
      Outliers_low = any(get(i) < MAD_lowlim, na.rm = TRUE),
      Outliers_high = any(get(i) > MAD_highlim, na.rm = TRUE)
    )
}
Outlier_check$Sepal.Length
## A tibble: 3 x 9
# Species min max median MAD MAD_lowlim
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 setosa 4.3 5.8 5 0.297 4.11
#2 versic… 4.9 7 5.9 0.519 4.34
#3 virgin… 4.9 7.9 6.5 0.593 4.72
## ... with 3 more variables: MAD_highlim <dbl>,
## Outliers_low <lgl>, Outliers_high <lgl>
回答4:
You can also write a function to make it easier and more flexible. Using the tidy evaluation approach, you would use rlang::sym() to convert a string to a variable, then unquote it inside summarise() with !! (bang bang).
library(dplyr)
#' Summarise one variable per group to screen for outliers
#'
#' For each group, computes the minimum, maximum, median and MAD of the
#' checked column, the limits median +/- 3 * MAD, and whether any value lies
#' below the lower or above the upper limit.
#'
#' @param df A data frame.
#' @param .groupvar Name of the grouping column, as a string.
#' @param .checkvar Name of the column to check, as a string.
#' @return A tibble with one row per group and the columns `min`, `max`,
#'   `median`, `MAD`, `MAD_lowlim`, `MAD_highlim`, `Outliers_low` and
#'   `Outliers_high`.
check_outlier <- function(df, .groupvar, .checkvar) {
  # Fail early with a readable message when a column name is wrong.
  requested <- c(as.character(.groupvar), as.character(.checkvar))
  unknown <- setdiff(requested, colnames(df))
  if (length(unknown) > 0) {
    stop(
      "Column(s) not found in `df`: ", paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }

  # Turn the names into symbols so they can be unquoted with `!!`
  # (dplyr makes rlang's sym() available under its own namespace).
  group_sym <- dplyr::sym(.groupvar)
  check_sym <- dplyr::sym(.checkvar)

  # Fully qualified, pipe-free calls: works without attaching dplyr.
  dplyr::summarise(
    dplyr::group_by(df, !!group_sym),
    min = min(!!check_sym, na.rm = TRUE),
    max = max(!!check_sym, na.rm = TRUE),
    median = median(!!check_sym, na.rm = TRUE),
    MAD = mad(!!check_sym, na.rm = TRUE),
    MAD_lowlim = median - (3 * MAD),
    MAD_highlim = median + (3 * MAD),
    # Parenthesise the unquote so the comparison does not depend on how
    # `!!` binds relative to `<` and `>`.
    Outliers_low = any((!!check_sym) < MAD_lowlim, na.rm = TRUE),
    Outliers_high = any((!!check_sym) > MAD_highlim, na.rm = TRUE)
  )
}
# test function
check_outlier(iris, "Species", "Sepal.Length")
#> # A tibble: 3 x 9
#> Species min max median MAD MAD_lowlim MAD_highlim Outliers_low
#> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
#> 1 setosa 4.3 5.8 5 0.297 4.11 5.89 FALSE
#> 2 versic~ 4.9 7 5.9 0.519 4.34 7.46 FALSE
#> 3 virgin~ 4.9 7.9 6.5 0.593 4.72 8.28 FALSE
#> # ... with 1 more variable: Outliers_high <lgl>
Loop through all variables and combine the results into a single data frame using purrr::map_df():
library(purrr)
# Run check_outlier() for every measurement column and stack the results,
# labelling each block of rows with the variable it came from.
vars <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")
map_df(
  set_names(vars),
  function(v) check_outlier(iris, "Species", v),
  .id = "Variable"
)
#> # A tibble: 12 x 10
#> Variable Species min max median MAD MAD_lowlim MAD_highlim
#> <chr> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Sepal.L~ setosa 4.3 5.8 5 0.297 4.11 5.89
#> 2 Sepal.L~ versic~ 4.9 7 5.9 0.519 4.34 7.46
#> 3 Sepal.L~ virgin~ 4.9 7.9 6.5 0.593 4.72 8.28
#> 4 Sepal.W~ setosa 2.3 4.4 3.4 0.371 2.29 4.51
#> 5 Sepal.W~ versic~ 2 3.4 2.8 0.297 1.91 3.69
#> 6 Sepal.W~ virgin~ 2.2 3.8 3 0.297 2.11 3.89
#> 7 Petal.L~ setosa 1 1.9 1.5 0.148 1.06 1.94
#> 8 Petal.L~ versic~ 3 5.1 4.35 0.519 2.79 5.91
#> 9 Petal.L~ virgin~ 4.5 6.9 5.55 0.667 3.55 7.55
#> 10 Petal.W~ setosa 0.1 0.6 0.2 0 0.2 0.2
#> 11 Petal.W~ versic~ 1 1.8 1.3 0.222 0.633 1.97
#> 12 Petal.W~ virgin~ 1.4 2.5 2 0.297 1.11 2.89
#> # ... with 2 more variables: Outliers_low <lgl>, Outliers_high <lgl>
Created on 2018-10-20 by the reprex package (v0.2.1.9000)
来源:https://stackoverflow.com/questions/52908192/how-to-correctly-use-group-by-and-summarise-in-a-for-loop-in-r