问题
I am trying to tell R to read through the rows of my data frame and put the name of the column with the highest value in each row into a new column of the data frame called "MOST_COMMON_CANCER".
I tried the following code but got an error.
# This attempt raises the error shown below: BASE_DF2 is a grouped_df, so
# mutate() evaluates the expression once per group, while BASE_DF2[8:26]
# still spans every row of the data frame.
BASE_DF2 <- BASE_DF2%>%mutate(MOST_COMMON_CANCER=colnames(BASE_DF2[8:26])[max.col(BASE_DF2[8:26],ties.method="first")],.keep="all",.after=c_INCS_RATE)
Error: Problem with `mutate()` input `MOST_COMMON_CANCER`.
x Input `MOST_COMMON_CANCER` can't be recycled to size 1.
i Input `MOST_COMMON_CANCER` is `colnames(BASE_DF2[8:26])[max.col(BASE_DF2[8:26], ties.method = "first")]`.
i Input `MOST_COMMON_CANCER` must be size 1, not 490.
i The error occurred in group 1: YEAR_OF_DIAGNOSIS = "2015", STATE_ABBR = "CA", COUNTY_NAME = "ALAMEDA".
Here is the dput() output for my data frame, though I've shrunk it from the original 80 columns:
dput(head(BASE_DF2[1:31]))
structure(list(YEAR_OF_DIAGNOSIS = structure(c(1L, 2L, 1L, 2L,
1L, 2L), .Label = c("2015", "2016"), class = "factor"), STATE_ABBR = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("CA", "KY", "MA", "NM", "NY"), class = "factor"),
COUNTY_NAME = c("ALAMEDA", "ALAMEDA", "AMADOR", "AMADOR",
"BUTTE", "BUTTE"), AGE_AT_DIAGNOSIS = c(64.0595588235294,
64.4077743902439, 65.5079365079365, 66, 66.5040322580645,
66.4507575757576), `%_<_HIGH_SCHOOL_EDUCATION` = c(12.46,
12.46, 10.29, 10.29, 11.25, 11.25), `%_PERSONS_<150%_OF_POVERTY` = c(17.82,
17.82, 18.68, 18.68, 31.63, 31.63), `MEDIAN_FAMILY_INCOME_(IN_TENS)_ACS_2013-2017` = c(10360,
10360, 7415, 7415, 6105, 6105), Leukemia = c(59, 72, 0, 3,
13, 6), Miscellaneous = c(33, 36, 2, 3, 3, 4), Colorectal = c(124,
124, 6, 7, 25, 24), Musculoskeletal = c(10, 15, 1, 0, 3,
2), Brain_Nervous_System = c(26, 20, 1, 1, 2, 2), Breast = c(208,
214, 8, 10, 37, 42), Cervical_Uterine = c(54, 73, 2, 1, 7,
10), UGI_Tract = c(52, 51, 5, 1, 17, 9), Head = c(91, 65,
3, 1, 15, 15), Pancreatic_Biliary = c(104, 80, 5, 4, 10,
13), Lymphoma = c(56, 77, 1, 4, 15, 22), Throat = c(17, 19,
0, 0, 2, 2), Kidney_Ureter = c(48, 45, 5, 1, 5, 7), Lung = c(154,
128, 8, 6, 33, 37), Skin_Melanoma = c(80, 52, 9, 5, 17, 25
), Female_reproductive = c(28, 32, 0, 2, 6, 2), Male_reproductive = c(6,
9, 1, 0, 1, 3), Bladder = c(54, 53, 2, 2, 10, 7), Prostate = c(156,
147, 4, 2, 27, 32), TOTAL_CANCER = c(1360, 1312, 63, 53,
248, 264), c_INCS_RATE = c(0.000832039389723579, 0.000794693964081287,
0.00170127730820124, 0.00141601432044671, 0.00110403283607338,
0.0011669488266418), population = c(1634538, 1650950, 37031,
37429, 224631, 226231), AIR_1990 = c(3889287, 3889287, 222121,
222121, 252194, 252194), OnSite_LAND_1990 = c(231928, 231928,
1460, 1460, 515, 515)), row.names = c(NA, -6L), groups = structure(list(
YEAR_OF_DIAGNOSIS = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("2015",
"2016"), class = "factor"), STATE_ABBR = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Label = c("CA", "KY", "MA", "NM", "NY"
), class = "factor"), COUNTY_NAME = c("ALAMEDA", "AMADOR",
"BUTTE", "ALAMEDA", "AMADOR", "BUTTE"), .rows = structure(list(
1L, 3L, 5L, 2L, 4L, 6L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Run `rlang::last_error()` to see where the error occurred.
I was able to get the output below, which I imagine I could assign to a vector and then add to the data frame, but I would like to keep things neat and streamlined.
colnames(BASE_DF2[8:26])[max.col(BASE_DF2[8:26],ties.method="first")]
[1] "Breast" "Breast" "Skin_Melanoma" "Breast" "Breast"
[6] "Breast"
My question was flagged because of a similar question. It is similar because I used that question as the basis for my code; however, I have additional parameters that have me stuck.
回答1:
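Notice that your data is grouped, so mutate() evaluates the expression once per group (each group here has a single row) while the expression returns one value per row of the whole data frame. Call ungroup() first. You can also use `.`
to refer to the data frame inside the pipe.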
library(dplyr)
# Drop the grouping first so that mutate() works on the whole data frame at
# once; inside the pipe, `.` stands for that ungrouped data frame. Columns
# 8:26 are searched row by row and the first maximum wins on ties.
BASE_DF2 %>%
  ungroup() %>%
  mutate(
    MOST_COMMON_CANCER = colnames(.[8:26])[
      max.col(.[8:26], ties.method = "first")
    ],
    .after = c_INCS_RATE
  )
回答2:
What about the following?
# Toy data: two numeric columns whose row-wise maximum switches sides.
df <- data.frame(first = c(1, 2, 3, 4, 5), second = c(5, 4, 3, 2, 1))
col.names <- colnames(df)
# For every row, look up the name of the column that holds the row maximum.
# apply() coerces `df` to a matrix first, so every column passed in should be
# numeric.
max.col.names <- apply(
  df,
  MARGIN = 1,
  function(row) {
    # `row == max(row)` flags every column tied for the maximum. There is a
    # chance that several columns share that value, so which(...)[1] keeps
    # only the first of them and exactly one name comes back per row.
    # A row containing NA yields NA.
    col.names[which(row == max(row))[1]]
  }
)
# Print the result; it can also be assigned as a new column of the data frame.
max.col.names
HTH
来源:https://stackoverflow.com/questions/65198238/add-column-to-my-data-frame-listing-columns-with-the-highest-row-value