问题
Below, I first find if variables X
and Y
have a value that is repeated less than 4 times. I find and list these values in low
.
I wonder, using BASE R, how can I transform low
which is a list of table
s to my desired output shown below?
Note: The data below is toy, a functional answer is appreciated.
data <- data.frame(id = c(rep("AA",4), rep("BB",2), rep("CC",2)), X = c(1,1,1,1,1,1,3,3),
Y = c(9,9,9,7,6,6,6,6),
Z = 1:8)
mods <- c("X","Y")
A <- setNames(lapply(seq_along(mods), function(i) table(data[[mods[i]]], dnn = NULL)), mods)
low <- setNames(lapply(seq_along(A), function(i) A[[i]][which(A[[i]] < 4)]), names(A))
Desired output:
data.frame(id = c("CC", "AA", "AA"), value = c(3, 7, 9), var.name = c("X", "Y", "Y"), occur = c(2, 1, 3))
# id value var.name occur # `value` comes from the `names(low[[i]])`# i = 1,2
# 1 CC 3 X 2 # `occur` comes from `as.numeric(low[[i]])`
# 2 AA 7 Y 1
# 3 AA 9 Y 3
回答1:
We split the subset of columns of 'data' with 'id', loop through the list
with lapply
, do an inner join with merge
with the corresponding stack
ed 'low' list
of table
s, Filter
out the elements that are having number of rows 0 or length
0 to create 'lst1'. From 'lst1', create additional columns from the inner and outer names
with Map
and rbind
the elements
lst1 <- Filter(length, lapply(split(data[c('X', 'Y')], data$id),
function(dat) Filter(nrow, Map(merge, lapply(dat,
function(x) stack(table(x))), lapply(low, stack)))))
do.call(rbind, c(Map(cbind, id = names(lst1), lapply(lst1,
function(x) do.call(rbind, c(Map(cbind, x, var.name = names(x)),
make.row.names = FALSE)))), make.row.names = FALSE))
# id values ind var.name
#1 AA 1 7 Y
#2 AA 3 9 Y
#3 CC 2 3 X
回答2:
data <- data.frame(id = c(rep("AA",4), rep("BB",2), rep("CC",2)), X = c(1,1,1,1,1,1,3,3),
Y = c(9,9,9,7,6,6,6,6),
Z = 1:8)
to_check <- setdiff(names(data), "id")
results <- vector(mode = "list", length = length(to_check))
# This function sorts the input and then uses the run-length
# encoding to determining values and their frequencies.
# table is an option, but the output is a lot harder
# to work with.
count_occurrences <- function(x, max_occurrences = 3L) {
x <- sort(x)
run_lengths <- rle(x)
chosen <- which(run_lengths$lengths <= max_occurrences)
if (length(chosen)) {
values <- run_lengths[["values"]][chosen]
occur <- run_lengths[["lengths"]][chosen]
} else {
values <- NA
occur <- NA
}
data.frame(value = values, occur = occur)
}
for (k in seq_along(results)) {
# `tapply` will split the first vector based upon the values in `INDEX`
counts <- tapply(data[[to_check[k]]],
INDEX = data$id,
FUN = count_occurrences,
max_occurrences = 3)
# Construct a data.frame of the results, repeating each name for the number
# of rows (values meeting the criterion) returned.
# I've used `unlist(sapply(...)` for the other two because
# not every result in the list will have the same number of rows,
# and `vapply()` requires specifying the output type and shape.
results[[k]] <- data.frame(id = rep(names(counts), times = vapply(counts, nrow, integer(1L))),
value = unlist(sapply(counts, `[[`, "value",
USE.NAMES = FALSE), use.names = FALSE),
occur = unlist(sapply(counts, `[[`, "occur",
USE.NAMES = FALSE), use.names = FALSE))
results[[k]]["var.name"] <- to_check[k]
}
desired_result <- Reduce(rbind, results)
desired_result
# id value occur var.name
# 1 AA NA NA X
# 2 BB 1 2 X
# 3 CC 3 2 X
# 4 AA 7 1 Y
# 5 AA 9 3 Y
# 6 BB 6 2 Y
# 7 CC 6 2 Y
# 8 AA 1 1 Z
# 9 AA 2 1 Z
# 10 AA 3 1 Z
# 11 AA 4 1 Z
# 12 BB 5 1 Z
# 13 BB 6 1 Z
# 14 CC 7 1 Z
# 15 CC 8 1 Z
# subset as [!is.na(value)] to drop the ids with no values with frequencies less
# than `max_occurrence`
来源:https://stackoverflow.com/questions/58895249/changing-a-list-of-tables-to-a-data-frame-in-r