问题
Similar questions have been asked already but none was able to solve my specific problem. I have a .R
file ("Mycalculus.R") containing many basic calculus that I need to apply to subsets of a dataframe: one subset for each year where the modalities of "year" are factors (yearA, yearB, yearC) not numeric values. The file generates a new dataframe that I need to save in a Rda file. Here is what I expect the code to look like with a for
loop (this one obviously do not work):
id <- identif(unlist(df$year))
for (i in 1:length(id)){
data <- subset(df, year == id[i])
source ("Mycalculus.R", echo=TRUE)
save(content_df1,file="myresults.Rda")
}
Here is an exact of the main data.frame df:
obs year income gender ageclass weight
1 yearA 1000 F 1 10
2 yearA 1200 M 2 25
3 yearB 1400 M 2 5
4 yearB 1350 M 1 11
Here is what the sourced file "Mycalculus.R" do: it applies numerous basic calculus to columns of the dataframe called "data", and creates two new dataframes df1 and then df2 based on df1. Here is an extract:
data <- data %>%
group_by(gender) %>%
mutate(Income_gender = weighted.mean(income, weight))
data <- data %>%
group_by(ageclass) %>%
mutate(Income_ageclass = weighted.mean(income, weight))
library(GiniWegNeg)
gini=c(Gini_RSV(data$Income_gender, weight), Gini_RSV(data$Income_ageclass,weight))
df1=data.frame(gini)
colnames(df1) <- c("Income_gender","Income_ageclass")
rownames(df1) <- c("content_df1")
df2=(1/5)*df1$Income_gender+df2$Income_ageclass
colnames(df2) <- c("myresult")
rownames(df2) <- c("content_df2")
So that in the end, I get two dataframes like this:
Income_Gender Income_Ageclass
content_df1 .... ....
And for df2:
myresult
content_df2 ....
But I need to save df1 and Rf2 as a Rda file where the row names of content_df1 and content_df2 are given per subset, something like this:
Income_Gender Income_Ageclass
content_df1_yearA .... ....
content_df1_yearB .... ....
content_df1_yearC .... ....
and
myresult
content_df2_yearA ....
content_df2_yearB ....
content_df2_yearC ....
Currently, my program does not use any loop and is doing the job but messily. Basically the code is more than 2500 lines of code. (please don't throw tomatoes at me).
Anyone could help me with this specific request? Thank you in advance.
回答1:
Consider incorporating all in one script with a defined function of needed arguments, called by lapply()
. Lapply then returns a list of dataframes that you can rowbind into one final df.
library(dplyr)
library(GiniWegNeg)
runIncomeCalc <- function(data, y){
data <- data %>%
group_by(gender) %>%
mutate(Income_gender = weighted.mean(income, weight))
data <- data %>%
group_by(ageclass) %>%
mutate(Income_ageclass = weighted.mean(income, weight))
gini <- c(Gini_RSV(data$Income_gender, weight), Gini_RSV(data$Income_ageclass,weight))
df1 <- data.frame(gini)
colnames(df1) <- c("Income_gender","Income_ageclass")
rownames(df1) <- c(paste0("content_df1_", y))
return(df1)
}
runResultsCalc <- function(df, y){
df2 <- (1/5) * df$Income_gender + df$Income_ageclass
colnames(df2) <- c("myresult")
rownames(df2) <- c(paste0("content_df2_", y)
return(df2)
}
dfIncList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
runIncomeCalc(yeardata, i)
})
dfResList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
df <- runIncomeCalc(yeardata, i)
runResultsCalc(df, i)
})
df1 <- do.call(rbind, dfIncList)
df2 <- do.call(rbind, dfResList)
Now if you need to source across scripts. Create same two functions, runIncomeCalc and runResultsCalc in Mycalculus.R and then call each in other script:
library(dplyr)
library(GiniWegNeg)
if(!exists("runIncomeCalc", mode="function")) source("Mycalculus.R")
dfIncList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
runIncomeCalc(yeardata, i)
})
dfResList <- lapply(unique(df$year), function(i) {
yeardata <- subset(df, year == i)
df <- runIncomeCalc(yeardata, i)
runResultsCalc(df, i)
})
df1 <- do.call(rbind, dfIncList)
df2 <- do.call(rbind, dfResList)
回答2:
If you functional-ize your steps you can create a workflow like the following:
calcFunc <- function(df) {
## Do something to the df, then return it
df
}
processFunc <- function(fname) {
## Read in your table
x <- read.table(fname)
## Do the calculation
x <- calcFunc(x)
## Make a new file name (remember to change the file extension)
new_fname <- sub("something", "else", fname)
## Write the .RData file
save(x, file = new_fname)
}
### Your workflow
## Generate a vector of files
my_files <- list.files()
## Do the work
res <- lapply(my_files, processFunc)
Alternatively, don't save the files. Omit the save
call in the processFunc
, and return a list of data.frame objects. Then use either data.table::rbindlist(res)
or do.call(rbind, list)
to make one large data.frame object.
来源:https://stackoverflow.com/questions/38750824/loop-over-a-subset-source-a-file-and-save-results-in-a-dataframe