How to take subsets of lists in a tibble

北战南征 提交于 2020-01-05 04:40:13

问题


I have annual financial data for several stocks. I needed to blow it out to become monthly data and, thanks to an answer to this question I'd asked earlier, I have a solution which involves mutating the date column into lists of dates:

library(tidyverse)
library(lubridate)

factors.subset.raw = structure(list(
    sec_id = c(1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1572L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L, 1676L), 
    metric = c("EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "EPS_GROWTH", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY", "ND_EQUITY"), 
    date = structure(c(9464, 9829, 10193, 10560, 10925, 11291, 11656, 12020, 12384, 12752, 13117, 13482, 13847, 14211, 14578, 14943, 15308, 15674, 16038, 16402, 16769, 17135, 9342, 9708, 10073, 10438, 10802, 11200, 11565, 13756, 14120, 14487, 14852, 15217, 15583, 15947, 16311, 16678, 17044, 9464, 9829, 10193, 10560, 10925, 11291, 11656, 12020, 12384, 12752, 13117, 13482, 13847, 14211, 14578, 14943, 15308, 15674, 16038, 16402, 16769, 17135, 9342, 9708, 10073, 10438, 10802, 11200, 11565, 13756, 14120, 14487, 14852, 15217, 15583, 15947, 16311, 16678, 17044), 
    class = "Date"), value = c(0.250468, 0.091548, -0.100863, 0.058375, 0.24784, 0.178765, 0.099276, 0.25472, -0.033291, 0.124165, 0.050947, 0.243008, 0.1205, -0.239625, -0.231221, 0.365649, 0.163779, 0.024976, 0.08388, 0.154777, 0.016473, -0.272928, -0.018711, -0.162076, -0.599241, -4.071504, -0.37761, 1.694085, 0.045113, 0.329818, 0.199564, -0.616418, 1.164773, 0.877078, -0.325099, -0.294199, 0.272016, -0.706077, -2.57027, 4.500261, 4.734375, 4.090376, 3.322846, 3.640895, 4.645253, 4.783054, 3.946184, 3.847828, 4.077601, 4.778736, 5.453883, 5.14355, 5.084551, 3.370378, 3.076065, 2.812879, 2.87688, 2.430692, 3.029766, 3.062665, 3.349906, 0.396299, 0.60174, 0.527478, 1.048755, 1.136417, 0.668333, 0.523115, 0.259175, 0.164024, 0.118469, 0.061141, 0.096251, 0.346829, 0.401832, 0.300988, 0.344943, 0.432505)), 
    row.names = c(NA, -78L), class = c("tbl_df", "tbl", "data.frame"), .Names = c("sec_id", "metric", "date", "value"))

# Within each (sec_id, metric) group, round every observation date up to a
# month boundary, then replace it with a monthly sequence that runs from that
# date to one day before the next observation's date (or to today for the
# last observation in the group). The result has `date` as a list column and
# is still grouped.
factors.subset.monthly <- factors.subset.raw %>%
    group_by(sec_id, metric) %>%
    mutate(
        date = ceiling_date(date, 'month'),
        date = map2(
            date,
            lead(date - 1, default = today()),
            function(from, to) seq(from, to, by = 'month')
        )
    )

Now it suffices to add %>% unnest() %>% mutate(date = date - 1) to the above to transform my annual data to monthly, with all dates being end of month.

My problem occurs when there is a big gap in the data. When this happens, I only want to fill forward at most 18 months.

I've tried adding pipes which cut off the date column, but so far I can't seem to figure it out. This little gem gives me incompatible size errors, for example:

# NOTE(review): this is the asker's failing attempt, kept as-is. Inside
# mutate(), `date` is the list column itself, so `date[1:count.cutoff]`
# selects list elements (rows) rather than trimming the vector stored in each
# element, and `1:count.cutoff` only uses the first value of `count.cutoff`.
# That is presumably where the size mismatch comes from.
factors.subset.monthly %>%
    mutate(count.date = as.numeric(lapply(date, length))) %>%
    mutate(count.cutoff = ifelse(count.date <= 18, count.date, 18)) %>%
    mutate(date = date[1:count.cutoff])

回答1:


You need to use map/lapply to iterate over the list column, but then you can simply use head to limit it to 18 observations:

library(tidyverse)
library(lubridate)

# Keep at most the first 18 dates of every per-row sequence in the list column.
df <- factors.subset.monthly %>%
    mutate(date = map(date, function(dates) head(dates, 18)))

# Before truncation at least one sequence is longer than 18 entries ...
any(lengths(factors.subset.monthly$date) > 18)
#> [1] TRUE
# ... and after truncation none is.
any(lengths(df$date) > 18)
#> [1] FALSE

You could also just include head when you make factors.subset.monthly:

# Build the monthly sequences and cap each one at 18 dates in the same step,
# instead of truncating the list column afterwards.
factors.subset.raw %>%
    group_by(sec_id, metric) %>%
    mutate(date = ceiling_date(date, 'month')) %>%
    mutate(date = map2(
        date,
        lead(date - 1, default = today()),
        function(start, end) head(seq(start, end, by = 'month'), 18)
    ))

You could also pass the earlier of the target date and the date 18 months past the start date as the `to` argument of `seq`, but adding 18 months to a date is somewhat awkward because months have irregular lengths.



来源:https://stackoverflow.com/questions/43991620/how-to-take-subsets-of-lists-in-a-tibble

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!