Bin formation in a R data.frame

。_饼干妹妹 提交于 2020-01-14 05:23:06

问题


I have a data.frame with two columns:

category quantity
a          20
b          30
c          100
d          10
e          1
f          23
g          3
h          200

I need to write a function with two parameters: dataframe, bin_size which runs a cumsum over the quantity column, does a split of the subsequent row if the the cumsum exceeds the bin_size and adds a running bin number as an additional column.

Say, by entering this:

function(dataframe, 50)

in the above example should give me:

category    quantity    cumsum  bin_nbr
a            20        20         1
b            30        50         1
c            50        50         2
c            50        50         3
d            10        10         4
e            1         11         4
f           23         34         4
g            3         37         4
h            13        50         4
h            50        50         5
h            50        50         6
h            50        50         7
h            37        37         8

Explanation:

row a + b sum up to 50 --> bin_nbr 1
row c is 100 -> split into 2 rows @ 50 -> bin nbr 2, bin_nbr 3
row d,e,f,g sum up to 37 -> bin_nbr 4
I need another 13 from row h to fill in bin_nbr 4 to 50
The rest of the remaining quantity from h will be spitted into 4 bins -> bin_nbr 5, 6, 7, 8

回答1:


I couldn't think of a clean way to do this with apply/data.table etc since you have an inter-row dependency and a changing size data frame. You can probably do it in an iterative/recursive manner, but I felt it would be quicker to figure out to just write the loop. One challenge is that it is difficult to know the final size of your object, so this is likely to be slow. You can mitigate the problem somewhat by switching from a df to a matrix (code should work fine, except transform bits) if performance is an issue in this application.

fun <- function(df, binsize){
  df$cumsum <- cumsum(df$quantity)
  df$bin <- 1
  i <- 1
  repeat {
    if((extra <- (df[i, "cumsum"] - binsize)) > 0) { # Bin finished halfway through
      top <- if(i > 1L) df[1L:(i - 1L), ] else df[0L, ]
      mid <- transform(df[i, ], quantity=quantity-extra, cumsum=cumsum-extra)
      bot <- transform(df[i, ], quantity=extra, cumsum=extra, bin=bin + 1L)
      end <- if(i >= nrow(df)) df[0L, ] else df[(i + 1L):nrow(df), ]
      end <- transform(end, cumsum=cumsum(end$quantity) + extra, bin=bin + 1L)
      df <- rbind(top, mid, bot, end)
    } else if (extra == 0 && nrow(df) > i) {  # Bin finished cleanly
      df[(i + 1L):nrow(df), ]$cumsum <- df[(i + 1L):nrow(df), ]$cumsum - binsize
      df[(i + 1L):nrow(df), ]$bin <- df[(i + 1L):nrow(df), ]$bin + 1L
    }
    if(nrow(df) < (i <- i + 1)) break
  }
  rownames(df) <- seq(len=nrow(df))
  df
}
fun(df, binsize) 

#    category quantity cumsum bin
# 1         a       20     20   1
# 2         b       30     50   1
# 3         c       50     50   2
# 4         c       50     50   3
# 5         d       10     10   4
# 6         e        1     11   4
# 7         f       23     34   4
# 8         g        3     37   4
# 9         h       13     50   4
# 10        h       50     50   5
# 11        h       50     50   6
# 12        h       50     50   7
# 13        h       37     37   8



回答2:


Another solution with a loop:

DF <- read.table(text="category quantity
a          20
b          30
c          100
d          10
e          1
f          23
g          3
h          200", header=TRUE)

bin_size <- 50
n_bin <- ceiling(sum(DF$quantity)/bin_size)

DF$bin <- findInterval(cumsum(DF$quantity)-1, c(0, seq_len(n_bin)*50))
DF$cumsum <- cumsum(DF$quantity)

result <- lapply(seq_along(DF[,1]), function(i, df) {
  if (i==1) {
    d <- df[i, "bin"]
  } else {
    d <- df[i, "bin"]-df[i-1, "bin"]
  }
  if (d > 1) {    
    res <- data.frame(
      category = df[i, "category"],
      bin_nbr = df[i, "bin"]-seq_len(d+1)+1
    )        
    res[,"quantity"] <- bin_size
    if (i!=1) {
      res[nrow(res),"quantity"] <- df[i-1, "bin"]*bin_size-df[i-1, "cumsum"]
    }  else {
      res[nrow(res),"quantity"] <- 0
    }
    res[1,"quantity"] <- df[i, "quantity"]-sum(res[-1,"quantity"])
    return(res[res$quantity > 0,])
  } else {
    return(data.frame(
      category = df[i, "category"],
      quantity = df[i, "quantity"],
      bin_nbr = df[i, "bin"]
    ))
  }
}, df=DF)

res <- do.call(rbind, result)
res <- res[order(res$category, res$bin_nbr),]
library(plyr)
res <- ddply(res, .(bin_nbr), transform, cumsum=cumsum(quantity))
res

#    category quantity bin_nbr cumsum
# 1         a       20       1     20
# 2         b       30       1     50
# 3         c       50       2     50
# 4         c       50       3     50
# 5         d       10       4     10
# 6         e        1       4     11
# 7         f       23       4     34
# 8         g        3       4     37
# 9         h       13       4     50
# 10        h       50       5     50
# 11        h       50       6     50
# 12        h       50       7     50
# 13        h       37       8     37



回答3:


This amounts to merging the bin boundaries with the data which gives this loop-free solution:

library(zoo)

fun <- function(DF, binsize = 50) {
  nr <- nrow(DF)
  DF2 <- data.frame(cumsum = seq(0, sum(DF$quantity), binsize) + binsize, bin_nbr = 1:nr)
  DF.cs <- transform(DF, cumsum = cumsum(DF$quantity))
  m <- na.locf(merge(DF.cs, DF2, all = TRUE), fromLast = TRUE)
  m$bin_nbr <- as.numeric(m$bin_nbr)
  cs <- as.numeric(m$cumsum)
  m$quantity <- c(cs[1], diff(cs))
  m$cumsum <- ave(m$quantity, m$bin_nbr, FUN = cumsum)
  na.omit(m)[c("category", "quantity", "cumsum", "bin_nbr")]
}

giving:

> fun(DF)
   category quantity cumsum bin_nbr
1         a       20     20       1
2         b       30     50       1
3         c       50     50       2
4         c       50     50       3
5         d       10     10       4
6         e        1     11       4
7         f       23     34       4
8         g        3     37       4
9         h       13     50       4
10        h       50     50       5
11        h       50     50       6
12        h       50     50       7
13        h       37     37       8

Note: For purposes of reproducing the result above this is the input we used:

Lines <- "category quantity
a          20
b          30
c          100
d          10
e          1
f          23
g          3
h          200
"
DF <- read.table(text = Lines, header = TRUE, as.is = TRUE)

REVISION An error in the code was corrected.



来源:https://stackoverflow.com/questions/21046209/bin-formation-in-a-r-data-frame

标签
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!