问题
I want to create a function that creates a new column in a dataframe that starts with all 0's in all rows but will create 1's based on the following. It starts looking at the highest % in the percent column. That will produce a 1 in the newly created "algorithm" column in the same row. Then it will look at the minimum and maximum row of the starting row. Lets say the highest found (starting value) is 13,8% in row 6, the next rows that it will look at are 5 and 7. Then it will look at the percentages in here and decides the highest % and creates a 1 in the "algorithm" column ( lets say it is 8,3% in row 7). Next it will look at the min and max row again ( row 5 and row 8, because row 6&7 are already took into account).
Then an important factor as well is that it has to stop at a certain percentage with looking for more rows, lets say at 95% it is stopping. This is based on the total percentage from the "percent" column that summed up should be 95% .
This is the main idea, but Im not sure how to do this.
Moreover, it also in the end has to look further than the min and max row since those 2 rows can also be both for example be 8%, so it has to look 1 row further and choose that row based on the highest value.
Not tested yet, but this is what im thinking about currently.
(While(total_perc < p_min_performance)
prev_row_value <t (minrow -1)
next_rpw_value <t (maxrow +1)
prev > next > t(prev,) >1
minrow <- minrow-1
maxrow <- maxrow+1
Sample code:
algorithm <- data.frame(pc4 = c(5464),
timeinterval = c('08:45:00', '09:00:00', '09:15:00', '09:30:00',
'09:45:00', '10:00:00', '10:15:00', '10:30:00', '10:45:00', '11:00:00',
'11:15:00', '11:30:00'),
stops = c(1, 5, 8, 7, 5, 10, 6, 4, 7, 6, 5, 8)) %>%
mutate(percent = round(stops/sum(stops), digits = 6)*100) %>%
mutate(idgroup = seq_along(timeinterval))
Not sure where to start yet. EDIT: THe 1's in the algorithm_clumn can also be the corresponding percentages, which maybe makes it easier to count it up until lets say 95%.
The structure should look like this (its an example, the data in the algorithm_column could be anything based on what it is finding in the data)
EDIT:
algorithm
# pc4 timeinterval stops percent idgroup algorithm_column
#1 5464 08:45:00 1 1.3889 1 0
#2 5464 09:00:00 5 6.9444 2 1
#3 5464 09:15:00 8 11.1111 3 1
#4 5464 09:30:00 7 9.7222 4 1
#5 5464 09:45:00 5 6.9444 5 1
#6 5464 10:00:00 10 13.8889 6 1
#7 5464 10:15:00 6 8.3333 7 1
#8 5464 10:30:00 4 5.5556 8 1
#9 5464 10:45:00 7 9.7222 9 1
#10 5464 11:00:00 6 8.3333 10 1
#11 5464 11:15:00 5 6.9444 11 1
#12 5464 11:30:00 8 11.1111 12 0
The code of Ronak is working:
algorithm$algorithm_column <- 0
output <- do.call(rbind, lapply(split(algorithm, algorithm$pc4),
function(x) {
all_index <- x$idgroup
next_comb <- all_index
while(sum(x$percent[x$algorithm_column == 1]) <= 95) {
inds <- next_comb[which.max(x$percent[next_comb])]
x$algorithm_column[inds] <- 1
nos <- which(all_index == inds)
next_comb <- all_index[c(nos - 1, nos + 1)]
all_index <- setdiff(all_index, inds)
}
x
}))
EDIT: The function is not working in some cases because when it reaches two 0's in the next rows it will take the FIRST max of those rows and it will find only 0's in the first part of the data set and then continues to the next highest value. For example, this dataset:
algorithm1 <- data.frame(pc4 = c(8035),
timeinterval = c('03:00:00','03:30:00','04:00:00','04:30:00','05:00:00','05:30:00','06:00:00','06:30:00','07:00:00','07:30:00','08:00:00','08:30:00','09:00:00','09:30:00','10:00:00','10:30:00','11:00:00','11:30:00','12:00:00','12:30:00','13:00:00','13:30:00','14:00:00','14:30:00','15:00:00','15:30:00','16:00:00','16:30:00'),
stops = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 70, 0, 6, 0, 0, 0, 3, 0, 3, 3, 0, 5, 0, 0, 0)) %>%
group_by(pc4) %>%
mutate(percent = round(stops/sum(stops), digits = 6)*100) %>%
mutate(idgroup = seq_along(timeinterval)) %>%
mutate(algorithm_column = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
I added the iteration to the function to see the order, so you can see that it takes the 0's first.
p_min_performance <- 95 # SET PERCENTAGE!
#Naar 0
algorithm1$algorithm_column <- 0
algorithm1$iteration <- 0
it <- 0
algorithm1 <- do.call(rbind, lapply(split(algorithm1, algorithm1$pc4),
function(x) {
#Index voor maximum percentage
all_index <- x$idgroup
next_comb <- all_index
#While loop algorithm
while (sum(x$percent[x$algorithm_column == 1]) <= p_min_performance) {
it <- it + 1
inds <- next_comb[which.max(x$percent[next_comb])]
x$algorithm_column[inds] <- 1
x$iteration[inds] <- it
nos <- which(all_index == inds)
next_comb <- all_index[c(nos - 1, nos + 1)]
all_index <- setdiff(all_index, inds)
}
x
}))
Output how it is now: (where timeinterval will be from 03:00 to 15:00)
pc4 tinterval stops percen id_g a_col iteration
1 8035 03:00:00 0 0.0000 1 1 14
2 8035 03:30:00 0 0.0000 2 1 13
3 8035 04:00:00 0 0.0000 3 1 12
4 8035 04:30:00 0 0.0000 4 1 11
5 8035 05:00:00 0 0.0000 5 1 10
6 8035 05:30:00 0 0.0000 6 1 9
7 8035 06:00:00 0 0.0000 7 1 8
8 8035 06:30:00 0 0.0000 8 1 7
9 8035 07:00:00 0 0.0000 9 1 6
10 8035 07:30:00 0 0.0000 10 1 5
11 8035 08:00:00 0 0.0000 11 1 4
12 8035 08:30:00 0 0.0000 12 1 3
13 8035 09:00:00 9 9.0909 13 1 2
14 8035 09:30:00 70 70.7071 14 1 1
15 8035 10:00:00 0 0.0000 15 1 15
16 8035 10:30:00 6 6.0606 16 1 16
17 8035 11:00:00 0 0.0000 17 1 17
18 8035 11:30:00 0 0.0000 18 1 18
19 8035 12:00:00 0 0.0000 19 1 19
20 8035 12:30:00 3 3.0303 20 1 20
21 8035 13:00:00 0 0.0000 21 1 21
22 8035 13:30:00 3 3.0303 22 1 22
23 8035 14:00:00 3 3.0303 23 1 23
24 8035 14:30:00 0 0.0000 24 1 24
25 8035 15:00:00 5 5.0505 25 1 25
26 8035 15:30:00 0 0.0000 26 0 0
27 8035 16:00:00 0 0.0000 27 0 0
28 8035 16:30:00 0 0.0000 28 0 0
But this should be: (where timeinterval will be from 09:00 to 15:00)
pc4 tinterval stops percen id_g a_col iteration
1 8035 03:00:00 0 0.0000 1 0 0
2 8035 03:30:00 0 0.0000 2 0 0
3 8035 04:00:00 0 0.0000 3 0 0
4 8035 04:30:00 0 0.0000 4 0 0
5 8035 05:00:00 0 0.0000 5 0 0
6 8035 05:30:00 0 0.0000 6 0 0
7 8035 06:00:00 0 0.0000 7 0 0
8 8035 06:30:00 0 0.0000 8 0 0
9 8035 07:00:00 0 0.0000 9 0 0
10 8035 07:30:00 0 0.0000 10 0 0
11 8035 08:00:00 0 0.0000 11 0 0
12 8035 08:30:00 0 0.0000 12 0 0
13 8035 09:00:00 9 9.0909 13 1 2
14 8035 09:30:00 70 70.7071 14 1 1
15 8035 10:00:00 0 0.0000 15 1 3
16 8035 10:30:00 6 6.0606 16 1 4
17 8035 11:00:00 0 0.0000 17 1 5
18 8035 11:30:00 0 0.0000 18 1 6
19 8035 12:00:00 0 0.0000 19 1 7
20 8035 12:30:00 3 3.0303 20 1 8
21 8035 13:00:00 0 0.0000 21 1 9
22 8035 13:30:00 3 3.0303 22 1 10
23 8035 14:00:00 3 3.0303 23 1 11
24 8035 14:30:00 0 0.0000 24 1 12
25 8035 15:00:00 5 5.0505 25 1 13
26 8035 15:30:00 0 0.0000 26 0 0
27 8035 16:00:00 0 0.0000 27 0 0
28 8035 16:30:00 0 0.0000 28 0 0
So the algorithm in the end should look at the rows further then only the row next to the highest value if these are both 0.
I was now busy with creating chunks of it but im a but stuck..
runAlgorithm <- function(x, min_performance = 95) {
x$algorithm_column <- 0
x$iteration <- 0
it <- 0
all_index <- x$idgroup
next_comb <- all_index
inds <- next_comb[which.max(x$percent[next_comb])]
x$algorithm_column[inds] <- 1
x$iteration[inds] <- it
#While loop algorithm
while (sum(x$percent[x$algorithm_column == 1]) <= min_performance) {
prev_values <- x$percent[1:inds - 1]
next_values <- x$percent[inds + 1:length(x$percent)]
first_non_zero_prev <- if_else(sum(prev_values) > 0L, which.max(prev_values
> 0), NA)
first_non_zero_next <- if_else(sum(next_values) > 0L, which.max(next_values
> 0), NA)
next_value <- case_when(
is.na(first_non_zero_prev) & !is.na(first_non_zero_next) ~ next_comb[2],
!is.na(first_non_zero_prev) & is.na(first_non_zero_next) ~ next_comb[1],
first_non_zero_prev <= first_non_zero_next ~ next_comb[2],
first_non_zero_prev > first_non_zero_next ~ next_comb[1]
)
inds <- next_comb[which.max(x$percent[next_value])]
x$algorithm_column[inds] <- 1
x$iteration[inds] <- it
nos <- which(all_index == inds)
next_comb <- all_index[c(nos - 1, nos + 1)]
all_index <- setdiff(all_index, inds)
}
return(x)
}
df_test <- groep_test[1:48,]
output <- runAlgorithm(df_test)
回答1:
Here is one way to do it.
#Remaining index
all_index <- algorithm$idgroup
#Initialising to 0
algorithm$algorithm_column <- 0
#Index to check for maximum
next_comb <- all_index
#While more than 20% of the rows are remaining.
#Change this to whatever number you wish. For 95% use 0.05
while(sum(x$percent[x$algorithm_column == 1]) <= 95) {
#Get maximum index
inds <- next_comb[which.max(algorithm$percent[next_comb])]
#Change the value to 1
algorithm$algorithm_column[inds] <- 1
nos <- which(all_index == inds)
#Get the next two indices
next_comb <- all_index[c(nos - 1, nos + 1)]
#Remove the previously used index.
all_index <- setdiff(all_index, inds)
}
Since we have limited number of rows, this stops after filling 10 rows out of 12.
algorithm
# pc4 timeinterval stops percent idgroup algorithm_column
#1 5464 08:45:00 1 1.3889 1 0
#2 5464 09:00:00 5 6.9444 2 1
#3 5464 09:15:00 8 11.1111 3 1
#4 5464 09:30:00 7 9.7222 4 1
#5 5464 09:45:00 5 6.9444 5 1
#6 5464 10:00:00 10 13.8889 6 1
#7 5464 10:15:00 6 8.3333 7 1
#8 5464 10:30:00 4 5.5556 8 1
#9 5464 10:45:00 7 9.7222 9 1
#10 5464 11:00:00 6 8.3333 10 1
#11 5464 11:15:00 5 6.9444 11 1
#12 5464 11:30:00 8 11.1111 12 0
For multiple groups, we can split
the data based on pc4
and apply the same for each group.
algorithm$algorithm_column <- 0
output <- do.call(rbind, lapply(split(algorithm, algorithm$pc4), function(x) {
all_index <- x$idgroup
next_comb <- all_index
while(sum(x$percent[x$algorithm_column == 1]) <= 95) {
inds <- next_comb[which.max(x$percent[next_comb])]
x$algorithm_column[inds] <- 1
nos <- which(all_index == inds)
next_comb <- all_index[c(nos - 1, nos + 1)]
all_index <- setdiff(all_index, inds)
}
x
}))
回答2:
Here's a solution that is not based on a loop. Basically, it uses the cumsum()
to determine which rows cross the maximum_threshold
. The rowSums(matrix(...))
combines rows 5 and 7, then 4 and 8, etc. from your example.
Based on your comments you could add this to a dplyr
chain, including with group_by()
.
f_algo_return <- function(pct, max_threshold = 70){
# initialize return variable
algo <- vector(mode = 'integer', length = length(pct))
#make rows
max_row <- which.max(pct)
#if we have odd number of rows, we need to prevent subsetting pct[0]
len_out <- min(abs(max_row - c(1, length(pct))))
all_rows <- c(max_row,
(max_row - len_out):(max_row-1),
(max_row+1):(max_row + len_out)
)
#subset the pct
pct <- pct[all_rows]
thresh <- cumsum(c(pct[1], rowSums(matrix(pct[-1], ncol = 2)))) < max_threshold
sub_rows <- all_rows[c(thresh[1], rev(thresh[-1]), thresh[-1])]
#initialize and update new variable
algo[sub_rows] <- 1L
return(algo)
}
f_algo_return(DF[['percent']])
# [1] 0 0 1 1 1 1 1 1 1 0 0 0
data:
DF <- data.frame(pc4 = c(5464),
timeinterval = c('08:45:00', '09:00:00', '09:15:00', '09:30:00',
'09:45:00', '10:00:00', '10:15:00', '10:30:00', '10:45:00', '11:00:00',
'11:15:00', '11:30:00'),
stops = c(1, 5, 8, 7, 5, 10, 6, 4, 7, 6, 5, 8)) %>%
mutate(percent = round(stops/sum(stops), digits = 6)*100) %>%
mutate(idgroup = seq_along(timeinterval))
来源:https://stackoverflow.com/questions/57817324/create-while-loop-function-that-takes-next-largest-value-untill-condition-is-met