How to get minimum of each group for each day based on hour criteria

前端 未结 4 1474
北海茫月
北海茫月 2020-12-22 01:29

I have given two dataframes below for you to test

df = pd.DataFrame({
    \'subject_id\':[1,1,1,1,1,1,1,1,1,1,1],
    \'time_1\' :[\'2173-04-03 12:35:00\',\'         


        
相关标签:
4条回答
  • 2020-12-22 02:16

    Try this

    from datetime import timedelta
    
    # Parse the timestamps and derive the calendar day used as the grouping key.
    df1['time_1'] = pd.to_datetime(df1['time_1'])
    df1['date'] = df1['time_1'].dt.date

    # Per-day diff gives the gap to the previous row of the same day; shifting
    # it up by one row turns it into the gap to the next row. The last row of
    # each day ends up with NaT because the next day's first diff is NaT.
    df1['t_d'] = df1.groupby(['date'])['time_1'].diff().shift(-1)

    # Rows followed by a gap of more than one hour. NaT compares as False, so
    # the last row of each day lands in the complement.
    mask = df1['t_d'] > pd.Timedelta(1, 'h')
    dfa = df1[mask]
    dfb = df1[~mask].groupby('date').first().reset_index()

    # Combine both candidate sets and keep a single row per day.
    df_f = dfa.merge(dfb, how='outer')
    df_f = df_f.drop_duplicates(subset='date', keep='first')
    df_f = df_f.drop(['date', 't_d'], axis=1)

    # sort_values returns a new frame; the result has to be assigned, otherwise
    # df_f itself stays unsorted.
    df_f = df_f.sort_values('time_1')
    df_f
    
    0 讨论(0)
  • 2020-12-22 02:25
    # Sample data: one subject with ten timestamped readings spread over three days.
    df = pd.DataFrame({
        'subject_id': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        'time_1': [
            '2173-04-03 12:35:00', '2173-04-03 17:00:00', '2173-04-03 20:00:00',
            '2173-04-04 11:00:00', '2173-04-04 11:30:00', '2173-04-04 12:00:00',
            '2173-04-04 16:00:00', '2173-04-04 22:00:00', '2173-04-05 04:00:00',
            '2173-04-05 06:30:00',
        ],
        'val': [5, 5, 5, 10, 5, 10, 5, 8, 8, 10],
    })

    # Separate date and time
    df['time_1'] = pd.to_datetime(df['time_1'])
    df['new_date'] = df['time_1'].dt.date
    df['new_time'] = df['time_1'].dt.time

    # Time elapsed since the first reading of the same subject and day.
    # transform('first') keeps the original row index, so the subtraction
    # aligns row by row regardless of how groupby.apply indexes its result.
    df['shift_val'] = df['val'].shift()
    first_in_day = df.groupby(['subject_id', 'new_date'])['time_1'].transform('first')
    df1 = df.assign(time_diff=df['time_1'] - first_in_day)

    # Reference value: the first `val` of the earliest date (groupby sorts the
    # dates, and .iloc[0] takes the first group positionally).
    # NOTE(review): this compares every day against the first day's value,
    # not against each day's own first value - confirm that is intended.
    hours_elapsed = df1['time_diff'] / np.timedelta64(1, 'h')
    reference_val = df1.groupby('new_date')['val'].first().iloc[0]

    # At least one hour into the day and equal to the reference value ...
    df2 = df1.loc[(hours_elapsed >= 1) & (df1['val'] == reference_val)]
    # ... or within the first hour and unchanged from the previous row.
    df3 = df1.loc[(hours_elapsed <= 1) & (df1['val'] == df1['shift_val'])]

    # Get the minimum within each day. DataFrame.append was removed in
    # pandas 2.0, so the two selections are stacked with pd.concat. The helper
    # columns are dropped first so that min() only sees the result columns.
    df4 = (
        pd.concat([df2, df3])
        .drop(columns=['new_time', 'shift_val', 'time_diff'])
        .groupby('new_date', sort=False)
        .min()
    )

    df4
    

    Output

              subject_id    time_1     val
    new_date            
    2173-04-03  1   2173-04-03 17:00:00 5
    2173-04-04  1   2173-04-04 16:00:00 5
    2173-04-05  1   2173-04-05 04:00:00 8
    
    0 讨论(0)
  • 2020-12-22 02:28

    Try this.

    from datetime import timedelta
    
    def f(x):
        """Return (first element - last element) of ``x`` in whole minutes.

        ``x`` is indexed positionally via ``.iloc``; the difference is
        floor-divided by one minute, so a single-element input yields 0.
        """
        first, last = x.iloc[0], x.iloc[-1]
        return (first - last) // timedelta(minutes=1)
    # Parse the timestamps, then give every run of consecutive equal `val`
    # readings its own integer label (the label increments whenever val changes).
    df1['time_1'] = pd.to_datetime(df1['time_1'])
    df1['flag'] = (df1['val'].diff() != 0).cumsum()

    # Minutes between the first and last reading of each run (see f).
    df1['t_d'] = df1.groupby('flag')['time_1'].transform(f)
    df1['date'] = df1['time_1'].dt.date

    # Runs with a non-zero duration versus runs whose first and last timestamps coincide.
    mask = df1['t_d'] != 0
    dfa = df1[mask].groupby(['flag', 'date']).first().reset_index()
    dfb = df1[~mask].groupby('date').first().reset_index().dropna(how='any')

    # Combine both candidate sets, keep one row per day and drop the helper columns.
    df_f = dfa.merge(dfb, how='outer')
    df_f = df_f.drop_duplicates(subset='date', keep='first')
    df_f = df_f.drop(columns=['flag', 'date', 't_d'])
    df_f
    

    Output.

     subject_id     time_1         val
    0   1   2173-04-03 12:35:00     5
    1   1   2173-04-04 11:30:00     5
    2   1   2173-04-05 16:00:00     5
    5   1   2173-04-06 04:00:00     3
    
    0 讨论(0)
  • 2020-12-22 02:34

    I came up with the approach below, and it works. Any suggestions are welcome.

    # Time left until midnight for every row. It fills in the duration of the
    # last reading of each day, which has no later reading on the same day.
    s = pd.to_timedelta(24, unit='h') - (df.time_1 - df.time_1.dt.normalize())

    # Per-day diff shifted up by one row = gap from each reading to the next
    # reading of the same day; the NaT left on each day's last row is filled from s.
    df['tdiff'] = df.groupby(df.time_1.dt.date).time_1.diff().shift(-1).fillna(s)
    df['t_d'] = df['tdiff'].dt.total_seconds() / 3600  # duration in hours
    df['hr'] = df['time_1'].dt.hour
    df['date'] = df['time_1'].dt.date
    df['day'] = pd.DatetimeIndex(df['time_1']).day

    # Total duration and number of readings of each val per subject, day and
    # hour. sort=False keeps the groups in order of first appearance.
    # Named aggregation replaces the dict-renaming form of agg(), which raises
    # SpecificationError on pandas >= 1.0.
    temp_1 = (
        df.groupby(['subject_id', 'date', 'hr', 'val'], sort=False)['t_d']
        .agg(cumduration='sum', freq='count')
        .reset_index()
    )

    # Remove the hour component: sum the durations of the same val across the
    # different hours of one day (e.g. a val seen in the 12th and 13th hour).
    temp_2 = (
        temp_1.groupby(['subject_id', 'date', 'val'], sort=False)['cumduration']
        .agg(sum_of_cumduration='sum', freq='count')
        .reset_index()
    )

    # Mask for the "> 1 hour" criterion. The comparison is elementwise, so no
    # groupby is needed and the mask keeps temp_2's row index.
    mask = temp_2['sum_of_cumduration'] > 1
    output_1 = temp_2[mask].groupby(['subject_id', 'date'])['val'].min().reset_index()

    # Minimum val among the records that did not exceed one hour.
    output_2 = temp_2[~mask].groupby(['subject_id', 'date'])['val'].min().reset_index()

    # Use the fallback rows only for (subject_id, date) pairs that have no
    # "> 1 hour" result. Checking subject_id alone would discard every fallback
    # day of a subject as soon as that subject had one qualifying day.
    keys_1 = pd.MultiIndex.from_frame(output_1[['subject_id', 'date']])
    keys_2 = pd.MultiIndex.from_frame(output_2[['subject_id', 'date']])
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames instead.
    output = pd.concat([output_1, output_2[~keys_2.isin(keys_1)]])

    output
    

    0 讨论(0)
提交回复
热议问题