Split (explode) pandas dataframe string entry to separate rows

后端 未结 22 3598
一向
一向 2020-11-21 05:03

I have a pandas dataframe in which one column of text strings contains comma-separated values. I want to split each CSV field and create a new row per entry.

22条回答
  •  自闭症患者
    2020-11-21 05:32

    Upgraded MaxU's answer with MultiIndex support:

    def explode(df, lst_cols, fill_value='', preserve_index=False):
        """
        Expand list-valued cells of `df` into one row per list element.

        Every column named in `lst_cols` must hold, row by row, lists of the
        same length; the lengths are read from the first of those columns.
        All other columns are repeated once per element.  A row whose list is
        empty is kept as a single row with `fill_value` in the list columns.
        Output columns are the non-list columns (in the order returned by
        ``df.columns.difference``) followed by `lst_cols`; rows keep the
        order of the input frame.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to explode.  Its index may be a plain Index or a MultiIndex.
        lst_cols : label or list-like of labels
            Column(s) whose cells contain lists.
        fill_value : scalar, default ''
            Value written to the list columns for rows whose list is empty.
        preserve_index : bool, default False
            If True, each output row carries the index label (including
            MultiIndex levels and names) of the row it came from.  If False,
            the result gets a fresh 0..n-1 index.

        Returns
        -------
        pandas.DataFrame

        Raises
        ------
        ValueError
            If `lst_cols` is None or empty.

        usage:
            In [134]: df
            Out[134]:
               aaa  myid        num          text
            0   10     1  [1, 2, 3]  [aa, bb, cc]
            1   11     2         []            []
            2   12     3     [1, 2]      [cc, dd]
            3   13     4         []            []

            In [135]: explode(df, ['num','text'], fill_value='')
            Out[135]:
               aaa  myid num text
            0   10     1   1   aa
            1   10     1   2   bb
            2   10     1   3   cc
            3   11     2
            4   12     3   1   cc
            5   12     3   2   dd
            6   13     4
        """
        if lst_cols is None:
            raise ValueError("`lst_cols` must name at least one column")
        # make sure `lst_cols` is list-alike (a single label becomes a list)
        if not isinstance(lst_cols,
                          (list, tuple, np.ndarray, pd.Series, pd.Index)):
            lst_cols = [lst_cols]
        lst_cols = list(lst_cols)
        if not lst_cols:
            raise ValueError("`lst_cols` must name at least one column")

        # all columns except `lst_cols`
        idx_cols = df.columns.difference(lst_cols)
        # number of list elements in every row (from the first list column)
        lens = df[lst_cols[0]].str.len().values
        is_empty = lens == 0

        # Track rows by position instead of by index label: positions are
        # unique and sortable whatever the index type is, so a MultiIndex or
        # duplicated labels need no special handling until the very end.
        row_pos = np.arange(len(df))
        res = pd.DataFrame(
            {col: np.repeat(df[col].values, lens) for col in idx_cols},
            index=np.repeat(row_pos, lens),
        )
        for col in lst_cols:
            cells = df[col].values[~is_empty]
            # np.concatenate rejects an empty sequence, which happens when
            # every list in the column is empty
            res[col] = (np.concatenate(cells) if len(cells)
                        else np.array([], dtype=object))

        # add back the rows that have empty lists
        if is_empty.any():
            empty_pos = row_pos[is_empty]
            empty_rows = df[idx_cols].iloc[empty_pos].copy()
            empty_rows.index = empty_pos
            # fill only the list columns of these rows, so unrelated NaNs are
            # left alone and numeric list columns are not upcast to float
            for col in lst_cols:
                empty_rows[col] = fill_value
            # DataFrame.append was removed in pandas 2.0; use concat instead
            res = (empty_rows if res.empty
                   else pd.concat([res, empty_rows], sort=False))

        # restore the original row order; a stable sort keeps the elements
        # that came from one row in their original sequence
        res = res.sort_index(kind="mergesort")

        if preserve_index:
            # map positions back to labels; `take` keeps MultiIndex levels
            # and names exactly as they are on `df`
            res.index = df.index.take(res.index.values)
        else:
            res = res.reset_index(drop=True)
        return res
    

提交回复
热议问题