How to make a Pareto chart in Python?

后端 未结 4 1493
野性不改
野性不改 2021-02-01 11:11

The Pareto chart is a very popular diagram in Excel and Tableau. In Excel we can easily draw a Pareto diagram, but I found no easy way to draw the diagram in Python.

I have a pandas d

相关标签:
4条回答
  • 2021-02-01 11:20

    You would probably want to create a new column with the percentage in it and plot one column as bar chart and the other as a line chart in a twin axes.

    import pandas as pd
    import matplotlib.pyplot as plt
    from matplotlib.ticker import PercentFormatter
    
    # Occurrence counts, labelled by the country each count belongs to.
    df = pd.DataFrame(
        {'country': [177.0, 7.0, 4.0, 2.0, 2.0, 1.0, 1.0, 1.0]},
        index=['USA', 'Canada', 'Russia', 'UK', 'Belgium', 'Mexico', 'Germany', 'Denmark'],
    )
    # A Pareto chart ranks the categories from largest to smallest.
    df = df.sort_values(by='country', ascending=False)
    # Running share of the grand total, expressed in percent.
    running_total = df["country"].cumsum()
    df["cumpercentage"] = running_total / df["country"].sum() * 100

    fig, ax = plt.subplots()

    # Left axis: one bar per country.
    ax.bar(df.index, df["country"], color="C0")
    ax.tick_params(axis="y", colors="C0")

    # Right axis (shares the x-axis with the bars): cumulative percentage line.
    ax2 = ax.twinx()
    ax2.plot(df.index, df["cumpercentage"], color="C1", marker="D", ms=7)
    ax2.yaxis.set_major_formatter(PercentFormatter())
    ax2.tick_params(axis="y", colors="C1")

    plt.show()
    

    0 讨论(0)
  • 2021-02-01 11:22

    pareto chart for pandas.dataframe

    import pandas as pd
    import matplotlib.pyplot as plt
    from matplotlib.ticker import PercentFormatter
    
    
    def _plot_pareto_by(df_, group_by, column):
        """Show a Pareto chart of ``column`` summed per ``group_by`` value.

        The rows of ``df_`` are grouped by ``group_by``, ``column`` is summed
        within each group, and the groups are drawn as bars from largest to
        smallest. A second y-axis carries the cumulative percentage line.
        The figure is displayed with ``plt.show()``; nothing is returned.
        """
        totals = (
            df_.groupby(group_by)[column]
            .sum()
            .reset_index()
            .sort_values(by=column, ascending=False)
        )
        # Running share of the grand total, in percent.
        totals["cumpercentage"] = totals[column].cumsum() / totals[column].sum() * 100

        fig, bar_axis = plt.subplots(figsize=(20, 5))
        bar_axis.bar(totals[group_by], totals[column], color="C0")

        # Twin axis shares x with the bars but has its own percentage scale.
        line_axis = bar_axis.twinx()
        line_axis.plot(totals[group_by], totals["cumpercentage"], color="C1", marker="D", ms=7)
        line_axis.yaxis.set_major_formatter(PercentFormatter())

        # Colour each y-axis like the series it measures.
        bar_axis.tick_params(axis="y", colors="C0")
        line_axis.tick_params(axis="y", colors="C1")

        # Slant the category labels so long names do not overlap.
        for label in bar_axis.get_xticklabels():
            label.set_rotation(45)
        plt.show()
    

    0 讨论(0)
  • 2021-02-01 11:25

    Another way is using the secondary_y parameter without using twinx():

    # Cumulative share of the total in percent.
    # NOTE(review): assumes df is indexed by category and already sorted in
    # descending order of 'country' -- confirm before reusing this snippet.
    country_counts = df['country']
    df['pareto'] = 100 * country_counts.cumsum() / country_counts.sum()

    fig, axes = plt.subplots()
    # Both series are drawn on the same axes object, using the index as x.
    shared_plot_args = dict(use_index=True, ax=axes)
    ax1 = df.plot(y='country', kind='bar', **shared_plot_args)
    # secondary_y=True puts the percentage line on its own right-hand scale.
    ax2 = df.plot(y='pareto', kind='line', marker='D', color="C1",
                  secondary_y=True, **shared_plot_args)
    # Leave headroom above 100 % so the last marker is not clipped.
    ax2.set_ylim(0, 110)
    

    The parameter use_index=True is needed because your index is your x axis in this case. Otherwise you could've used x='x_Variable'.

    0 讨论(0)
  • 2021-02-01 11:26

    A more generalized version of ImportanceOfBeingErnest's code:

    def create_pareto_chart(df, by_variable, quant_variable):
        """Draw a Pareto chart: descending bars plus a cumulative-percentage line.

        The figure is displayed with ``plt.show()``; nothing is returned.

        Args:
            df: Not used for plotting. Kept so existing callers keep working;
                unlike earlier versions it is no longer re-indexed or given a
                ``cumpercentage`` column.
            by_variable: Category labels, one per bar (for example a DataFrame
                column).
            quant_variable: Numeric quantity for each label, in the same order
                as ``by_variable``.
        """
        # Pair labels and quantities by position in a private frame. Going
        # through plain lists drops any pandas index, so the cumulative column
        # cannot be misaligned against the labels (which used to yield an
        # all-NaN line) and the caller's ``df`` is left untouched.
        pareto = pd.DataFrame({
            "by_var": list(by_variable),
            "quant_var": list(quant_variable),
        })
        # A Pareto chart ranks categories from largest to smallest; mergesort
        # is stable, so tied categories keep their input order.
        pareto = pareto.sort_values(by="quant_var", ascending=False, kind="mergesort")
        # Running share of the grand total, in percent.
        pareto["cumpercentage"] = (
            pareto["quant_var"].cumsum() / pareto["quant_var"].sum() * 100
        )

        fig, ax = plt.subplots()
        ax.bar(pareto["by_var"], pareto["quant_var"], color="C0")
        # Twin axis shares x with the bars but has its own percentage scale.
        ax2 = ax.twinx()
        ax2.plot(pareto["by_var"], pareto["cumpercentage"], color="C1", marker="D", ms=7)
        ax2.yaxis.set_major_formatter(PercentFormatter())

        # Colour each y-axis like the series it measures.
        ax.tick_params(axis="y", colors="C0")
        ax2.tick_params(axis="y", colors="C1")
        plt.show()
    

    This version also groups the tail according to a threshold. For example, if you set it to 70, the categories from the point where the cumulative percentage reaches 70% onwards are merged into a single group called "OTHERS".

    def create_pareto_chart(by_variable, quant_variable, threshold):
        """Draw a Pareto chart with the tail collapsed into one "OTHERS" bar.

        Categories are ranked from largest to smallest. Those whose cumulative
        percentage is strictly below ``threshold`` get their own bar; all
        remaining categories are summed into a final "OTHERS" bar whose
        cumulative percentage is 100. No "OTHERS" bar is drawn when nothing
        is left over. The figure is displayed with ``plt.show()``; nothing is
        returned.

        Args:
            by_variable: Category labels, one per input row.
            quant_variable: Numeric quantity for each label, in the same order
                as ``by_variable``.
            threshold: Cumulative percentage (0-100) at which grouping starts.
        """
        # Pair the inputs by position so differing pandas indexes cannot
        # misalign labels and quantities.
        df = pd.DataFrame({
            'by_var': list(by_variable),
            'quant_var': list(quant_variable),
        })
        # Rank first, then accumulate: the cumulative percentage is only
        # meaningful on the sorted order. mergesort keeps ties in input order.
        df = df.sort_values(by='quant_var', ascending=False, kind='mergesort')
        df["cumpercentage"] = df['quant_var'].cumsum() / df['quant_var'].sum() * 100

        below_threshold = df['cumpercentage'] < threshold
        head = df[below_threshold]
        tail = df[~below_threshold]

        labels = head['by_var'].tolist()
        quantities = head['quant_var'].tolist()
        cumulative = head['cumpercentage'].tolist()
        if not tail.empty:
            # The grouped bar closes the chart, so the running total is 100 %.
            labels.append('OTHERS')
            quantities.append(tail['quant_var'].sum())
            cumulative.append(100.0)

        fig, ax = plt.subplots()
        ax.bar(labels, quantities, color="C0")
        # Twin axis shares x with the bars but has its own percentage scale.
        ax2 = ax.twinx()
        ax2.plot(labels, cumulative, color="C1", marker="D", ms=7)
        ax2.yaxis.set_major_formatter(PercentFormatter())

        ax.tick_params(axis="x", colors="C0", labelrotation=70)
        ax.tick_params(axis="y", colors="C0")
        ax2.tick_params(axis="y", colors="C1")

        plt.show()
    
    0 讨论(0)
提交回复
热议问题