python数据分析库pandas使用之一

泪湿孤枕 提交于 2020-02-03 02:46:27

Day1

Pandas基本操作

titanic数据集: 密码:pje4

数据读取

import pandas as pd
df = pd.read_csv('data/titanic.csv')
#read_excel/read_json等
#.head()可以读取前几条数据
df.head(6)

在这里插入图片描述

#.info()返回当前的信息
df.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
'''
#返回当前索引
df.index
'''
RangeIndex(start=0, stop=891, step=1)
'''
#返回列名
df.columns
'''
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
'''
#返回类型
df.dtypes
'''
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
'''
#返回值
df.values
'''
array([[1, 0, 3, ..., 7.25, nan, 'S'],
       [2, 1, 1, ..., 71.2833, 'C85', 'C'],
       [3, 1, 3, ..., 7.925, nan, 'S'],
       ...,
       [889, 0, 3, ..., 23.45, nan, 'S'],
       [890, 1, 1, ..., 30.0, 'C148', 'C'],
       [891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)
'''

自己创建一个DataFrame结构

data = {'country':['a','b','c'],
        'population':[10,12,14]
       }
df_data = pd.DataFrame(data)
df_data

在这里插入图片描述

单独拿出数据集中的一列

age = df['Age']
age[:5]
#age是Series结构:DataFrame中的一行 / 一列

'''
0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64
'''

将其中一列的属性作为索引

df = df.set_index('Name')
df.head()

在这里插入图片描述

df['Age'][:5]
'''
Name
Braund, Mr. Owen Harris                                22.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer)    38.0
Heikkinen, Miss. Laina                                 26.0
Futrelle, Mrs. Jacques Heath (Lily May Peel)           35.0
Allen, Mr. William Henry                               35.0
Name: Age, dtype: float64
'''
age = df['Age']
age['Braund, Mr. Owen Harris']
#22.0
age = age + 5
age[:5]
'''
Name
Braund, Mr. Owen Harris                                27.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer)    43.0
Heikkinen, Miss. Laina                                 31.0
Futrelle, Mrs. Jacques Heath (Lily May Peel)           40.0
Allen, Mr. William Henry                               40.0
Name: Age, dtype: float64
'''
age.mean()
#34.69911764705882
age.std()
#14.526497332334042
#age.max() age.min()
#更全面直接的方式
df.describe()

在这里插入图片描述

Pandas索引结构

import pandas as pd
df = pd.read_csv('data/titanic.csv')
#.head()可以读取前几条数据
df.head(6)
df[['Age','Fare']][:5]

在这里插入图片描述

  • loc:用label(标签)来定位,切片时包含结束标签
  • iloc:用position(整数位置)来定位,切片时不包含结束位置
#iloc
#取前五行(0:5,即第0~4行),并且只取位置为1、2的两列(1:3,不含3)
df.iloc[0:5,1:3]

在这里插入图片描述

df1 = df.set_index('Name')
df1.loc['Braund, Mr. Owen Harris']
'''
PassengerId            1
Survived               0
Pclass                 3
Sex                 male
Age                   22
SibSp                  1
Parch                  0
Ticket         A/5 21171
Fare                7.25
Cabin                NaN
Embarked               S
Name: Braund, Mr. Owen Harris, dtype: object
'''

df1.loc['Braund, Mr. Owen Harris':'Allen, Mr. William Henry',:]

在这里插入图片描述

#赋值
df1 = df.set_index('Name')
df1.loc['Braund, Mr. Owen Harris','Fare'] = 1000
df1.head()

在这里插入图片描述

#bool类型索引
df = df.set_index('Name')
df['Fare'] > 40
'''
Name
Braund, Mr. Owen Harris                                False
Cumings, Mrs. John Bradley (Florence Briggs Thayer)     True
Heikkinen, Miss. Laina                                 False
Futrelle, Mrs. Jacques Heath (Lily May Peel)            True
Allen, Mr. William Henry                               False
                                                       ...  
Montvila, Rev. Juozas                                  False
Graham, Miss. Margaret Edith                           False
Johnston, Miss. Catherine Helen "Carrie"               False
Behr, Mr. Karl Howell                                  False
Dooley, Mr. Patrick                                    False
Name: Fare, Length: 891, dtype: bool
'''
df[df['Fare'] > 40]#见图1
df[df['Sex'] == 'male']#见图2

图1图2

df.loc[df['Sex']=='male','Age'].mean()#所有男性年龄的均值
#30.72664459161148
(df['Age']>70).sum() #年龄大于70岁的人数(比较表达式必须先加括号,再调用.sum())
#5
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!