Day1
Pandas基本操作
数据读取
import pandas as pd
# Load the Titanic passenger data from a CSV file into a DataFrame.
df = pd.read_csv('data/titanic.csv')
# pandas offers sibling readers for other formats: read_excel / read_json, etc.
# .head() returns the first rows of the frame (here the first 6).
df.head(6)
# .info() prints a summary of the frame: index range, per-column non-null
# counts and dtypes, and memory usage (recorded output below).
df.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
'''
# The row index of the frame (a default RangeIndex right after read_csv).
df.index
'''
RangeIndex(start=0, stop=891, step=1)
'''
# The column names.
df.columns
'''
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
'''
# The dtype of each column.
df.dtypes
'''
PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object
dtype: object
'''
# The underlying values as an array (object dtype here, since the columns
# mix numbers and strings).
df.values
'''
array([[1, 0, 3, ..., 7.25, nan, 'S'],
[2, 1, 1, ..., 71.2833, 'C85', 'C'],
[3, 1, 3, ..., 7.925, nan, 'S'],
...,
[889, 0, 3, ..., 23.45, nan, 'S'],
[890, 1, 1, ..., 30.0, 'C148', 'C'],
[891, 0, 3, ..., 7.75, nan, 'Q']], dtype=object)
'''
自己创建一个DataFrame结构
# Build a DataFrame by hand from a plain dict: every key becomes a column
# name and the list stored under it supplies that column's values.
data = {}
data['country'] = ['a', 'b', 'c']
data['population'] = [10, 12, 14]

df_data = pd.DataFrame(data=data)
# Bare expression: shown as a table when it is the last line of a notebook cell.
df_data
单独拿出数据集中的一列
# Pull a single column out of the frame by its name.
age = df['Age']
# First five entries of that column (recorded output below).
age[:5]
# age is a Series: the structure that represents one row/column of a DataFrame.
'''
0 22.0
1 38.0
2 26.0
3 35.0
4 35.0
Name: Age, dtype: float64
'''
将其中一列的属性作为索引
# Use the 'Name' column as the row index; df is rebound to the re-indexed frame.
df = df.set_index('Name')
df.head()
# The Age column is now labelled by passenger name instead of 0..N.
df['Age'][:5]
'''
Name
Braund, Mr. Owen Harris 22.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer) 38.0
Heikkinen, Miss. Laina 26.0
Futrelle, Mrs. Jacques Heath (Lily May Peel) 35.0
Allen, Mr. William Henry 35.0
Name: Age, dtype: float64
'''
age = df['Age']
# Look up one passenger's age directly by the name label.
age['Braund, Mr. Owen Harris']
#22.0
# Arithmetic applies to every element: each age is shifted up by 5
# (compare the output below with the values above).
age = age + 5
age[:5]
'''
Name
Braund, Mr. Owen Harris 27.0
Cumings, Mrs. John Bradley (Florence Briggs Thayer) 43.0
Heikkinen, Miss. Laina 31.0
Futrelle, Mrs. Jacques Heath (Lily May Peel) 40.0
Allen, Mr. William Henry 40.0
Name: Age, dtype: float64
'''
# Statistics below are computed on the shifted ages (original + 5).
age.mean()
#34.69911764705882
age.std()
#14.526497332334042
# age.max() and age.min() are available in the same way.
# A more comprehensive and direct way to get summary statistics:
df.describe()
Pandas索引结构
import pandas as pd
# Reload the CSV so df starts again from the freshly read frame.
df = pd.read_csv('data/titanic.csv')
# .head() returns the first rows of the frame (here the first 6).
df.head(6)
# Select two columns with a list of names, then take the first five rows.
df[['Age','Fare']][:5]
- loc:用label来定位
- iloc:用position来定位
# iloc: position-based selection.
# Take five rows (positions 0:5), restricted to two columns (positions 1:3).
df.iloc[0:5,1:3]
# loc: label-based selection. Index by 'Name' first so rows can be
# addressed by passenger name.
df1 = df.set_index('Name')
# A single row label returns that row as a Series (recorded output below).
df1.loc['Braund, Mr. Owen Harris']
'''
PassengerId 1
Survived 0
Pclass 3
Sex male
Age 22
SibSp 1
Parch 0
Ticket A/5 21171
Fare 7.25
Cabin NaN
Embarked S
Name: Braund, Mr. Owen Harris, dtype: object
'''
# Label slice over rows, keeping all columns.
df1.loc['Braund, Mr. Owen Harris':'Allen, Mr. William Henry',:]
# Assignment: rebuild df1 from df, then overwrite one cell addressed by
# (row label, column label).
df1 = df.set_index('Name')
df1.loc['Braund, Mr. Owen Harris','Fare'] = 1000
df1.head()
# Boolean indexing.
# Re-index df itself by passenger name for the examples that follow.
df = df.set_index('Name')
# Comparing a column with a scalar yields a boolean Series, one flag per row
# (recorded output below).
df['Fare'] > 40
'''
Name
Braund, Mr. Owen Harris False
Cumings, Mrs. John Bradley (Florence Briggs Thayer) True
Heikkinen, Miss. Laina False
Futrelle, Mrs. Jacques Heath (Lily May Peel) True
Allen, Mr. William Henry False
...
Montvila, Rev. Juozas False
Graham, Miss. Margaret Edith False
Johnston, Miss. Catherine Helen "Carrie" False
Behr, Mr. Karl Howell False
Dooley, Mr. Patrick False
Name: Fare, Length: 891, dtype: bool
'''
# A boolean Series used as an indexer keeps only the rows flagged True.
# Passengers whose fare exceeds 40 (see figure 1).
df[df['Fare'] > 40]
# Male passengers only (see figure 2).
df[df['Sex'] == 'male']
# Mean age of all male passengers: the mask picks the rows, 'Age' the column.
df.loc[df['Sex'] == 'male', 'Age'].mean()
#30.72664459161148
# Number of passengers older than 70. The comparison must be parenthesized:
# a method call binds tighter than `>`, so `df['Age']>70.sum()` would try to
# call .sum() on the number 70 (and does not even parse). Summing the boolean
# mask counts its True entries.
(df['Age'] > 70).sum()
#5
来源:CSDN
作者:weiwen6933
链接:https://blog.csdn.net/weiwen6933/article/details/104142221