Python学习09之分类变量

匿名 (未验证) 提交于 2019-12-02 22:51:30

  本次学习

  1

# Prepare the Melbourne housing data for the categorical-variable exercises:
# load the CSV, split off the `Price` target, make an 80/20 train/validation
# split, drop columns with missing values, and keep only low-cardinality
# categorical columns plus the numeric columns.
import pandas as pd
from sklearn.model_selection import train_test_split

# Upper bound (exclusive) on the number of unique values a categorical column
# may have to count as "low cardinality". The cut-off is convenient but
# arbitrary.
MAX_CARDINALITY = 10

# Read the data.
data = pd.read_csv('E:/data_handle/melb_data.csv')

# Separate the target from the predictors.
y = data.Price
X = data.drop(['Price'], axis=1)

# Split the data into training and validation subsets.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

# Drop every column that has a missing value in the training set. The same
# columns are removed from the validation set so both keep identical columns.
# Reassigning (rather than `inplace=True`) avoids mutating frames that were
# derived from `X` by the split.
cols_with_missing = [col for col in X_train_full.columns
                     if X_train_full[col].isnull().any()]
X_train_full = X_train_full.drop(cols_with_missing, axis=1)
X_valid_full = X_valid_full.drop(cols_with_missing, axis=1)

# "Cardinality" is the number of unique values in a column.
# Select categorical (object-dtype) columns with relatively low cardinality.
low_cardinality_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype == 'object'
    and X_train_full[cname].nunique() < MAX_CARDINALITY
]

# Select the numeric columns.
numerical_cols = [cname for cname in X_train_full.columns
                  if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep only the selected columns.
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

# Print the first five rows of the training data.
print(X_train.head())

    

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!