本次学习
1
import pandas as pd
from sklearn.model_selection import train_test_split

# Categorical columns with this many or more distinct values are excluded.
# The cutoff is a convenient but arbitrary choice.
MAX_CARDINALITY = 10

# Read the data.
data = pd.read_csv('E:/data_handle/melb_data.csv')

# Separate the target from the predictors.
y = data.Price
X = data.drop(['Price'], axis=1)

# Split the data into training and validation subsets (80% / 20%);
# random_state is fixed so the split is reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Drop every column that has a missing value in the training subset.
# The same columns are removed from the validation subset so both frames
# keep identical columns. Reassigning (instead of inplace=True) avoids
# mutating a frame that pandas may treat as a copy of a slice.
cols_with_missing = [
    col for col in X_train_full.columns if X_train_full[col].isnull().any()
]
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)

# "Cardinality" is the number of unique values in a column.
# Select categorical (object-dtype) columns with relatively low cardinality.
low_cardinality_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype == 'object'
    and X_train_full[cname].nunique() < MAX_CARDINALITY
]

# Select the numerical columns.
numerical_cols = [
    cname
    for cname in X_train_full.columns
    if X_train_full[cname].dtype in ['int64', 'float64']
]

# Keep only the selected columns.
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

# Print the first five rows of the training data.
print(X_train.head())