该项目来自kaggle比赛,
处理sales_train_v2.csv和test.csv
1、读取训练数据和测试数据:
# Load the daily sales log and the test set; the test set is indexed by its "ID" column.
train = pd.read_csv("./sales_train_v2.csv")
test = pd.read_csv("./test.csv", index_col="ID")
2、删除异常值
先画图查看"item_price"属性的分布:
# Scatter every row's item_price against its row position to eyeball outliers.
# The x axis is derived from the frame's actual length rather than a hard-coded
# row count, so the plot keeps working if rows were added or filtered.
index = range(len(train))
plt.scatter(index, train["item_price"])
plt.show()
属性"item_price"的分布如上,这里将大于100000的点视为异常值,将其剔除掉:
# Keep only rows whose price is below the 100000 outlier threshold.
price_ok = train["item_price"] < 100000
train = train.loc[price_ok]
再看"item_cnt_day"属性:
将大于1000视为异常值:
# Keep only rows whose daily sold count is below the 1000 outlier threshold.
count_ok = train["item_cnt_day"] < 1000
train = train.loc[count_ok]
3、将售价低于0的使用中值填充
# Median of the positive prices for shop 32 / item 2973 in month block 4.
# The original author notes that the reason for this specific filter is
# evident from the raw data set.
reference_rows = train[
    (train.shop_id == 32)
    & (train.item_id == 2973)
    & (train.date_block_num == 4)
    & (train.item_price > 0)
]
median = reference_rows.item_price.median()

# Replace every negative price with that median.
negative_price = train.item_price < 0
train.loc[negative_price, 'item_price'] = median
4、清除重复的行
# Drop rows that repeat on all six columns below, keeping the first occurrence.
dedup_columns = [
    "date",
    "date_block_num",
    "shop_id",
    "item_id",
    "item_price",
    "item_cnt_day",
]
train.drop_duplicates(subset=dedup_columns, keep="first", inplace=True)
5、有几家商店是彼此的复制品,修复训练集和测试集将其更改为同一商店编号
# Some shops are listed twice under different ids. Fold each duplicate id into
# its canonical id in BOTH train and test, so the later "keep only shops that
# appear in test" step compares like with like. (The original remapped
# shop 10 -> 11 only in train, leaving test inconsistent.)
#    0 -> 57: Yakutsk, Ordzhonikidze 56
#    1 -> 58: Yakutsk, "Tsentralny" shopping centre
#   10 -> 11: Zhukovsky, Chkalova St. 39m²
for duplicate_id, canonical_id in ((0, 57), (1, 58), (10, 11)):
    train.loc[train.shop_id == duplicate_id, 'shop_id'] = canonical_id
    test.loc[test.shop_id == duplicate_id, 'shop_id'] = canonical_id
6、清除训练集中在测试集中不存在的商店
# Inner-join on the distinct test shop ids so train keeps only shops present in test.
test_shops = test[['shop_id']].drop_duplicates()
train = train.merge(test_shops, how='inner')
处理shop.csv
# Read the shop table; `path` must already be defined (it is not set in this excerpt).
shops_csv = os.path.join(path, "shops.csv")
shops = pd.read_csv(shops_csv)
它长成这样:
它是俄文的,其中"shop_name"列同时包含了城市和商店经营类型两个特征,下面将它们分别提取出来。
1、增加特征:商店所在城市
# Join the two-word city name into a single token so that taking the first
# space-separated token below keeps the whole city name.
two_word_city = shops.shop_name == 'Сергиев Посад ТЦ "7Я"'
shops.loc[two_word_city, 'shop_name'] = 'СергиевПосад ТЦ "7Я"'

# The city is the first space-separated token of the shop name.
shops['city'] = [name.split(' ')[0] for name in shops['shop_name']]

# Normalise the variant spelled with a leading "!".
shops.loc[shops.city == '!Якутск', 'city'] = 'Якутск'

# Integer-encode the city names.
city_encoder = LabelEncoder()
shops['shop_city'] = city_encoder.fit_transform(shops['city'])
2、增加特征:商店经营类型
# Normalised shop name: lower-cased, punctuation and digits removed, trimmed.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a
# literal string by default; raw strings avoid invalid-escape warnings.
shops['shop_name1'] = (
    shops['shop_name']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    .str.strip()
)

# Shop-type tags in matching priority order: longer tags come first because
# they contain the shorter ones as substrings (e.g. 'мтрц' contains 'трц').
shop_type_priority = ('мтрц', 'трц', 'трк', 'тц', 'тк')
shops['shop_type'] = shops['shop_name1'].apply(
    lambda name: next((tag for tag in shop_type_priority if tag in name), 'NO_DATA')
)

# Replace each tag with its integer code.
shops["shop_type"] = shops["shop_type"].map(
    {'NO_DATA': 0, 'мтрц': 1, 'тк': 2, 'трк': 3, 'трц': 4, 'тц': 5}
)
3、更新数据
# Keep only the id and the two encoded features.
shop_feature_columns = ['shop_id', 'shop_city', 'shop_type']
shops = shops.loc[:, shop_feature_columns]
更新后的shop如下:
处理item_categories.csv
# Read the item-category table; `path` must already be defined (it is not set in this excerpt).
cats_csv = os.path.join(path, "item_categories.csv")
cats = pd.read_csv(cats_csv)
和前面一样,"item_category_name"中包含了两个信息:商品的两个所属类别。
# Split each category name on "-" into a main type and a subtype.
name_parts = cats['item_category_name'].str.split('-')
cats['split'] = name_parts

# Main type: the text before the first "-".
cats['type'] = [parts[0].strip() for parts in name_parts]
cats['item_type'] = LabelEncoder().fit_transform(cats['type'])

# Subtype: the text after the first "-", or the main type when there is no "-".
cats['subtype'] = [
    parts[1].strip() if len(parts) > 1 else parts[0].strip()
    for parts in name_parts
]
cats['item_subtype'] = LabelEncoder().fit_transform(cats['subtype'])

# Keep only the id and the two encoded features.
cats = cats.loc[:, ['item_category_id', 'item_type', 'item_subtype']]
提取之后长成这样:
处理items.csv
# Read the item table; `path` must already be defined (it is not set in this excerpt).
items = pd.read_csv(os.path.join(path, "items.csv"))

# Derive three text features from item_name:
#   name_1: text before the first "("
#   name_2: text after the first "["  (missing when there is no "[")
#   name_3: text after the first "("  (missing when there is no "(")
# `.str.get(i)` replaces tuple-unpacking of the `.str` accessor, which is not
# iterable in pandas 2.x; `n` is passed by keyword because it is keyword-only
# there as well.
bracket_parts = items['item_name'].str.split('[', n=1)
paren_parts = items['item_name'].str.split('(', n=1)
items['name_1'] = paren_parts.str.get(0)
items['name_2'] = bracket_parts.str.get(1)
items['name_3'] = paren_parts.str.get(1)

# Collapse every run of characters outside Latin/Cyrillic letters and digits
# into a single space, then lower-case. regex=True is explicit because
# pandas >= 2.0 treats the pattern as a literal string by default.
non_alnum = r'[^A-Za-z0-9А-Яа-я]+'
items['name_2'] = items['name_2'].str.replace(non_alnum, ' ', regex=True).str.lower()
items['name_3'] = items['name_3'].str.replace(non_alnum, ' ', regex=True).str.lower()

# Missing fragments become the placeholder string '0' before encoding.
items = items.fillna('0')

# Integer-encode each text feature independently.
items['name_1'] = LabelEncoder().fit_transform(items['name_1'])
items['name_2'] = LabelEncoder().fit_transform(items['name_2'])
items['name_3'] = LabelEncoder().fit_transform(items['name_3'])

items = items[['item_id', 'item_category_id', 'name_1', 'name_2', 'name_3']]
到此为止,数据集里的四个文件都已经初步处理完毕,接下来就开始整合这些数据集。
特征添加
再次回到train文件中
1、对于每个月,我们用该月出现过的所有商店和商品的唯一标识符的组合创建一个网格
# Build one row per (month, shop, item) combination, where the shops and items
# of each month are those that occur in that month's sales.
cols = ["date_block_num", "shop_id", "item_id"]
monthly_grids = []
for month in train["date_block_num"].unique():
    month_sales = train[train.date_block_num == month]
    grid = product(
        [month],
        month_sales.shop_id.unique(),
        month_sales.item_id.unique(),
    )
    monthly_grids.append(np.array(list(grid), dtype="int16"))

matrix = pd.DataFrame(np.vstack(monthly_grids), columns=cols)

# Downcast the key columns, then order the rows by month, shop and item.
matrix = matrix.astype(
    {"date_block_num": np.int8, "shop_id": np.int8, "item_id": np.int16}
)
matrix = matrix.sort_values(cols)
2、求每月在该商店该商品的售出量:item_cnt_month
# Sum the daily counts per (month, shop, item) to get the monthly sold count.
monthly_sales = train.groupby(['date_block_num', 'shop_id', 'item_id'])['item_cnt_day'].sum()
group = monthly_sales.rename('item_cnt_month').reset_index()
3、matrix和group合并
# Attach the monthly count to the grid; combinations with no sales get 0,
# and counts are clipped to the range [0, 20] before downcasting.
matrix = matrix.merge(group, on=cols, how='left')
monthly_cnt = matrix['item_cnt_month'].fillna(0)
matrix['item_cnt_month'] = monthly_cnt.clip(0, 20).astype(np.float16)
4、合并shops.csv、items.csv、cats.csv
# Left-join the shop, item and category features onto the grid, in that order
# (item_category_id arrives with the item table and is needed for the last join).
matrix = (
    matrix
    .merge(shops, on=['shop_id'], how='left')
    .merge(items, on=['item_id'], how='left')
    .merge(cats, on=['item_category_id'], how='left')
)

# The lookup tables are no longer needed; release them.
del shops, items, cats
gc.collect()
5、添加年、月以及当月天数特征
# Calendar features derived from the month block number (block 0 maps to year 2013).
month_block = matrix['date_block_num']

# True division followed by the int16 cast truncates to the whole year.
matrix['year'] = (month_block / 12 + 2013).astype(np.int16)

# Zero-based month within the year.
matrix['month'] = month_block % 12

# Days per month, indexed by the zero-based month (February fixed at 28).
days = pd.Series([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
matrix['days'] = matrix['month'].map(days).astype(np.int8)
6、将月销售量分别滞后1/2/3个月
def lag_feature(df, lags, col):
    """Return ``df`` with lagged copies of ``col`` appended as new columns.

    For every lag ``i`` in ``lags`` a column named ``<col>_lag_<i>`` is added.
    It holds the value ``col`` had for the same ``shop_id`` and ``item_id``
    ``i`` month blocks earlier, and is NaN where no such earlier row exists.

    Args:
        df: DataFrame with the columns ``date_block_num``, ``shop_id``,
            ``item_id`` and ``col``.
        lags: Iterable of integer lags, in month blocks.
        col: Name of the column to lag.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    keys = ['date_block_num', 'shop_id', 'item_id']
    base = df[keys + [col]]
    for lag in lags:
        shifted = base.copy()
        shifted.columns = keys + [col + '_lag_' + str(lag)]
        # Moving the source rows forward by `lag` blocks lines each value up
        # with the row that sits `lag` blocks after it.
        shifted['date_block_num'] += lag
        df = pd.merge(df, shifted, on=keys, how='left')
    return df
def tianjia1(list_names, list_num):
    """Add lagged group-mean features of ``item_cnt_month`` to the global ``matrix``.

    The mean of ``item_cnt_month`` is computed per group of ``list_names``,
    joined onto ``matrix`` under the name
    ``<name>_and_<name>..._avg_item_cnt``, lagged by every value in
    ``list_num`` via ``lag_feature``, and then the un-lagged mean column is
    dropped again so only the lag columns remain.

    Args:
        list_names: Column names to group by.
        list_num: Lags (in month blocks) passed on to ``lag_feature``.

    Returns:
        None. The module-level ``matrix`` is rebound to the extended frame.
    """
    global matrix
    feature = '_and_'.join(list_names) + '_avg_item_cnt'

    means = matrix.groupby(list_names).agg({'item_cnt_month': ['mean']})
    means.columns = [feature]
    means = means.reset_index()

    matrix = pd.merge(matrix, means, on=list_names, how='left')
    matrix[feature] = matrix[feature].astype(np.float16)
    matrix = lag_feature(matrix, list_num, feature)
    matrix = matrix.drop(columns=[feature])
# Lag the target itself by 1, 2 and 3 months.
matrix = lag_feature(matrix, [1, 2, 3], 'item_cnt_month')

# Lagged group means of the target: (grouping columns, lags), applied in order.
group_mean_specs = (
    (['date_block_num'], [1]),
    (['date_block_num', 'item_id'], [1, 2, 3]),
    (['date_block_num', 'shop_id'], [1, 2, 3]),
    (['date_block_num', 'item_category_id'], [1]),
    (['date_block_num', 'shop_id', 'item_category_id'], [1]),
    (['date_block_num', 'shop_id', 'item_id'], [1]),
)
for group_columns, group_lags in group_mean_specs:
    tianjia1(group_columns, group_lags)
未完待续……
来源:CSDN
作者:Macan_ML
链接:https://blog.csdn.net/weixin_42211626/article/details/103880859