第一遍代码
import os
#print(os.getcwd())
# Seed NumPy's random number generator so that every run of the code
# produces the same results.
# NOTE(review): `np` is not imported in the lines visible above this
# statement in the first snippet -- confirm `import numpy as np` exists.
np.random.seed(7)
# define the raw dataset
def load_cor(fname, content=None, target=None, rating=None, data_dir=None):
    """Load a ``.cor`` corpus file into a DataFrame.

    The file is read as consecutive groups of three lines: sentence
    content, target, rating. Trailing lines that do not complete a group
    of three are ignored. Every value is whitespace-stripped and kept as
    a string.

    Args:
        fname: File name without the ``.cor`` extension.
        content: Optional list to which the content lines are appended.
        target: Optional list to which the target lines are appended.
        rating: Optional list to which the rating lines are appended.
        data_dir: Directory containing the file. When omitted, the
            original hard-coded data directory is used.

    Returns:
        A DataFrame with the columns ``content``, ``target`` and
        ``rating``, one row per record.
    """
    # Fresh lists per call: list defaults in the signature are created
    # once and would keep accumulating rows across calls.
    content = [] if content is None else content
    target = [] if target is None else target
    rating = [] if rating is None else rating

    if data_dir is None:
        # Machine-specific default location, kept for backward compatibility.
        path = r'C:\Users\lujinyu\PycharmProjects\Attion\data\%s.cor' % fname
    else:
        path = os.path.join(data_dir, '%s.cor' % fname)

    with open(path) as f:
        sentences = f.readlines()
        print(f.name)
    print(len(sentences) / 3)

    # Only complete three-line records are used.
    record_count = len(sentences) // 3
    stripped = [line.strip() for line in sentences[:record_count * 3]]
    content.extend(stripped[0::3])
    target.extend(stripped[1::3])
    rating.extend(stripped[2::3])

    # Build one row per field, then transpose so each record becomes a row.
    df = pd.DataFrame([content, target, rating],
                      index=['content', 'target', 'rating'])
    return df.T
if __name__ == '__main__':
    # Load the training corpus and display the resulting table.
    train_frame = load_cor('train')
    print(train_frame)
第二遍代码
import numpy as np
import pandas as pd
import re
import os
# Show the current working directory.
print(os.getcwd())
# Seed NumPy's random number generator so that every run of the code
# produces the same results.
np.random.seed(7)
def load_cor(content=None, target=None, rating=None, fname='train.cor'):
    """Read a ``.cor`` corpus file into three parallel lists.

    The file is read as consecutive groups of three lines: sentence
    content, target, rating. Trailing lines that do not complete a group
    of three are ignored. Every value is whitespace-stripped and kept as
    a string.

    Args:
        content: Optional list to which the content lines are appended.
        target: Optional list to which the target lines are appended.
        rating: Optional list to which the rating lines are appended.
        fname: Path of the corpus file; defaults to ``train.cor`` in the
            current working directory.

    Returns:
        ``[content, target, rating]``.
    """
    # Fresh lists per call: list defaults in the signature are created
    # once and would keep accumulating rows across calls.
    content = [] if content is None else content
    target = [] if target is None else target
    rating = [] if rating is None else rating

    with open(fname) as f:
        sentences = f.readlines()
        print(f.name)
    print(len(sentences) / 3)

    # Only complete three-line records are used.
    record_count = len(sentences) // 3
    stripped = [line.strip() for line in sentences[:record_count * 3]]
    content.extend(stripped[0::3])
    target.extend(stripped[1::3])
    rating.extend(stripped[2::3])
    return [content, target, rating]
def transLabel(labels):
    """Convert string sentiment labels to integer class ids, in place.

    The mapping is ``'1' -> 2``, ``'0' -> 1`` and ``'-1' -> 0``. Any other
    value is reported on stdout and left unchanged.

    Args:
        labels: Mutable sequence of labels; it is modified in place.

    Returns:
        The same ``labels`` object that was passed in.
    """
    # All classes map to ints; the previous code stored the string '1'
    # for label '0', producing a mixed int/str label list.
    class_ids = {'1': 2, '0': 1, '-1': 0}
    for i, label in enumerate(labels):
        if label in class_ids:
            labels[i] = class_ids[label]
        else:
            print("label无效:", label)
    return labels
def changeListCode(b):
    """Return a new list holding every item of ``b`` decoded as UTF-8."""
    return [item.decode('utf8') for item in b]
if __name__ == '__main__':
    # load_cor returns [content, target, rating].
    df = load_cor()
    # transLabel rewrites the rating list in place and returns it.
    opinion = transLabel(df[2])
    content = df[0]
    # The vocabulary is built from both the sentences and their targets.
    lines = df[0] + df[1]

    # '+' yields only non-empty runs of ASCII letters, so no empty matches
    # need filtering. Whitespace is never part of a match, so scanning a
    # whole line finds the same words as scanning each split token.
    word_pattern = re.compile(r'[a-zA-Z]+')
    words = []
    for line in lines:
        words.extend(match.lower() for match in word_pattern.findall(line))

    print('Preprocessing...')
    # Count how often each word occurs. The result is not named `dict`,
    # which would shadow the builtin for the rest of the module.
    word_counts = pd.DataFrame(pd.Series(words).value_counts())
    print(word_counts)
    # The raw word list is no longer needed once the counts exist.
    del words
来源:CSDN
作者:芦金宇
链接:https://blog.csdn.net/ch1209498273/article/details/78274023