Data format: CSV
['content']: the review text
['score']: star rating (1-5), where 5 is best
Preprocessing: collapse the 1-5 star ratings into three sentiment classes: 1-2 stars → 'negative', 3 stars → 'neutral', 4-5 stars → 'positive'.
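For illustration only, the first rows of data/reviews.csv might look like this (the example reviews are made up; only the content and score columns come from the description above):
# content,score
# "Great app, works perfectly",5
# "Keeps crashing after the latest update",1
# "Does the job, nothing special",3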
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import torch.nn as nn
BATCH_SIZE = 16
MAX_LEN = 160
class_names = ['negative', 'neutral', 'positive']
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
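As a quick sanity check on the tokenizer (the sample sentence here is made up):
sample = tokenizer.encode_plus('The app is great!', add_special_tokens=True)
print(sample['input_ids'])                                   # token ids, wrapped in [CLS] ... [SEP]
print(tokenizer.convert_ids_to_tokens(sample['input_ids']))  # back to readable tokens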
df = pd.read_csv('data/reviews.csv')
def to_sentiment(score):
    # Map star ratings to 0-based class indices matching class_names:
    # 0 = 'negative' (1-2 stars), 1 = 'neutral' (3 stars), 2 = 'positive' (4-5 stars)
    score = int(score)
    if score <= 2:
        return 0
    elif score == 3:
        return 1
    else:
        return 2
df['sentiment'] = df.score.apply(to_sentiment)
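To see how the three classes are distributed after the mapping (the actual counts depend on the dataset):
print(df.sentiment.value_counts().sort_index())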
df_train, df_test = train_test_split(df, test_size=0.1, random_state=1)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=1)
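The two-step split leaves roughly 90% of the data for training and 5% each for validation and test; a quick check:
print(len(df_train), len(df_val), len(df_test))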
class ReviewsDataset(Dataset):
def __init__(self, reviews, targets, tokenizer, max_len):
self.reviews = reviews
self.targets = targets
self.tokenizer = tokenizer
self.max_len = max_len
self.len = len(self.reviews)
def __getitem__(self, item):
review = self.reviews[item]
target = self.targets[item]
encoding = self.tokenizer.encode_plus(
review,
add_special_tokens=True,
max_length=self.max_len,
return_token_type_ids=True,
padding='max_length',
truncation=True,
return_attention_mask=True,
return_tensors='pt')
return {
'review_text': review,
'input_ids': encoding['input_ids'].flatten(),
'attention_mask': encoding['attention_mask'].flatten(),
'token_type_ids' : encoding['token_type_ids'].flatten(),
'targets': torch.tensor(target, dtype=torch.long)
}
def __len__(self):
return self.len
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    ds = ReviewsDataset(
        reviews=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle)
# Only the training set is shuffled, so validation and test evaluation stay deterministic
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
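Pulling a single batch from the training loader confirms the tensor shapes, [BATCH_SIZE, MAX_LEN] for the encodings:
data = next(iter(train_data_loader))
print(data['input_ids'].shape)       # torch.Size([16, 160])
print(data['attention_mask'].shape)  # torch.Size([16, 160])
print(data['targets'].shape)         # torch.Size([16])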
model = BertModel.from_pretrained('bert-base-cased')
model = model.to(device)
# Run one batch through the bare BERT encoder just to inspect its outputs
model.eval()
with torch.no_grad():
    for data in train_data_loader:
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        targets = data['targets'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                        token_type_ids=token_type_ids, return_dict=True)
        print(outputs.keys())  # odict_keys(['last_hidden_state', 'pooler_output'])
        break  # one batch is enough for inspection
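BertModel on its own has no classification head. A minimal head over pooler_output might look like the sketch below; the SentimentClassifier name and the 0.3 dropout rate are assumptions, not part of the original post:
class SentimentClassifier(nn.Module):
    # Hypothetical classifier head: dropout + linear layer on BERT's pooled output
    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.drop = nn.Dropout(p=0.3)  # assumed dropout rate
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask,
                            token_type_ids=token_type_ids, return_dict=True)
        return self.out(self.drop(outputs.pooler_output))
clf = SentimentClassifier(len(class_names)).to(device)
logits = clf(input_ids, attention_mask, token_type_ids)  # reuses the last batch from the loop above
loss = nn.CrossEntropyLoss()(logits, targets)            # works because targets are 0/1/2
print(logits.shape, loss.item())                         # logits: torch.Size([16, 3])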
Source: oschina
Link: https://my.oschina.net/u/4228078/blog/4545771