Question
I'm currently working on building an LSTM network to forecast time-series data using PyTorch. I tried to share all the code pieces that I thought would be helpful, but please feel free to let me know if there's anything further I can provide. I added some comments at the end of the post regarding what the underlying issue might be.
From the univariate time-series data indexed by date, I created 3 date features and split the data into training and validation sets as below.
# X_train
weekday monthday hour
timestamp
2015-01-08 17:00:00 3 8 17
2015-01-12 19:30:00 0 12 19
2014-12-01 15:30:00 0 1 15
2014-07-26 09:00:00 5 26 9
2014-10-17 20:30:00 4 17 20
... ... ... ...
2014-08-29 06:30:00 4 29 6
2014-10-13 14:30:00 0 13 14
2015-01-03 02:00:00 5 3 2
2014-12-06 16:00:00 5 6 16
2015-01-06 20:30:00 1 6 20
8256 rows × 3 columns
# y_train
value
timestamp
2015-01-08 17:00:00 17871
2015-01-12 19:30:00 20321
2014-12-01 15:30:00 16870
2014-07-26 09:00:00 11209
2014-10-17 20:30:00 26144
... ...
2014-08-29 06:30:00 9008
2014-10-13 14:30:00 17698
2015-01-03 02:00:00 12850
2014-12-06 16:00:00 18277
2015-01-06 20:30:00 19640
8256 rows × 1 columns
# X_val
weekday monthday hour
timestamp
2015-01-08 07:00:00 3 8 7
2014-10-13 22:00:00 0 13 22
2014-12-07 01:30:00 6 7 1
2014-10-14 17:30:00 1 14 17
2014-10-25 09:30:00 5 25 9
... ... ... ...
2014-09-26 12:30:00 4 26 12
2014-10-08 16:00:00 2 8 16
2014-12-03 01:30:00 2 3 1
2014-09-11 08:00:00 3 11 8
2015-01-15 10:00:00 3 15 10
2064 rows × 3 columns
# y_val
value
timestamp
2014-09-13 13:00:00 21345
2014-10-28 20:30:00 23210
2015-01-21 17:00:00 17001
2014-07-20 10:30:00 13936
2015-01-29 02:00:00 3604
... ...
2014-11-17 11:00:00 15247
2015-01-14 00:00:00 10584
2014-09-02 13:00:00 17698
2014-08-31 13:00:00 16652
2014-08-30 12:30:00 15775
2064 rows × 1 columns
Then, I transformed the values in the datasets by using MinMaxScaler from the sklearn library.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_arr = scaler.fit_transform(X_train)
X_val_arr = scaler.transform(X_val)
y_train_arr = scaler.fit_transform(y_train)
y_val_arr = scaler.transform(y_val)
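For reference, a variant with two separate scaler objects (just a sketch, reusing the same variable names) keeps the feature fit and the target fit independent, so refitting on the target never overwrites the feature scaling:
from sklearn.preprocessing import MinMaxScaler

# Sketch: one scaler per role, so each fit can also be reused later for inverse_transform
feature_scaler = MinMaxScaler()
target_scaler = MinMaxScaler()

X_train_arr = feature_scaler.fit_transform(X_train)   # fit on training features only
X_val_arr = feature_scaler.transform(X_val)
y_train_arr = target_scaler.fit_transform(y_train)    # separate fit for the target column
y_val_arr = target_scaler.transform(y_val)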
After converting these NumPy arrays into PyTorch Tensors, I created iterable datasets using the TensorDataset and DataLoader classes provided by PyTorch.
import torch
from torch.utils.data import TensorDataset, DataLoader

train_features = torch.Tensor(X_train_arr)
train_targets = torch.Tensor(y_train_arr)
val_features = torch.Tensor(X_val_arr)
val_targets = torch.Tensor(y_val_arr)

train = TensorDataset(train_features, train_targets)
train_loader = DataLoader(train, batch_size=64, shuffle=False)
val = TensorDataset(val_features, val_targets)
val_loader = DataLoader(val, batch_size=64, shuffle=False)
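For reference, a quick check on the first batch (just a sketch, not part of my actual script) shows what the loader yields:
# Sketch: the DataLoader yields 2-D feature batches and 2-D target batches
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)   # expected: torch.Size([64, 3]) torch.Size([64, 1])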
Then, I defined my LSTM Model and train_step functions as follows:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        # Hidden dimensions
        self.hidden_dim = hidden_dim
        # Number of hidden layers
        self.layer_dim = layer_dim
        # Building the LSTM
        # batch_first=True causes input/output tensors to be of shape
        # (batch_dim, seq_dim, feature_dim)
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
        # Initialize cell state
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
        # We need to detach as we are doing truncated backpropagation through time (BPTT)
        # If we don't, we'll backprop all the way to the start even after going through another batch
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        # Index the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out
def make_train_step(model, loss_fn, optimizer):
    # Builds a function that performs a step in the train loop
    def train_step(x, y):
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss
        loss = loss_fn(y, yhat)
        # Computes gradients
        loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss
        return loss.item()

    # Returns the function that will be called inside the train loop
    return train_step
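For reference, a small dummy forward pass (just a sketch, assuming the 3 features above and a sequence length of 1 per sample) shows the (batch, seq_len, features) layout the model expects because of batch_first=True:
# Sketch: dummy forward pass to confirm the expected input layout and output shape
check_model = LSTMModel(input_dim=3, hidden_dim=64, layer_dim=3, output_dim=1)
dummy = torch.randn(64, 1, 3)       # 64 samples, sequence length 1, 3 features
print(check_model(dummy).shape)     # expected: torch.Size([64, 1])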
Finally, I start training my LSTM model in mini-batches with the Adam optimizer for 20 epochs, which is already long enough to see that the model is not learning.
import numpy as np
import torch.optim as optim

input_dim = n_features
hidden_dim = 64
layer_dim = 3
output_dim = 1

model = LSTMModel(input_dim, hidden_dim, layer_dim, output_dim)
criterion = nn.MSELoss(reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=1e-2)

train_losses = []
val_losses = []
train_step = make_train_step(model, criterion, optimizer)

n_epochs = 20
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)  # keep the model on the same device as the batches

for epoch in range(n_epochs):
    batch_losses = []
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.unsqueeze(dim=0).to(device)
        y_batch = y_batch.to(device)
        loss = train_step(x_batch, y_batch)
        batch_losses.append(loss)
    training_loss = np.mean(batch_losses)
    train_losses.append(training_loss)

    with torch.no_grad():
        batch_val_losses = []
        for x_val, y_val in val_loader:
            x_val = x_val.unsqueeze(dim=0).to(device)
            y_val = y_val.to(device)
            model.eval()
            yhat = model(x_val)
            val_loss = criterion(y_val, yhat).item()
            batch_val_losses.append(val_loss)
        validation_loss = np.mean(batch_val_losses)
        val_losses.append(validation_loss)

    print(f"[{epoch+1}] Training loss: {training_loss:.4f}\t Validation loss: {validation_loss:.4f}")
And this is the output:
C:\Users\VS32XI\Anaconda3\lib\site-packages\torch\nn\modules\loss.py:446: UserWarning: Using a target size (torch.Size([1, 1])) that is different to the input size (torch.Size([64, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
[1] Training loss: 0.0505 Validation loss: 0.0315
[2] Training loss: 0.0317 Validation loss: 0.0315
[3] Training loss: 0.0317 Validation loss: 0.0315
[4] Training loss: 0.0317 Validation loss: 0.0315
[5] Training loss: 0.0317 Validation loss: 0.0315
[6] Training loss: 0.0317 Validation loss: 0.0315
[7] Training loss: 0.0317 Validation loss: 0.0315
[8] Training loss: 0.0317 Validation loss: 0.0315
[9] Training loss: 0.0317 Validation loss: 0.0315
[10] Training loss: 0.0317 Validation loss: 0.0315
[11] Training loss: 0.0317 Validation loss: 0.0315
[12] Training loss: 0.0317 Validation loss: 0.0315
[13] Training loss: 0.0317 Validation loss: 0.0315
[14] Training loss: 0.0317 Validation loss: 0.0315
[15] Training loss: 0.0317 Validation loss: 0.0315
[16] Training loss: 0.0317 Validation loss: 0.0315
[17] Training loss: 0.0317 Validation loss: 0.0315
[18] Training loss: 0.0317 Validation loss: 0.0315
[19] Training loss: 0.0317 Validation loss: 0.0315
[20] Training loss: 0.0317 Validation loss: 0.0315
Note 1: Looking at the warning given, I'm not sure if that's the real reason why the model is not learning well. After all, I'm trying to predict future values of the time series, so 1 would be a plausible output dimension.
Note 2: To train the model in mini-batches, I relied on the DataLoader class. When iterating over the X and Y batches in both the train and validation DataLoaders, the x batches had 2 dimensions while the model expected 3, so I used PyTorch's unsqueeze function to match the expected dimension, as in x_batch.unsqueeze(dim=0). I'm not sure whether this is how I should have gone about it, which could also be the issue.
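For reference, a short shape trace (just a sketch using the batch size of 64 and the 3 features from above) shows where the sizes in the warning come from:
# Sketch: with unsqueeze(dim=0) the model sees ONE sequence of length 64
x_batch = torch.randn(64, 3)            # what the DataLoader yields
yhat = model(x_batch.unsqueeze(dim=0))  # input (1, 64, 3) -> output shape (1, 1)
# the target y_batch has shape (64, 1), so MSELoss broadcasts (1, 1) against (64, 1),
# which is exactly the size mismatch reported in the warning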
Answer 1:
Once I used Tensor.view() to reshape the feature mini-batches in the training and validation sets, the issue was resolved. As a side note, view() enables fast and memory-efficient reshaping, slicing, and element-wise operations by avoiding an explicit data copy.
It turned out that in the earlier implementation torch.unsqueeze() did not reshape the batches into tensors with the dimensions (batch size, timesteps, number of features). Instead, unsqueeze(dim=0) returns a new tensor with a singleton dimension inserted at the 0th index, i.e. a single sequence of length batch_size.
So, the feature mini-batches are now reshaped as follows: x_batch = x_batch.view([batch_size, -1, n_features]).to(device)
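A minimal comparison (just a sketch, assuming batch_size = 64 and n_features = 3) makes the difference concrete:
x = torch.randn(64, 3)              # one mini-batch of features
print(x.unsqueeze(dim=0).shape)     # torch.Size([1, 64, 3])  -> 1 sequence of length 64
print(x.view([64, -1, 3]).shape)    # torch.Size([64, 1, 3])  -> 64 sequences of length 1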
Then, the new training loop becomes:
for epoch in range(n_epochs):
    batch_losses = []
    for x_batch, y_batch in train_loader:
        x_batch = x_batch.view([batch_size, -1, n_features]).to(device)  # <---
        y_batch = y_batch.to(device)
        loss = train_step(x_batch, y_batch)
        batch_losses.append(loss)
    training_loss = np.mean(batch_losses)
    train_losses.append(training_loss)

    with torch.no_grad():
        batch_val_losses = []
        for x_val, y_val in val_loader:
            x_val = x_val.view([batch_size, -1, n_features]).to(device)  # <---
            y_val = y_val.to(device)
            model.eval()
            yhat = model(x_val)
            val_loss = criterion(y_val, yhat).item()
            batch_val_losses.append(val_loss)
        validation_loss = np.mean(batch_val_losses)
        val_losses.append(validation_loss)

    print(f"[{epoch+1}] Training loss: {training_loss:.4f}\t Validation loss: {validation_loss:.4f}")
Here's the output:
[1] Training loss: 0.0235 Validation loss: 0.0173
[2] Training loss: 0.0149 Validation loss: 0.0086
[3] Training loss: 0.0083 Validation loss: 0.0074
[4] Training loss: 0.0079 Validation loss: 0.0069
[5] Training loss: 0.0076 Validation loss: 0.0069
...
[96] Training loss: 0.0025 Validation loss: 0.0028
[97] Training loss: 0.0024 Validation loss: 0.0027
[98] Training loss: 0.0027 Validation loss: 0.0033
[99] Training loss: 0.0027 Validation loss: 0.0030
[100] Training loss: 0.0023 Validation loss: 0.0028
Source: https://stackoverflow.com/questions/65596522/lstm-for-time-series-prediction-failing-to-learn-pytorch