Version 1
# Take the index of the largest entry along the last axis of episode_a.
# episode_a is in shape [T, n_actions]
# NOTE(review): presumably each row is a one-hot action, making y the
# per-step action index -- confirm against the caller.
y = episode_a.argmax(-1)

# Run the network on the episode states.
# action_preds is logits before softmax
action_preds = self.net(ep_s)

# NOTE(review): the original snippet continues with a statement beginning
# `neg_log_lik` that is cut off in this view; restore it from the full source.