PyTorch学习笔记 | 易学教程

【PyTorch深度学习60分钟快速入门 】Part1：PyTorch是什么？来源：https://www.cnblogs.com/leejack/p/8370634.html

import torch 
import numpy as np #用于替代NumPy（torch对象叫张量，带前缀和括号，并用逗号分隔，numpy对象叫数组，用空格分隔），#torch可以使用GPU的计算力,一种深度学习研究平台，可以提供最大的灵活性和速度
# Allocate a 5x3 matrix without initialising it (legacy constructor;
# torch.empty(5, 3) is the equivalent factory function). The author notes
# that the lowercase spelling torch.tensor(5, 3) raises an error.
x = torch.Tensor(5, 3)
#print(x)
# Randomly initialised 5x3 matrix; rand draws uniformly from [0, 1).
x = torch.rand(5, 3)
#print(x)
#print(x.size())
y = torch.rand(5, 3)
#print(x + y)
#print(torch.add(x, y))
result = torch.Tensor(5, 3)
#print(result)
# Write the sum into a pre-allocated output tensor.
torch.add(x, y, out=result)
#print(result)
# In-place add. Every operation that mutates a tensor in place carries a
# trailing underscore, e.g. x.copy_(y) and x.t_() both modify x itself.
y.add_(x)
#print(y)
x = torch.randn(4, 4)
# view reshapes a tensor, comparable to numpy's reshape.
y = x.view(16)
# A size of -1 is inferred from the other dimensions; here it resolves to 2,
# as the printed size (2, 8) shows.
z = x.view(-1, 8)
print(y,z)
print(x.size(), y.size(), z.size())

# torch.ones mirrors numpy.ones.
a = torch.ones(5)
print("a:",a)
# Convert a torch tensor into a numpy array.
b = a.numpy()
print("b:",b)

a = np.ones(5)
# Convert a numpy array into a torch tensor. There is no a.torch() method;
# torch.from_numpy(...) is the way to do it.
b = torch.from_numpy(a)

# All CPU tensors except CharTensor support conversion to NumPy and back.
# Add 1 to a in place; b reflects the change in the output printed below.
np.add(a, 1, out=a)
print('a:',a)
print('b:',b)
# Tensors are moved to the GPU with .cuda().
if torch.cuda.is_available():
    x = x.cuda()
    y = y.cuda()
    # x is 4x4 while y is its flat 16-element view, so the two cannot be
    # broadcast together; reshape y back to 4x4 before adding, and print the
    # sum so the GPU computation is observable.
    print(x + y.view(4, 4))
else:
    print('cuda is not available')

tensor([ 0.3856, -0.3865, -0.5026,  0.8776, -2.5368, -1.7295,  0.0219,  1.2241,
         1.4038, -0.8838, -0.1019,  2.1651, -0.3457, -0.5027,  0.0651,  0.1814]) tensor([[ 0.3856, -0.3865, -0.5026,  0.8776, -2.5368, -1.7295,  0.0219,  1.2241],
        [ 1.4038, -0.8838, -0.1019,  2.1651, -0.3457, -0.5027,  0.0651,  0.1814]])
torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])
a: tensor([1., 1., 1., 1., 1.])
b: [1. 1. 1. 1. 1.]
a: [2. 2. 2. 2. 2.]
b: tensor([2., 2., 2., 2., 2.], dtype=torch.float64)
cuda is not available

#【PyTorch深度学习60分钟快速入门】Part2：Autograd自动化微分，

#在PyTorch中，集中于所有神经网络的是autograd包。
#autograd.Variable是autograd包的核心类，它封装了一个张量，并支持几乎所有在该张量上定义的操作。
#一旦完成了你的计算，你可以调用.backward()，它会自动计算所有梯度。你可以通过.data属性访问原始的张量，而梯度w.r.t.这个变量被累积到.grad。
#还有一个类对于autograd的实现非常重要——一个函数。变量和函数是相互联系的，并建立一个非循环图，它编码了计算的一个完整历史。
#每个变量都有一个.grad_fn属性，该属性引用了一个创建了该变量的函数（除了由用户创建的变量之外，它们的grad_fn是None）。
#如果你想计算导数，你可以在一个变量上调用.backward()。如果变量是一个标量（也就是说它包含一个元素数据），那么你不需要为backward()指定任何参数，
#但是如果它是矢量，有更多元素，那么你需要指定一个grad_output参数，该参数是一个匹配形状的张量。

import torch
from torch.autograd import Variable
# Create a Variable that tracks gradients.
x = Variable(torch.ones(2, 2), requires_grad=True)
y = x + 2 
# Apply further operations to y.
z = y * y * 3
out = z.mean()
# In the printed output x carries its requires_grad flag, while y, z and out
# each carry the grad_fn of the operation that produced them.
print(x,y,z,out)
# y, z and out were created by operations, so each one has its own grad_fn
# (see the printed object names and addresses).
print(y.grad_fn,z.grad_fn,out.grad_fn)

# Backpropagation.
# backward() differentiates using the chain rule. A scalar can be
# differentiated directly; for a non-scalar tensor y an explicit gradient
# argument with the same shape as y has to be supplied.
# Author's notes on the two commented-out experiments:
#y.backward(torch.Tensor(2,2)) # noted as usable together with out/z without conflict
#z.backward(torch.Tensor(2,2)) # noted as raising an error when combined with out.backward()
# out is a scalar, so out.backward() is equivalent to
# out.backward(torch.Tensor([1.0])). Open question from the author: passing
# torch.Tensor([1.0, 1.0]) instead was observed to add 4.5 to each result.
out.backward()

# Gradient d(out)/dx. out = mean(3 * (x + 2)**2) over 4 elements, so
# d(out)/dx_i = 3 * (x_i + 2) / 2, which is 4.5 at x_i = 1.
print(x.grad)
print("*********************************************************")

#backward函数中还有retain_graph参数 ，使用retain_graph参数的时候，再次求导的时候，会对之前的导数进行累加
#如果不设置retain_graph（默认为None），计算图在第一次backward后会被释放，再次反向求导时会报错

# Leaf variable whose gradient we want.
x = Variable(torch.Tensor([1, 5, 6, 10, 16]), requires_grad=True)
y = x * x

# Three weightings for dy/dx = 2x. Because retain_graph=True keeps the graph
# alive, backward() can be called repeatedly, and x.grad accumulates the sum
# of every call so far.
weights0 = torch.ones(5)
weights1 = torch.FloatTensor([0.1] * 5)
weights2 = torch.FloatTensor([0.5, 0.1, 0.1, 0.1, 0.2])

for grad_weights in (weights0, weights1, weights2):
    y.backward(grad_weights, retain_graph=True)
    print(x.grad)

print("*********************************************************")
# You can do many crazy things with gradients!
# Three samples from the standard normal distribution (mean 0, std 1),
# as a 1-D tensor.
x = torch.randn(3)
x = Variable(x, requires_grad=True)
y = x * 2
# norm() with no argument is the 2-norm (see the norm notes further down).
while y.data.norm() < 1000:
    # Each pass doubles y, so after the loop y = x * 2**n for some n.
    y = y * 2
print(y)
# x has three elements, so three gradient weights are required.
gradients = torch.FloatTensor([0.1, 1.0, 10])
y.backward(gradients)
# dy/dx is the accumulated power of two, so x.grad = gradients * 2**n.
print(x.grad)

print("*********************************************************")
#范数(norm)是数学中的一种基本概念。在泛函分析中，它定义在赋范线性空间中，并满足一定的条件，即①非负性；②齐次性；③三角不等式。
#它常常被用来度量某个向量空间（或矩阵）中的每个向量的长度或大小。
#常用的三种p-范数推导出的矩阵范数：
#1-范数：║A║1 = max{ ∑|ai1|，∑|ai2|，……，∑|ain| } （列和范数，A每一列元素绝对值之和的最大值）
#（其中∑|ai1|第一列元素绝对值的和∑|ai1|=|a11|+|a21|+...+|an1|，其余类似）；
#2-范数：║A║2 = A的最大奇异值 = (max{ λi(AH*A) }) 1/2 （谱范数，即A^H*A特征值λi中最大者λ1的平方根，其中AH为A的转置共轭矩阵）；
#∞-范数：║A║∞ = max{ ∑|a1j|，∑|a2j|,...，∑|amj| } （行和范数，A每一行元素绝对值之和的最大值）（其中∑|a1j| 为第一行元素绝对值的和，其余类似）；
#其它的p-范数则没有很简单的表达式。
# --- Norm over the whole tensor -------------------------------------------
a = torch.ones(2, 3)
# p=1: sum of absolute values of all elements (same as a.data.norm(p=1)).
a1 = torch.norm(a, p=1)
# Default: 2-norm over all elements (same as a.data.norm(p=2)).
a2 = torch.norm(a)
for value in (a, a1, a2):
    print(value)

# --- Norm along one dimension ---------------------------------------------
# norm needs a floating-point tensor, hence .float(); a has shape 2x4.
a = torch.tensor([[1, 2, 3, 4], [1, 2, 3, 4]]).float()
# dim=0 reduces over the rows: one 2-norm per column (4 values).
a0 = torch.norm(a, p=2, dim=0)
# dim=1 reduces over the columns: one 2-norm per row (2 values).
a1 = torch.norm(a, p=2, dim=1)
for value in (a0, a1):
    print(value)

# --- keepdim --------------------------------------------------------------
# keepdim=True keeps the reduced dimension with size 1; keepdim=False (the
# default) drops it, so the 3-D input yields a 2-D result.
a = torch.rand(2, 3, 4)
at = torch.norm(a, p=2, dim=1, keepdim=True)
af = torch.norm(a, p=2, dim=1, keepdim=False)
for tensor in (a, at, af):
    print(tensor.shape)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True) tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>) tensor([[27., 27.],
        [27., 27.]], grad_fn=<MulBackward0>) tensor(27., grad_fn=<MeanBackward1>)
<AddBackward0 object at 0x00000000061D8278> <MulBackward0 object at 0x000000000995FC50> <MeanBackward1 object at 0x000000000995FDA0>
tensor([[4.5000, 4.5000],
        [4.5000, 4.5000]])
*********************************************************
tensor([ 2., 10., 12., 20., 32.])
tensor([ 2.2000, 11.0000, 13.2000, 22.0000, 35.2000])
tensor([ 3.2000, 12.0000, 14.4000, 24.0000, 41.6000])
*********************************************************
tensor([-810.8790, -148.9206, -800.3726], grad_fn=<MulBackward0>)
tensor([  51.2000,  512.0000, 5120.0000])
*********************************************************
tensor([[1., 1., 1.],
        [1., 1., 1.]])
tensor(6.)
tensor(2.4495)
tensor([1.4142, 2.8284, 4.2426, 5.6569])
tensor([5.4772, 5.4772])
torch.Size([2, 3, 4])
torch.Size([2, 1, 4])
torch.Size([2, 4])

#【PyTorch深度学习60分钟快速入门】Part3：神经网络
#神经网络可以通过使用torch.nn包来构建。
#既然你已经了解了autograd，而nn依赖于autograd来定义模型并对其求微分。一个nn.Module包含多个网络层，以及一个前向函数forward(input)
#例如，查看下图中，对图片分类的网络：图片参见https://www.cnblogs.com/leejack/p/8387771.html
#这是一个简单的前馈网络。它接受输入，并将输入依次通过多个层，然后给出输出结果。对于神经网络来说，一个经典的训练过程包括以下步骤：
#定义一个包含一些可学习的参数（或权重）的神经网络，对输入数据集进行迭代，通过网络处理输入，计算损失函数（即输出距离正确值差多远）
#将梯度传播回网络参数，更新网络的权重，通常使用一个简单的更新规则：weight = weight - learning_rate * gradient
#下面，我们定义该网络：

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """LeNet-style network: two conv/pool stages followed by three linear layers.

    fc1 expects 16 * 5 * 5 features, which is what a single-channel 32x32
    input produces after two 5x5 convolutions and two 2x2 max-pools
    (32 -> 28 -> 14 -> 10 -> 5). The backward pass is derived by autograd.
    """

    def __init__(self):
        super().__init__()
        # 1 input channel -> 6 feature maps, 5x5 kernel (each side shrinks by 4).
        self.conv1 = nn.Conv2d(1, 6, 5)
        # 6 -> 16 feature maps, 5x5 kernel.
        self.conv2 = nn.Conv2d(6, 16, 5)
        # Affine layers y = Wx + b; W and b are learnable parameters.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run a batch through the network and return a (batch, 10) tensor."""
        # Stage 1: convolution -> ReLU -> 2x2 max-pool (halves height and width).
        activated = F.relu(self.conv1(x))
        pooled = F.max_pool2d(activated, (2, 2))
        # Stage 2: same pattern; a square window may be given as one number.
        activated = F.relu(self.conv2(pooled))
        pooled = F.max_pool2d(activated, 2)
        # Flatten everything except the batch dimension; -1 infers the batch size.
        flat = pooled.view(-1, self.num_flat_features(pooled))
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        # The final layer is left linear (no activation).
        return self.fc3(hidden)

    def num_flat_features(self, x):
        """Return the number of elements per sample (product of all dims but the first)."""
        count = 1
        for extent in x.size()[1:]:
            count *= extent
        return count

# Build the network and print its layer summary.
net = Net()
print(net)

print("***********************************************************")

def num_flat_features(x):
    """Return the number of elements per sample in ``x``.

    The first dimension is treated as the batch dimension and skipped; the
    remaining dimension sizes are multiplied together. For example, a tensor
    of size (2, 3, 4) has x.size()[1:] == (3, 4) and yields 12.
    """
    size = x.size()[1:]  # all dimensions except the batch dimension
    num_features = 1
    for s in size:
        num_features *= s
    # The original returned the undefined name `num_feature` (NameError).
    return num_features
    
# Demonstrate flattening on a random tensor of size (2, 3, 5).
x=torch.rand(2,3,5)
# view() returns a new tensor; the result is discarded here, so x is unchanged.
x.view(-1,num_flat_features(x))
# Print the tensor, its per-sample feature count and its size.
print(x,num_flat_features(x),x.size())
# Print the flattened 2-D view: one row per sample.
print(x.view(-1,num_flat_features(x)))

print("***********************************************************")
# The learnable parameters of a model are returned by net.parameters().
params = list(net.parameters())
# 10 parameter tensors in total: a weight and a bias for each of the five
# layers (the count is unrelated to the 10 values the network outputs).
print(len(params))
# params[1] is the bias of conv1.
print(params[1])  
#params[0].size()四维[6,1,5,5]，表示6个大组，每个大组有1个小组（增加一个方括号而已），每小组内有25个(5*5)数据conv1's .weight， 每次运行值不一样，值是如何设定的？
#params[1].size()一维[6]，表示6个值，tensor([ 0.1707,  0.1464, -0.0306, -0.1523, -0.1115,  0.0629],requires_grad=True)值每次运行也会变，值是如何设定的？
#params[2].size()四维[16,6,5,5]，表示16个大组，每个大组有6个小组，每小组内有25个(5*5)数据conv2's .weight，每次运行值不一样，值是如何设定的？
#params[3].size()一维[16]，表示16个值，tensor([-0.0175,  0.0498, -0.0686,  ...， -0.0419],requires_grad=True),每次运行值不一样，值是如何设定的？
#params[4].size()二维[120,400]，表示120个组，每组400个数据的列表:W1
#params[5].size()一维[120]，表示120个值:b1
#params[6].size()二维[84,120],表示84个组，每组120个数据的列表:W2
#params[7].size()一维[84],表示84个值:b2
#params[8].size()二维[10,84],表示10个组，每组84个数据的列表:W3
#params[9].size()一维[10],表示10个值:b3，tensor([-0.0595, -0.0891, 0.0139, ..., -0.0213], requires_grad=True),每次运行值不一样

print("***********************************************************")
# The input to forward is an autograd.Variable, and so is the output.
# Note: this network (LeNet) expects 32x32 inputs. To use it on MNIST the
# images have to be resized to 32x32 first.
# NOTE(review): the name `input` shadows the Python builtin of the same name.
input = Variable(torch.randn(1, 1, 32, 32))
out = net(input)
print(out)

print("***********************************************************")
# Zero the gradient buffers of all parameters, then backpropagate a random
# gradient whose shape (1, 10) matches the output.
net.zero_grad()
out.backward(torch.randn(1, 10))
#注意： torch.nn只支持小批量，整个torch.nn包都只支持小批量样本的输入，而不支持单个样本。
#例如，nn.Conv2d将接受一个4维的张量nSamples x nChannels x Height x Width。
#如果你只有单个样本，那么只需要使用input.unsqueeze(0)来添加一个假的批量维度。
#简要回顾：
#torch.Tensor：一个多维数组。
#autograd.Variable：封装了一个张量和对该张量操作的记录历史。除了与张量具有相同的API外，还拥有一些如backward()等的操作。
#此外，还持有对张量的梯度w.r.t.。
#nn.Module：神经网络模块。一种封装参数的便捷方式，并含有将参数移到GPU、导出、加载等的辅助功能。
#nn.Parameter：一种变量，当作为一个属性分配给一个模块时，它会自动注册为一个参数。
#autograd.Function：实现autograd操作的前向和后向定义。每个变量操作，至少创建一个单独的函数节点，连接到创建了一个变量的函数，并对其历史进行编码。

#0x02 损失函数（Loss Function）损失函数接受（输出，目标）输入对，并计算一个值，该值能够评估输出与目标的偏差大小。
#nn包中有几个不同的损失函数。一个简单的损失函数是nn.MSELoss，它会计算输入和目标之间的均方误差。
output = net(input)
# A dummy target, for example. The arange bounds are written as floats so the
# target is a float tensor; the interval is half-open, [1, 11), which gives
# 10 values to match the 10 outputs.
target = Variable(torch.arange(1., 11.))
# Give the target the same (1, 10) shape as the output instead of relying on
# implicit broadcasting from (10,), which triggers a size-mismatch warning.
target = target.view(1, -1)
# MSELoss averages the squared differences over all 10 elements (it divides
# by n = 10, not by n - 1). The loss object is constructed first and then
# called with (output, target); the tensors are not constructor arguments.
criterion = nn.MSELoss()
loss = criterion(output, target)
print("loss:",loss)
# Following loss backwards through its .grad_fn attribute shows this graph:
# input ->conv2d ->relu ->maxpool2d ->conv2d ->relu ->maxpool2d ->view
#       ->linear ->relu ->linear ->relu ->linear ->MSELoss ->loss
# Calling loss.backward() differentiates the whole graph with respect to the
# loss, and every variable in it that requires gradients accumulates the
# result in its .grad attribute.
# For illustration, walk a few steps backwards:
print(loss.grad_fn)  # MSELoss
# next_functions is a tuple of (function, input_index) pairs: the first [0]
# picks the first pair and the second [0] picks the function out of it.
print(loss.grad_fn.next_functions[0][0])  # Linear (addmm)
# First input of the Linear node. In the run recorded in this file this
# printed an AccumulateGrad node rather than ReLU.
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])
# 0x03 Backprop
# To backpropagate the error, call loss.backward(). Clear the existing
# gradients first, otherwise the new gradients are accumulated onto them.
# Inspect conv1's bias gradient before and after the backward pass.
net.zero_grad()     # zeroes the gradient buffers of all parameters
print('conv1.bias.grad before backward')
# Other layers (e.g. fc1) and .weight can be inspected the same way.
print(net.conv1.bias.grad)
loss.backward()
print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

#0x04 更新权重
#在实践中使用的最简单的更新规则是随机梯度下降法（Stochastic Gradient Descent，SGD）：
#weight = weight - learning_rate * gradient
#我们可以使用简单的python代码实现这一点：
# Manual SGD step: weight = weight - learning_rate * gradient.
learning_rate = 0.01
for f in net.parameters():
    # Author's question: is .data how the values inside a Variable are accessed?
    f.data.sub_(f.grad.data * learning_rate)

# In practice you may want other update rules such as SGD, Nesterov-SGD,
# Adam or RMSProp. The torch.optim package implements all of these:
import torch.optim as optim
# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)
# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()    # Does the update

print("***********************************************************")
# This loss was computed by the forward pass above, i.e. after the manual SGD
# step but before optimizer.step() was applied.
print("loss after optimizer:",loss)
# conv1's bias after the updates, for comparison with the value printed earlier.
print("params[1] after optimizer:",params[1])
#优化以后如何再进一步优化，优化到什么程度算是ok了？

Net(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)
***********************************************************
***********************************************************
10
Parameter containing:
tensor([ 0.1703, -0.1637,  0.1829, -0.1471, -0.0758, -0.0585],
       requires_grad=True)
***********************************************************
tensor([[ 0.1154,  0.0158,  0.0087, -0.1210, -0.0249,  0.0537, -0.0099,  0.0766,
         -0.0059, -0.0395]], grad_fn=<AddmmBackward>)
***********************************************************
loss: tensor(38.5074, grad_fn=<MseLossBackward>)
<MseLossBackward object at 0x00000000096CCD68>
<AddmmBackward object at 0x00000000096CCF28>
<AccumulateGrad object at 0x00000000096CCD68>
conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([-0.0713,  0.1072,  0.1073, -0.0711,  0.0175, -0.1034])
***********************************************************
loss after optimizer: tensor(37.9959, grad_fn=<MseLossBackward>)
params[1] after optimizer: Parameter containing:
tensor([ 0.1711, -0.1658,  0.1826, -0.1456, -0.0762, -0.0577],
       requires_grad=True)

#【PyTorch深度学习60分钟快速入门 】Part4：训练一个分类器
#太棒啦！到目前为止，你已经了解了如何定义神经网络、计算损失，以及更新网络权重。不过，现在你可能会思考以下几个方面：

#0x01 训练数据集 https://www.cnblogs.com/leejack/p/8388776.html
#通常，当你需要处理图像、文本、音频或视频数据时，你可以使用标准的python包将数据加载到numpy数组中。然后你可以将该数组转换成一个torch.*Tensor。
#对于图像，Pillow、OpenCV这些包将有所帮助。
#对于音频，可以使用scipy和librosa包。
#对于文本，无论是基于原始的Python还是Cython的加载，或者NLTK和SpaCy都将有所帮助。
#具体对于图像来说，我们已经创建了一个名为torchvision的包，它为像Imagenet、CIFAR10、MNIST等公共数据集提供了数据加载器，并为图像提供了数据转换器，即torchvision.datasets和torch.utils.data.DataLoader。
#这提供了极大的便利，避免了编写样板代码。
#对于本教程，我们将使用CIFAR10数据集。它包含以下10个分类：飞机、汽车、鸟、猫、鹿、狗、青蛙、马、轮船和卡车。
#CIFAR-10数据集中的图像大小为3x32x32，即大小为32x32像素的3通道彩色图像。

#0x02 训练一个图像分类器，我们将按顺序执行以下步骤：
#使用torchvision加载并归一化CIFAR10训练和测试数据集，定义一个卷积神经网络，定义一个损失函数，利用训练数据来训练网络，利用测试数据来测试网络
#1. 加载和归一化CIFAR10，使用torchvision可以很容易地加载CIFAR10。

import torch
import torchvision #需要在pytorch官网上安装带cuda版本的torch，否则这一步会报错，cuda可能需要事先安装
import torchvision.transforms as transforms

# torchvision datasets yield PILImage images with pixel values in [0, 1].
# Convert them to tensors and normalise each channel to [-1, 1]
# ((x - 0.5) / 0.5).
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Training split: downloaded to ./data if missing, served in shuffled
# mini-batches of 4 by two worker processes.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                          shuffle=True, num_workers=2)

# Test split: same transform, kept in dataset order (no shuffling).
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                         shuffle=False, num_workers=2)

# Class names, indexed by the integer label.
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

#为了增添一些乐趣，我们来展示一些训练图片：
import matplotlib.pyplot as plt
import numpy as np

# functions to show an image


def imshow(img):
    """Draw a normalised image tensor with matplotlib.

    The tensor is mapped back from the normalised range with ``img / 2 + 0.5``
    and its axes are reordered from (0, 1, 2) to (1, 2, 0) before being passed
    to ``plt.imshow``.
    """
    restored = img / 2 + 0.5     # unnormalize
    channels_last = np.transpose(restored.numpy(), (1, 2, 0))
    plt.imshow(channels_last)


# get some random training images
dataiter = iter(trainloader)
# Use the built-in next(): the iterator's Python-2-style .next() method is
# not available on current DataLoader iterators.
images, labels = next(dataiter)

# show images
imshow(torchvision.utils.make_grid(images))
#print (labels)
# Print one class name per image, however many the batch contains.
print(' '.join('%5s' % classes[labels[j]] for j in range(len(labels))))

#2. 定义一个卷积神经网络
#从前面“神经网络”一节中拷贝神经网络并对其进行修改，使它接受3通道的图像（而不是原先定义的单通道图像）。
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Variant of the earlier network that takes 3-channel images.

    fc1 expects 16 * 5 * 5 features, which is what a 3x32x32 input produces
    after two 5x5 convolutions, each followed by a 2x2 max-pool.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)      # 3 input channels -> 6 feature maps
        self.pool = nn.MaxPool2d(2, 2)       # shared 2x2 pooling, stride 2
        self.conv2 = nn.Conv2d(6, 16, 5)     # 6 -> 16 feature maps
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)         # one score per class

    def forward(self, x):
        """Return one row of 10 class scores per input image."""
        stage1 = self.pool(F.relu(self.conv1(x)))
        stage2 = self.pool(F.relu(self.conv2(stage1)))
        flat = stage2.view(-1, 16 * 5 * 5)   # flatten all but the batch dimension
        hidden = F.relu(self.fc1(flat))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

net = Net()

# 3. Define a loss function and an optimizer.
# Use a classification cross-entropy loss and SGD with momentum.
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

#4. 训练网络
#这里正是事情开始变得有趣的地方。我们只需循环遍历我们的数据迭代器，并将输入量输入到网络并进行优化：
# Train for two full passes over the training set.
for epoch in range(2):
    running_loss = 0.0
    for step, batch in enumerate(trainloader):
        # Unpack the mini-batch and wrap both tensors in Variable.
        inputs, labels = batch
        inputs, labels = Variable(inputs), Variable(labels)

        # Clear the gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # loss.item() replaces loss.data[0] from the original tutorial code.
        running_loss += loss.item()
        if (step + 1) % 2000 != 0:
            continue
        # Every 2000 mini-batches: report the mean loss and reset the sum.
        print('[%d, %5d] loss: %.3f' %
              (epoch + 1, step + 1, running_loss / 2000))
        running_loss = 0.0

print('Finished Training')

#5. 在测试数据上测试网络
#我们已经利用训练数据集对网络训练了2次。但是，我们需要检查网络是否已经学到了什么。
#我们将通过预测神经网络输出的类标签来检查它，并根据实际情况对其进行检查。如果预测是正确的，那么我们将该样本添加到正确的预测列表中。
#OK！第一步，让我们展示测试集中的一个图像，以便于我们熟悉它。
dataiter = iter(testloader)
# Use the built-in next(): the iterator's Python-2-style .next() method is
# not available on current DataLoader iterators.
images, labels = next(dataiter)

# print images
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%5s' % classes[labels[j]]
                                for j in range(len(labels))))

# Now see what the network thinks these examples are.
outputs = net(Variable(images))
# The outputs are scores ("energies") for the 10 classes: the higher the
# score, the more the network believes the image belongs to that class.
# Take the index of the highest score per image.
_, predicted = torch.max(outputs.data, 1)

print('Predicted: ', ' '.join('%5s' % classes[predicted[j]]
                              for j in range(len(predicted))))
#结果看起来相当不错。下面，我们看一下该网络在整个数据集上的表现。

# Overall accuracy on the test set.
correct = 0
total = 0
# Evaluation needs no gradients, so skip building the autograd graph.
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = net(Variable(images))
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        # .item() keeps `correct` a plain Python int. Accumulating the tensor
        # itself made the percentage below depend on tensor-division
        # semantics, which differ between PyTorch versions.
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))
#结果看起来比随机概率要好，随机概率为10%的准确率（随机从10个类中挑选一个类）。看起来似乎该网络学到了一些东西。
#下面，我们看一下到底是哪些类别表现的很好，哪些类别表现的不好：
# Per-class accuracy: how many test images of each class were classified
# correctly.
class_correct = [0.0] * 10
class_total = [0.0] * 10
# Evaluation needs no gradients, so skip building the autograd graph.
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = net(Variable(images))
        _, predicted = torch.max(outputs.data, 1)
        c = (predicted == labels)
        # Iterate over the actual batch instead of assuming 4 samples, and
        # convert each element to a Python number. Adding the comparison
        # tensor's elements directly accumulated the counts as byte tensors,
        # which wrap around; the recorded run shows 0-1 % for every class
        # next to an overall accuracy of 54 %.
        for label, hit in zip(labels, c):
            class_correct[label.item()] += hit.item()
            class_total[label.item()] += 1


for i in range(10):
    print('Accuracy of %5s : %2d %%' % (
        classes[i], 100 * class_correct[i] / class_total[i]))
#Ok，下一步我们将学习如何在GPU上运行神经网络。
#0x03 在GPU上训练
#将神经网络转移到GPU上，就像将一个张量转移到GPU上一样。这将递归地遍历所有模块，并将它们的参数和缓冲器转换为CUDA张量：
##net.cuda()
#记住，你还必须将每一步的输入和目标都发送到GPU上：
##inputs, labels = Variable(inputs.cuda()), Variable(labels.cuda())
#为什么与CPU相比，我没有看到速度的明显提升？那是因为你的网络实在是太小了。
#练习： 尝试增加网络的宽度（第一个nn.Conv2d的参数2，以及第二个nn.Conv2d的参数1，它们必须为同一个数字），然后看下速度提升效果。
#实现的目标：以更高的角度理解PyTorch的Tensor库和神经网络,训练一个小型的神经网络来对图像进行分类
#0x04 在多个GPU上训练
#如果你想使用所有GPU来得到速度更大的提升，可以阅读下一节“数据并行性”。

#0x05 扩展阅读
#Train neural nets to play video games
#Train a state-of-the-art ResNet network on imagenet
#Train a face generator using Generative Adversarial Networks
#Train a word-level language model using Recurrent LSTM networks
#More examples
#More tutorials
#Discuss PyTorch on the Forums
#Chat with other users on Slack

Files already downloaded and verified
Files already downloaded and verified
 ship truck  deer  deer
[1,  2000] loss: 2.232
[1,  4000] loss: 1.889
[1,  6000] loss: 1.703
[1,  8000] loss: 1.608
[1, 10000] loss: 1.536
[1, 12000] loss: 1.443
[2,  2000] loss: 1.409
[2,  4000] loss: 1.370
[2,  6000] loss: 1.339
[2,  8000] loss: 1.327
[2, 10000] loss: 1.309
[2, 12000] loss: 1.291
Finished Training
GroundTruth:    cat  ship  ship plane
Predicted:    cat  ship  ship plane
Accuracy of the network on the 10000 test images: 54 %
Accuracy of plane :  0 %
Accuracy of   car :  0 %
Accuracy of  bird :  0 %
Accuracy of   cat :  0 %
Accuracy of  deer :  0 %
Accuracy of   dog :  0 %
Accuracy of  frog :  0 %
Accuracy of horse :  0 %
Accuracy of  ship :  1 %
Accuracy of truck :  0 %

#【PyTorch深度学习60分钟快速入门 】Part5：数据并行化
#在本节中，我们将学习如何利用DataParallel使用多个GPU。在PyTorch中使用多个GPU非常容易，你可以使用下面代码将模型放在GPU上：
#model.cuda()
#然后，你可以将所有张量拷贝到GPU上：
#mytensor = my_tensor.cuda()
#请注意，仅仅调用my_tensor.cuda()并不会就地改变my_tensor，它返回的是一个位于GPU上的新副本；你需要将它赋给一个新的张量，然后在GPU上使用这个新张量。
#在多个GPU上执行你的前向和后向传播是一件很自然的事情。然而，PyTorch默认情况下只会使用一个GPU。
#不过，通过利用DataParallel使你的模型并行地运行，这样你就能很容易地将操作运行在多个GPU上：
#model = nn.DataParallel(model)
#这正是本节的核心。下面，我们将更详细地分析该技术。
#0x01 导入和参数,下面，导入PyTorch模块并定义相关参数：
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
# Parameters and DataLoaders
input_size = 5    # features per sample (input width of the linear layer)
output_size = 2   # output width of the linear layer
batch_size = 30   # samples per mini-batch
data_size = 100   # intended number of samples in the random dataset
#0x02 虚拟数据集,创建一个虚拟的（随机的）数据集。你只需要实现__getitem__方法：
class RandomDataset(Dataset):
    """In-memory dataset of ``length`` random vectors with ``size`` features each."""

    def __init__(self, size, length):
        # One standard-normal row per sample, generated up front.
        self.data = torch.randn(length, size)
        self.len = length

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, index):
        """Return the sample stored at ``index``."""
        return self.data[index]

# Shuffled loader over the random dataset. The dataset length now comes from
# data_size instead of a second hard-coded 100, so the constant defined with
# the other parameters actually controls it.
rand_loader = DataLoader(dataset=RandomDataset(input_size, data_size),
                         batch_size=batch_size, shuffle=True)
#0x03 简单模型
#在该demo中，我们的模型只会接受一个输入，并进行一个线性操作，然后给出输出结果。
#然而，你可以在任何模型（CNN、RNN、Capsule Net等等）上使用DataParallel。
#在模型内部我们添加了一个打印语句，以监控输入和输出的张量大小。请注意在rank 0批次时打印的内容：
class Model(nn.Module):
    """Single linear layer that prints the input and output sizes it sees."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        """Apply the linear layer, print both tensor sizes, return the result."""
        result = self.fc(input)
        print("  In Model: input size", input.size(),
              "output size", result.size())
        return result
#0x04 创建模型和数据并行化
#这是本教程的核心部分。首先，我们需要创建一个模型实例，并检查我们是否拥有多个GPU。
#如果拥有多个GPU，那么可以使用nn.DataParallel来封装我们的模型。然后，利用model.cuda()将模型放到GPU上：
model = Model(input_size, output_size)

# With more than one GPU, wrap the model so each batch is split along dim 0,
# e.g. [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print("Let's use", gpu_count, "GPUs!")
    model = nn.DataParallel(model)

# Move the (possibly wrapped) model to the GPU when one is available.
if torch.cuda.is_available():
    model.cuda()

#0x05 运行模型,现在，我们可以查看输入与输出张量的大小：
# Feed every batch through the model and report the sizes seen from outside.
for data in rand_loader:
    # Put the batch on the GPU when one is available, otherwise use it as is.
    input_var = Variable(data.cuda() if torch.cuda.is_available() else data)

    output = model(input_var)
    print("Outside: input size", input_var.size(),
          "output_size", output.size())
#0x06 结果
#当我们将输入和输出都以30个作为批量大小时，正常情况下该模型会得到30并输出30。但如果你有多个GPU，那么你将会得到下面的结果：
#2个GPU,如果你有2个GPU，你将看到：
# on 2 GPUs
"""
Let's use 2 GPUs!
    In Model: input size torch.Size([15, 5]) output size torch.Size([15, 2])
    In Model: input size torch.Size([15, 5]) output size torch.Size([15, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
    In Model: input size torch.Size([15, 5]) output size torch.Size([15, 2])
    In Model: input size torch.Size([15, 5]) output size torch.Size([15, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
    In Model: input size torch.Size([15, 5]) output size torch.Size([15, 2])
    In Model: input size torch.Size([15, 5]) output size torch.Size([15, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
    In Model: input size torch.Size([5, 5]) output size torch.Size([5, 2])
    In Model: input size torch.Size([5, 5]) output size torch.Size([5, 2])
Outside: input size torch.Size([10, 5]) output_size torch.Size([10, 2])
"""

#3个GPU,如果你有3个GPU，你将看到：

"""
Let's use 3 GPUs!
    In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2])
    In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2])
    In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
    In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2])
    In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2])
    In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
    In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2])
    In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2])
    In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2])
Outside: input size torch.Size([10, 5]) output_size torch.Size([10, 2])
"""

#8个GPU,如果你有8个GPU，你将看到：
"""
Let's use 8 GPUs!
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([4, 5]) output size torch.Size([4, 2])
    In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
    In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2])
    In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2])
    In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2])
    In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2])
    In Model: input size torch.Size([2, 5]) output size torch.Size([2, 2])
Outside: input size torch.Size([10, 5]) output_size torch.Size([10, 2])
"""

#0x07 总结
#DataParallel会自动分割数据，并将作业顺序发送给多个GPU上的多个模型。
#在每个模型完成它们的作业之后，DataParallel会收集并合并结果，然后再返回给你。

In Model: input size torch.Size([30, 5]) output size torch.Size([30, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
  In Model: input size torch.Size([30, 5]) output size torch.Size([30, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
  In Model: input size torch.Size([30, 5]) output size torch.Size([30, 2])
Outside: input size torch.Size([30, 5]) output_size torch.Size([30, 2])
  In Model: input size torch.Size([10, 5]) output size torch.Size([10, 2])
Outside: input size torch.Size([10, 5]) output_size torch.Size([10, 2])

来源：https://www.cnblogs.com/brave-sailor/p/11588733.html

标签

梯度下降

神经网络模型

python神经网络

图像梯度

范数

num

张量