# Python:PyTorch 训练网络 (七十七)

## 训练神经网络

$$\ell = \frac{1}{2n}\sum_i^n{\left(y_i - \hat{y}_i\right)^2}$$

## 反向传播

$$\frac{\partial \ell}{\partial w_1} = \frac{\partial l_1}{\partial w_1} \frac{\partial s}{\partial l_1} \frac{\partial l_2}{\partial s} \frac{\partial \ell}{\partial l_2}$$

$$w^\prime = w - \alpha \frac{\partial \ell}{\partial w}$$

Torch 提供了模块 autograd 用于自动计算张量的梯度。计算方式是跟踪在张量上执行的运算。要让 PyTorch 跟踪运算，你需要使用 torch.autograd 中的 Variable 类封装张量（在较新版本的 PyTorch 中，直接在创建张量时设置 requires_grad=True 即可）。你可以使用 Variable 的 .data 属性获取张量。

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
import time

import torch
from torch import nn
from torch import optim
import torch.nn.functional as F

import helper
# Create a 2x2 tensor of standard-normal values. requires_grad=True tells
# autograd to record every operation on x so gradients can flow back later;
# without it, y.grad_fn below is None and z.backward() raises.
x = torch.randn(2, 2, requires_grad=True)
print(x)
tensor([[ 2.0177, -1.4438],
[-0.8740,  1.4361]])
# Square every element; autograd records this op for the backward pass.
y = x.pow(2)
print(y)
tensor([[ 4.0712,  2.0845],
[ 0.7639,  2.0623]])

## grad_fn records the operation that produced y (PowBackward0, from x**2)
print(y.grad_fn)
<PowBackward0 object at 0x7fc13410ba90>

autograd 模块会跟踪这些运算并知道如何为每个运算计算梯度。这样的话，它就能够计算一系列运算相对于任何一个张量的梯度。我们将张量 y 简化为标量值，即均值。

# Reduce y to a single scalar (its mean) so backward() can be called
# without supplying an explicit gradient argument.
z = torch.mean(y)
print(z)
tensor(2.2455)

# No backward pass has run yet, so no gradient has been accumulated (prints None).
print(x.grad)
None

$$\frac{\partial z}{\partial x} = \frac{\partial}{\partial x}\left[\frac{1}{n}\sum_i^n x_i^2\right] = \frac{2x}{n} = \frac{x}{2} \qquad (n = 4)$$

# Backpropagate from the scalar z; this populates x.grad with dz/dx.
z.backward()
# Compare autograd's result with the analytic gradient x/2 (n = 4 elements).
# The transcript below shows two identical tensors, so the original code
# printed x.grad as well — that line was lost in this excerpt.
print(x.grad)
print(x / 2)
tensor([[ 1.0089, -0.7219],
[-0.4370,  0.7180]])
tensor([[ 1.0089, -0.7219],
[-0.4370,  0.7180]])

## 训练网络！

from torchvision import datasets, transforms

# Define a transform to normalize the data
# Normalize single-channel MNIST images to the [-1, 1] range.
# NOTE(review): the original used 3-channel stats ((0.5, 0.5, 0.5), ...), but
# MNIST images have one channel; recent torchvision rejects the mismatch.
# With all values equal to 0.5 the produced pixels are unchanged.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])

# Download train/test splits and wrap them in shuffling batch loaders.
# (testset was referenced but never defined in the original excerpt.)
trainset = datasets.MNIST('MNIST_data/', download=True, train=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=64, shuffle=True)

testset = datasets.MNIST('MNIST_data/', download=True, train=False, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=64, shuffle=True)
class Network(nn.Module):
    """Fully connected classifier for flattened 28x28 MNIST digits.

    Architecture: 784 -> 200 -> 50 -> 10, with ReLU activations on the
    hidden layers. (The original excerpt had lost all indentation, which
    made the class a SyntaxError.)
    """

    def __init__(self):
        super().__init__()
        # Hidden layers with 200 and 50 units.
        self.fc1 = nn.Linear(784, 200)
        self.fc2 = nn.Linear(200, 50)
        # Output layer: one logit per digit class.
        self.fc3 = nn.Linear(50, 10)

    def forward(self, x):
        """Forward pass through the network; returns raw class logits."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    def predict(self, x):
        """Return per-class probabilities by applying softmax to the logits."""
        logits = self.forward(x)
        return F.softmax(logits, dim=1)
# Build the classifier, its loss criterion, and an optimizer over its weights.
net = Network()
# CrossEntropyLoss consumes raw logits together with integer class labels.
criterion = nn.CrossEntropyLoss()
# Plain stochastic gradient descent with a fixed learning rate.
optimizer = optim.SGD(params=net.parameters(), lr=0.01)

• 在网络中进行前向传递以获得 logits
• 使用 logits 计算损失
• 通过 loss.backward() 对网络进行反向传递以计算梯度
• 用优化器执行一步以更新权重

# Inspect the first layer and its weights before one optimization step.
print('print fc1 - ', net.fc1)  # fixed typo: the original read "rint(..."
print('Initial weights - ', net.fc1.weight)

# Grab one batch of images and labels.
# NOTE(review): dataiter was never created in the original excerpt; testloader
# is the only loader defined here — swap in trainloader if it is available.
dataiter = iter(testloader)
print('print dataiter- ', dataiter)

images, labels = next(dataiter)  # Python 3: next(it); iterator .next() is gone

print('print images- ', images)
print('print labels- ', labels)

# Flatten each 28x28 image into a 784-long vector; use the actual batch size
# instead of hard-coding 64 (the last batch can be smaller).
images.resize_(images.size(0), 784)

# Variable is deprecated — plain tensors carry autograd information now.
inputs = images
targets = labels

print('print inputs:', inputs)

# Clear the gradients before the backward pass — the original announced this
# step in a comment but never actually called zero_grad().
optimizer.zero_grad()

# Forward pass, then backward pass, then update the weights.
output = net.forward(inputs)
loss = criterion(output, targets)
loss.backward()
optimizer.step()
     print fc1 -  Linear(in_features=784, out_features=200, bias=True)
Initial weights -  Parameter containing:
tensor([[ 8.5397e-03,  2.0490e-02, -3.0762e-02,  ...,  2.7010e-02,
1.2961e-02,  6.6650e-03],
[ 2.4989e-02, -2.4855e-02, -1.3049e-02,  ..., -3.4125e-03,
-1.0886e-03, -2.1010e-03],
[ 2.5357e-02, -7.1994e-04,  2.0392e-02,  ...,  1.1883e-02,
-3.0476e-02, -1.7069e-02],
...,
[ 2.9283e-02,  1.7928e-02, -1.9773e-03,  ...,  9.2352e-03,
1.9706e-02, -6.5281e-03],
[ 4.0206e-02,  1.8597e-02, -5.2074e-03,  ...,  3.0402e-02,
8.5699e-03,  2.0727e-02],
[ 8.6584e-03,  1.3021e-03,  9.7864e-03,  ...,  1.1017e-02,
-2.7003e-02,  2.3624e-02]])
print images-  tensor([[[[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
...,
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]]],

[[[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
...,
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]]],

[[[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
...,
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]]],

...,

[[[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
...,
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]]],

[[[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
...,
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]]],

[[[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
...,
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]]]])
print labels-  tensor([ 0,  0,  4,  6,  0,  7,  3,  3,  4,  3,  6,  3,  8,  1,
2,  5,  5,  4,  3,  2,  8,  7,  1,  8,  2,  5,  3,  0,
8,  3,  5,  8,  2,  0,  8,  0,  6,  9,  1,  7,  4,  5,
8,  0,  1,  1,  7,  3,  7,  6,  9,  9,  6,  7,  0,  9,
3,  7,  9,  9,  3,  2,  0,  8])
print inputs: tensor([[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
...,
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000],
[-1.0000, -1.0000, -1.0000,  ..., -1.0000, -1.0000, -1.0000]])
[[-0.9912, -0.9912, -0.9912,  ..., -0.9912, -0.9912, -0.9912],
[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
[ 1.7549,  1.7549,  1.7549,  ...,  1.7549,  1.7549,  1.7549],
...,
[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])
print('Updated weights - ', net.fc1.weight)
Updated weights -  Parameter containing:
tensor([[-2.3054e-02, -7.3247e-03,  3.3989e-02,  ...,  1.4626e-03,
-2.4275e-04,  2.5772e-02],
[-1.4883e-02, -6.8011e-03,  1.4760e-02,  ...,  2.2036e-02,
8.5643e-03,  3.2226e-02],
[-1.3967e-02, -7.4284e-05, -3.3173e-02,  ...,  2.1535e-02,
2.9550e-02, -2.5174e-02],
...,
[ 4.2445e-03,  1.3663e-02,  1.1734e-02,  ...,  3.2027e-02,
-3.1084e-03,  2.8480e-02],
[ 2.3525e-02,  1.5927e-02, -2.5106e-02,  ..., -1.0978e-02,
-1.4049e-02,  9.7510e-03],
[-2.5424e-02, -3.7520e-03,  3.4995e-02,  ...,  1.8272e-02,
-6.4011e-03, -1.3063e-02]])

### 实际训练

# Train a fresh network with Adam; criterion is reused from above.
net = Network()
optimizer = optim.Adam(net.parameters(), lr=0.001)

epochs = 1
steps = 0
running_loss = 0
print_every = 20

for e in range(epochs):
    # NOTE(review): the original excerpt had lost the inner batch loop, so
    # `steps` advanced once per epoch and stale tensors were reused.
    # `trainloader` is not defined in this excerpt — build it like testloader.
    for images, labels in trainloader:
        steps += 1

        # Flatten MNIST images into a 784-long vector.
        images.resize_(images.size(0), 784)

        # Variable is deprecated; tensors track gradients directly.
        inputs = images
        targets = labels

        # Clear gradients so they do not accumulate across batches.
        optimizer.zero_grad()

        output = net.forward(inputs)
        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        # .item() extracts the Python number; loss.data[0] raised the
        # 0-dim-tensor warning shown in the transcript below.
        running_loss += loss.item()

        if steps % print_every == 0:
            # Measure test accuracy. torch.no_grad() replaces the removed
            # `volatile=True` flag (see the warning in the transcript).
            accuracy = 0
            with torch.no_grad():
                for ii, (test_images, test_labels) in enumerate(testloader):
                    test_images = test_images.resize_(test_images.size(0), 784)
                    predicted = net.predict(test_images)
                    equality = (test_labels == predicted.max(1)[1])
                    accuracy += equality.type_as(torch.FloatTensor()).mean()

            print("Epoch: {}/{}".format(e + 1, epochs),
                  "Loss: {:.4f}".format(running_loss / print_every),
                  "Test accuracy: {:.4f}".format(accuracy / (ii + 1)))
            running_loss = 0
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:21: UserWarning: invalid index of a 0-dim tensor. This will be an error in PyTorch 0.5. Use tensor.item() to convert a 0-dim tensor to a Python number
/opt/conda/lib/python3.6/site-packages/ipykernel_launcher.py:29: UserWarning: volatile was removed and now has no effect. Use with torch.no_grad(): instead.

Epoch: 1/1 Loss: 1.9097 Test accuracy: 0.6854
Epoch: 1/1 Loss: 1.1061 Test accuracy: 0.7751
Epoch: 1/1 Loss: 0.7042 Test accuracy: 0.8204
Epoch: 1/1 Loss: 0.5564 Test accuracy: 0.8457
Epoch: 1/1 Loss: 0.5339 Test accuracy: 0.8407
Epoch: 1/1 Loss: 0.5396 Test accuracy: 0.8608
Epoch: 1/1 Loss: 0.4474 Test accuracy: 0.8873
Epoch: 1/1 Loss: 0.4038 Test accuracy: 0.8887
Epoch: 1/1 Loss: 0.4636 Test accuracy: 0.8880
Epoch: 1/1 Loss: 0.3866 Test accuracy: 0.8908
Epoch: 1/1 Loss: 0.4189 Test accuracy: 0.8814
Epoch: 1/1 Loss: 0.4408 Test accuracy: 0.8911
Epoch: 1/1 Loss: 0.3593 Test accuracy: 0.8952
Epoch: 1/1 Loss: 0.3571 Test accuracy: 0.8991
Epoch: 1/1 Loss: 0.3540 Test accuracy: 0.8888
Epoch: 1/1 Loss: 0.3815 Test accuracy: 0.8910
Epoch: 1/1 Loss: 0.3729 Test accuracy: 0.8936
Epoch: 1/1 Loss: 0.3273 Test accuracy: 0.9001
Epoch: 1/1 Loss: 0.3517 Test accuracy: 0.8982
Epoch: 1/1 Loss: 0.3505 Test accuracy: 0.8962
Epoch: 1/1 Loss: 0.3411 Test accuracy: 0.8982
Epoch: 1/1 Loss: 0.3639 Test accuracy: 0.9171
Epoch: 1/1 Loss: 0.3541 Test accuracy: 0.9149
Epoch: 1/1 Loss: 0.3050 Test accuracy: 0.9155
Epoch: 1/1 Loss: 0.3200 Test accuracy: 0.9138
Epoch: 1/1 Loss: 0.3314 Test accuracy: 0.9111
Epoch: 1/1 Loss: 0.2506 Test accuracy: 0.9157
Epoch: 1/1 Loss: 0.2568 Test accuracy: 0.9113
Epoch: 1/1 Loss: 0.3099 Test accuracy: 0.9148
Epoch: 1/1 Loss: 0.2746 Test accuracy: 0.9104
Epoch: 1/1 Loss: 0.2986 Test accuracy: 0.9194
Epoch: 1/1 Loss: 0.2738 Test accuracy: 0.9274
Epoch: 1/1 Loss: 0.2576 Test accuracy: 0.9203
Epoch: 1/1 Loss: 0.2592 Test accuracy: 0.9209
Epoch: 1/1 Loss: 0.3085 Test accuracy: 0.9219
Epoch: 1/1 Loss: 0.3040 Test accuracy: 0.9235
Epoch: 1/1 Loss: 0.2466 Test accuracy: 0.9258
Epoch: 1/1 Loss: 0.2511 Test accuracy: 0.9258
Epoch: 1/1 Loss: 0.2586 Test accuracy: 0.9277
Epoch: 1/1 Loss: 0.2779 Test accuracy: 0.9274
Epoch: 1/1 Loss: 0.2639 Test accuracy: 0.9360
Epoch: 1/1 Loss: 0.2621 Test accuracy: 0.9376
Epoch: 1/1 Loss: 0.2230 Test accuracy: 0.9340
Epoch: 1/1 Loss: 0.2481 Test accuracy: 0.9322
Epoch: 1/1 Loss: 0.1941 Test accuracy: 0.9416
Epoch: 1/1 Loss: 0.2000 Test accuracy: 0.9406
# Fetch one test image and visualize the network's class probabilities.
dataiter = iter(testloader)
images, labels = next(dataiter)  # Python 3: it.next() was removed
img = images[0]
# Flatten to (1, 784) for the network (Variable wrapping is deprecated),
# then reshape back to image dimensions for display.
ps = net.predict(img.resize_(1, 784))
helper.view_classify(img.resize_(1, 28, 28), ps)