simple example of mxnet model parallelism


Question


The simple examples in the Gluon tutorial for mxnet are very helpful to those of us who are just getting started with mxnet. As yet, there is no simple example for model parallelism. I see the model parallelism example code for LSTM, but I am new to mxnet and it would help me (and perhaps others) to have a more streamlined example. So, I have created a model parallelism example by working off the regression example in the Gluon tutorial and by mixing in some code from mxnet.gluon.Trainer.

However, I am clearly getting something wrong. The gradients do not seem to be updated. Can anyone assist by identifying the problem(s)? The goal here is to create a linear regression model that has three layers, each held on a different gpu. The model itself is not useful, except as an example to show how initialization and training can occur for model parallelism, when using a custom block and imperative programming.

As I understand it, Trainer() is written for data parallelism. It will not work for model parallelism because it requires all parameters to be initialized on all GPUs.
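For contrast, the usual data-parallel pattern with gluon.Trainer looks roughly like this (a minimal sketch; net, square_loss, train_iter and batch_size are placeholders). The initialize() call is exactly what I cannot use here, since it puts a copy of every parameter on every GPU:

ctx_list = [mx.gpu(0), mx.gpu(1)]
net.collect_params().initialize(mx.init.Xavier(), ctx=ctx_list)  # copies ALL parameters to ALL GPUs
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 0.001})

for data, label in train_iter:
    # split each batch across the GPUs (data parallelism)
    data_parts = gluon.utils.split_and_load(data, ctx_list)
    label_parts = gluon.utils.split_and_load(label, ctx_list)
    with autograd.record():
        losses = [square_loss(net(x), y) for x, y in zip(data_parts, label_parts)]
    for l in losses:
        l.backward()
    trainer.step(batch_size)  # aggregates gradients across contexts and updates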

import os
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon
from mxnet.gluon import Block

# make some data
num_inputs = 2
num_outputs = 1
num_examples = 10000

def real_fn(X):
    return 2 * X[:, 0] - 3.4 * X[:, 1] + 4.2

X = np.random.normal(0,1, (num_examples, num_inputs))
noise = 0.001 * np.random.normal(0,1, (num_examples))
y = real_fn(X) + noise
y = y.reshape(-1,1)

# configuration
hidden_layers = 2
num_gpus = hidden_layers + 1
ctxList = [mx.gpu(i) for i in range(num_gpus)]
#ctxList = [mx.gpu() for i in range(num_gpus)]

#os.environ["MXNET_ENGINE_TYPE"] = "NaiveEngine"
print("\n")

# ======================================================================
class myDenseBlock(Block):
    """
    A custom layer
    """
    def __init__(self, layer_number, size_input, size_output, **kwargs):
        super(myDenseBlock, self).__init__(**kwargs)

        self.layer_number = layer_number
        self.size_input = size_input
        self.size_output = size_output

        with self.name_scope():
            # add parameters to the Block's ParameterDict.
            self.w = self.params.get(
                'weight',
                init= mx.init.Xavier(magnitude=2.24),
                shape=(size_input, size_output),
                grad_req = 'write')

            self.b = self.params.get(
                'bias',
                init= mx.init.Constant(0.5),
                shape=(size_output,),
                grad_req = 'write')

    def forward(self, x):
        x = x.as_in_context(ctxList[self.layer_number])
        with x.context:
            linear = nd.dot(x, self.w.data()) + self.b.data()
            return linear

# ======================================================================

# create net
net = gluon.nn.Sequential()
with net.name_scope():
    # initial layer, with X as input
    net.add(myDenseBlock(0,
        size_input = 2,
        size_output = 2))

    for ii in range(hidden_layers-1):
        net.add(myDenseBlock(ii+1,
            size_input = 2,
            size_output = 2))

    # final block, Y is nx1
    net.add(myDenseBlock(ii+2,
        size_input = 2,
        size_output = 1))


# initialize parameters for different blocks (layers) on different GPUs.
params = net.collect_params()

"""
The parameters are:
sequential0_mydenseblock0_weight
sequential0_mydenseblock0_bias
sequential0_mydenseblock1_weight
sequential0_mydenseblock1_bias
sequential0_mydenseblock2_weight
sequential0_mydenseblock2_bias
"""

print("\ninitializing:")
for i, param in enumerate(params):
    if 'mydenseblock0' in param:
        params[param].initialize(ctx=ctxList[0])
    elif 'mydenseblock1' in param:
        params[param].initialize(ctx=ctxList[1])
    elif 'mydenseblock2' in param:
        params[param].initialize(ctx=ctxList[2])
    print("  ", i, param, "  ", params[param].list_data()[0].context)
print("\n")

def square_loss(yhat, y):
    return nd.mean((yhat - y) ** 2)

# mytrainer() borrows the update logic from mxnet.gluon.Trainer, but applies
# each update on whatever context a parameter actually lives on, instead of
# assuming every parameter has a copy on every GPU.
def mytrainer(updaters, params, ignore_stale_grad=False):
    #print("\n")
    for i, param in enumerate(params):
        #print(i, param, "  ", len(params[param].list_data()), params[param].list_data()[0].context)
        if params[param].grad_req == 'null':
            continue
        if not ignore_stale_grad:
            for data in params[param].list_data():
                if not data._fresh_grad:
                    print(
                        "`%s` on context %s has not been updated"%(params[param].name, str(data.context)))
                    assert False

        for upd, arr, grad in zip(updaters, params[param].list_data(), params[param].list_grad()):

            if not ignore_stale_grad or arr._fresh_grad:
                upd(i, grad, arr)
                arr._fresh_grad = False
                #print ("grad= ", grad)


batch_size = 100
epochs = 100000
iteration = -1

opt = mx.optimizer.create('adam', learning_rate=0.001, rescale_grad = 1 / batch_size)
updaters = [mx.optimizer.get_updater(opt)]

# the following definition for updaters does not work either
#updaters = [mx.optimizer.get_updater(opt) for _ in ctxList]

results = []
for e in range(epochs):
    train_groups = np.array_split(np.arange(X.shape[0]), X.shape[0]/batch_size)
    for ii, idx in enumerate(train_groups):
        iteration += 1
        xtrain, ytrain = X[idx,:], y[idx]

        xtrain = nd.array(xtrain)
        xtrain = xtrain.as_in_context(ctxList[0])

        ytrain = nd.array(ytrain).reshape((-1, 1))
        ytrain = ytrain.as_in_context(ctxList[0])

        with autograd.record():
            yhat = net(xtrain)
            error = square_loss(yhat, ytrain.as_in_context(ctxList[-1]))


            # Question: does the call to error.backward() go under the indent 
            # for autograd.record() or outside the indent? The gluon examples have 
            # it both ways

        error.backward()

        mytrainer(updaters, net.collect_params())

        if iteration%10 == 0:

            results.append([iteration, error.asnumpy().item()])
            print(("epoch= {:5,d}, iter= {:6,d},  error= {:6.3E}").format(
                e, iteration, error.asnumpy().item()))

The code fails at the "if not data._fresh_grad" test in mytrainer(). The output is:

initializing:
   0 sequential0_mydenseblock0_weight    gpu(0)
   1 sequential0_mydenseblock0_bias    gpu(0)
   2 sequential0_mydenseblock1_weight    gpu(1)
   3 sequential0_mydenseblock1_bias    gpu(1)
   4 sequential0_mydenseblock2_weight    gpu(2)
   5 sequential0_mydenseblock2_bias    gpu(2)

`sequential0_mydenseblock0_weight` on context gpu(0) has not been updated

I can verify, using mx.autograd.get_symbol(error).tojson(), that the computational graph only extends to the parameters on gpu(2) and does not reach the other GPUs.
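For anyone who wants to reproduce that check, this is roughly what I run right after error is computed (a sketch; list_arguments() simply lists the inputs/parameters that appear in the recorded graph):

sym = mx.autograd.get_symbol(error)
print(sym.list_arguments())   # names of inputs/parameters present in the recorded graph
#print(sym.tojson())          # full graph dump, if you want to inspect it in detail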


Answer 1:


Yes, per @sergei's comment, moving to MXNet v1.0.0 solves this.
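As a quick sanity check (a sketch; the exact pip package name depends on your CUDA build, mxnet-cu80 is only an example):

import mxnet as mx
print(mx.__version__)   # should report 1.0.0 or newer
# if it is older, upgrade, e.g.:
#   pip install --upgrade mxnet-cu80   # pick the variant matching your CUDA version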



Source: https://stackoverflow.com/questions/47029809/simple-example-of-mxnet-model-parallelism
