Implementing the Rprop algorithm in Keras

问题

I am trying to implement the resilient backpropagation optimizer for Keras (link), but the challenging part was being able to perform an update on each individual parameter based on whether its corresponding gradient is positive, negative or zero. I wrote the code below as a start towards implementing the Rprop optimizer. However, I can't seem to find a way to access the parameters individually. Looping over params (as in the code below) returns p, g, g_old, s, wChangeOld at each iteration which are all matrices.

Is there a way where I could iterate over the individual parameters and update them ? It would also work if I could index the parameter vector based on the sign of its gradients.

class Rprop(Optimizer):
    def __init__(self, init_step=0.01, **kwargs):
        super(Rprop, self).__init__(**kwargs)
        self.init_step = K.variable(init_step, name='init_step')
        self.iterations = K.variable(0., name='iterations')

        self.posStep = 1.2
        self.negStep = 0.5
        self.minStep = 1e-6
        self.maxStep = 50.

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        shapes = [K.get_variable_shape(p) for p in params]
        stepList = [K.ones(shape)*self.init_step  for shape in shapes]
        wChangeOldList = [K.zeros(shape) for shape in shapes]
        grads_old = [K.zeros(shape) for shape in shapes]

        self.weights = stepList + grads_old + wChangeOldList
        self.updates = []

        for p, g, g_old, s, wChangeOld in zip(params, grads, grads_old, 
                                                                  stepList, wChangeOldList):
            change = K.sign(g * g_old)

            if change > 0:
                s_new = K.minimum(s * self.posStep, self.maxStep)
                wChange = s_new * K.sign(g)
                g_new = g

            elif change < 0:
                s_new = K.maximum(s * self.posStep, self.maxStep)
                wChange = - wChangeOld
                g_new = 0

            else:
                s_new = s
                wChange = s_new * K.sign(g)
                g_new = p

            self.updates.append(K.update(g_old, g_new))
            self.updates.append(K.update(wChangeOld, wChange))
            self.updates.append(K.update(s, s_new))

            new_p = p - wChange

            # Apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'init_step': float(K.get_value(self.init_step))}
        base_config = super(Rprop, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

回答1:

I was looking for an RProp algorithm in Keras as well and found this question. I took the liberty of adapting your code to my purpose and post it back here now. So far it seems to work quite well, but I didn't test it extensively.

Disclaimer: I'm very new to keras but have a lot of experience with theano (and blocks). Further I tested this only with theano as a backend, but not tensorflow.

class RProp(Optimizer):
    def __init__(self, init_alpha=1e-3, scale_up=1.2, scale_down=0.5, min_alpha=1e-6, max_alpha=50., **kwargs):
        super(RProp, self).__init__(**kwargs)
        self.init_alpha = K.variable(init_alpha, name='init_alpha')
        self.scale_up = K.variable(scale_up, name='scale_up')
        self.scale_down = K.variable(scale_down, name='scale_down')
        self.min_alpha = K.variable(min_alpha, name='min_alpha')
        self.max_alpha = K.variable(max_alpha, name='max_alpha')

    def get_updates(self, params, constraints, loss):
        grads = self.get_gradients(loss, params)
        shapes = [K.get_variable_shape(p) for p in params]
        alphas = [K.variable(numpy.ones(shape) * self.init_alpha) for shape in shapes]
        old_grads = [K.zeros(shape) for shape in shapes]
        self.weights = alphas + old_grads
        self.updates = []

        for param, grad, old_grad, alpha in zip(params, grads, old_grads, alphas):
            new_alpha = K.switch(
                K.greater(grad * old_grad, 0),
                K.minimum(alpha * self.scale_up, self.max_alpha),
                K.maximum(alpha * self.scale_down, self.min_alpha)
            )
            new_param = param - K.sign(grad) * new_alpha
            # Apply constraints
            if param in constraints:
                c = constraints[param]
                new_param = c(new_param)
            self.updates.append(K.update(param, new_param))
            self.updates.append(K.update(alpha, new_alpha))
            self.updates.append(K.update(old_grad, grad))

        return self.updates

    def get_config(self):
        config = {
            'init_alpha': float(K.get_value(self.init_alpha)),
            'scale_up': float(K.get_value(self.scale_up)),
            'scale_down': float(K.get_value(self.scale_down)),
            'min_alpha': float(K.get_value(self.min_alpha)),
            'max_alpha': float(K.get_value(self.max_alpha)),
        }
        base_config = super(RProp, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

Important notes:

RProp is often not included in machine learning libraries for a reason: It does not work at all unless you use full-batch learning. And full-batch learning is only useful if you have a small training set.
Adam (Keras builtin) outperforms this RProp algorithm. Maybe because that's just how it is, or maybe because I made a mistake :)

A few comments about your code (referring to your original variable names):

wChange is never used across iterations, so you don't need to store those in permanent variables.
change > 0 does not do what you think it does because change is a tensor variable. What you want here is a element-wise comparison, use K.switch() instead.
You used maxStep twice instead of using minStep the other time.
The situation where change is zero is negligible, since that almost never happens in practice.
g_new = 0 and g_new = p are both completely bogus and should be g_new = g as in the first if branch.

回答2:

I'm new to keras and Python but I modified the code above for my purposes a bit.

It is incredibly fast and simple algorithm due to using full-batch learning and partial derivatives. In my tests it outperformed all other backpropagation algorithms, including Adam. I tested it with Tensorflow and CNTK as a backend.

Modified Rprop without Weight-Backtracking: https://pdfs.semanticscholar.org/df9c/6a3843d54a28138a596acc85a96367a064c2.pdf

class iRprop_(Optimizer):
def __init__(self, init_alpha=0.01, scale_up=1.2, scale_down=0.5, min_alpha=0.00001, max_alpha=50., **kwargs):
    super(iRprop_, self).__init__(**kwargs)
    self.init_alpha = K.variable(init_alpha, name='init_alpha')
    self.scale_up = K.variable(scale_up, name='scale_up')
    self.scale_down = K.variable(scale_down, name='scale_down')
    self.min_alpha = K.variable(min_alpha, name='min_alpha')
    self.max_alpha = K.variable(max_alpha, name='max_alpha')

def get_updates(self, params, loss):
    grads = self.get_gradients(loss, params)
    shapes = [K.get_variable_shape(p) for p in params]
    alphas = [K.variable(K.ones(shape) * self.init_alpha) for shape in shapes]
    old_grads = [K.zeros(shape) for shape in shapes]
    self.weights = alphas + old_grads
    self.updates = []

    for p, grad, old_grad, alpha in zip(params, grads, old_grads, alphas):
        grad = K.sign(grad)
        new_alpha = K.switch(
            K.greater(grad * old_grad, 0),
            K.minimum(alpha * self.scale_up, self.max_alpha),
            K.switch(K.less(grad * old_grad, 0),K.maximum(alpha * self.scale_down, self.min_alpha),alpha)    
        )

        grad = K.switch(K.less(grad * old_grad, 0),K.zeros_like(grad),grad)
        new_p = p - grad * new_alpha 

        # Apply constraints.
        if getattr(p, 'constraint', None) is not None:
            new_p = p.constraint(new_p)
        self.updates.append(K.update(p, new_p))
        self.updates.append(K.update(alpha, new_alpha))
        self.updates.append(K.update(old_grad, grad))

    return self.updates

def get_config(self):
    config = {
        'init_alpha': float(K.get_value(self.init_alpha)),
        'scale_up': float(K.get_value(self.scale_up)),
        'scale_down': float(K.get_value(self.scale_down)),
        'min_alpha': float(K.get_value(self.min_alpha)),
        'max_alpha': float(K.get_value(self.max_alpha)),
    }
    base_config = super(iRprop_, self).get_config()
    return dict(list(base_config.items()) + list(config.items()))

来源：https://stackoverflow.com/questions/43768411/implementing-the-rprop-algorithm-in-keras

标签

python

tensorflow

neural-network

keras

theano