问题
I am trying to implement the resilient backpropagation (Rprop) optimizer for Keras (link), but the challenging part is performing an update on each individual parameter based on whether its corresponding gradient is positive, negative or zero. I wrote the code below as a start towards implementing the Rprop optimizer. However, I can't seem to find a way to access the parameters individually. Looping over params (as in the code below) returns p, g, g_old, s and wChangeOld at each iteration, which are all matrices.
Is there a way to iterate over the individual parameters and update them? It would also work if I could index the parameter vector based on the sign of its gradients.
class Rprop(Optimizer):
    """Draft of a resilient backpropagation (Rprop) optimizer for Keras.

    For every parameter tensor the optimizer keeps a step-size tensor, the
    previous gradient and the previous weight change, and tries to pick the
    next step from the sign of ``g * g_old``.

    NOTE(review): this is the question's work-in-progress code, kept token
    for token. The inline NOTE(review) comments flag the lines that look
    wrong; they are observations to confirm, not fixes.
    """

    def __init__(self, init_step=0.01, **kwargs):
        super(Rprop, self).__init__(**kwargs)
        # Initial step size, stored as a backend variable.
        self.init_step = K.variable(init_step, name='init_step')
        self.iterations = K.variable(0., name='iterations')
        # Step-size growth / shrink factors and step-size bounds, stored as
        # plain Python floats.
        self.posStep = 1.2
        # NOTE(review): negStep and minStep are assigned here but never read
        # anywhere else in this class.
        self.negStep = 0.5
        self.minStep = 1e-6
        self.maxStep = 50.

    def get_updates(self, params, constraints, loss):
        """Build and return the list of update ops for ``params``.

        ``constraints`` is looked up with ``p in constraints`` and
        ``constraints[p]``, so it is used as a mapping from parameter to a
        callable applied to the new parameter value.
        """
        grads = self.get_gradients(loss, params)
        # NOTE(review): this list (the iteration counter increment) is
        # replaced by an empty list a few lines below, so the increment is
        # not part of the returned updates.
        self.updates = [K.update_add(self.iterations, 1)]
        shapes = [K.get_variable_shape(p) for p in params]
        # NOTE(review): the product of K.ones(...) and a variable is
        # presumably a plain tensor rather than a variable; K.update(s, ...)
        # below may not accept it - TODO confirm against the backend in use.
        stepList = [K.ones(shape)*self.init_step for shape in shapes]
        wChangeOldList = [K.zeros(shape) for shape in shapes]
        grads_old = [K.zeros(shape) for shape in shapes]
        self.weights = stepList + grads_old + wChangeOldList
        self.updates = []
        for p, g, g_old, s, wChangeOld in zip(params, grads, grads_old,
                                              stepList, wChangeOldList):
            # Sign of the product of current and previous gradient.
            change = K.sign(g * g_old)
            # NOTE(review): `change` is the result of a backend op on whole
            # tensors, so these Python `if`/`elif` tests act on the tensor
            # object as a whole and cannot select a branch per element.
            if change > 0:
                s_new = K.minimum(s * self.posStep, self.maxStep)
                wChange = s_new * K.sign(g)
                g_new = g
            elif change < 0:
                # NOTE(review): this branch reuses posStep and maxStep inside
                # K.maximum; presumably negStep and minStep were intended.
                s_new = K.maximum(s * self.posStep, self.maxStep)
                # Undo the previous weight change.
                wChange = - wChangeOld
                g_new = 0
            else:
                s_new = s
                wChange = s_new * K.sign(g)
                # NOTE(review): stores the parameter `p` as the "previous
                # gradient"; presumably `g` was intended.
                g_new = p
            self.updates.append(K.update(g_old, g_new))
            self.updates.append(K.update(wChangeOld, wChange))
            self.updates.append(K.update(s, s_new))
            new_p = p - wChange
            # Apply constraints
            if p in constraints:
                c = constraints[p]
                new_p = c(new_p)
            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        """Return the base optimizer config extended with ``init_step``."""
        config = {'init_step': float(K.get_value(self.init_step))}
        base_config = super(Rprop, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
回答1:
I was looking for an RProp algorithm in Keras as well and found this question. I took the liberty of adapting your code to my purpose and post it back here now. So far it seems to work quite well, but I didn't test it extensively.
Disclaimer: I'm very new to keras but have a lot of experience with theano (and blocks). Further I tested this only with theano as a backend, but not tensorflow.
class RProp(Optimizer):
    """Sign-based Rprop-style optimizer with one step size per element.

    Every element of every parameter tensor has its own step size
    ``alpha``. Where the product of the current gradient and the stored
    previous gradient is strictly positive, ``alpha`` is multiplied by
    ``scale_up`` and capped at ``max_alpha``. Everywhere else (including a
    zero product) it is multiplied by ``scale_down`` and floored at
    ``min_alpha``. The parameter then moves by the new ``alpha`` against the
    sign of the current gradient.

    Arguments:
        init_alpha: initial step size given to every element.
        scale_up: growth factor for the step size.
        scale_down: shrink factor for the step size.
        min_alpha: lower bound applied when shrinking.
        max_alpha: upper bound applied when growing.
        **kwargs: forwarded to the base ``Optimizer`` constructor.
    """

    def __init__(self, init_alpha=1e-3, scale_up=1.2, scale_down=0.5,
                 min_alpha=1e-6, max_alpha=50., **kwargs):
        super(RProp, self).__init__(**kwargs)
        # Each hyperparameter becomes a backend variable stored under, and
        # named after, its argument name.
        hyperparameters = (
            ('init_alpha', init_alpha),
            ('scale_up', scale_up),
            ('scale_down', scale_down),
            ('min_alpha', min_alpha),
            ('max_alpha', max_alpha),
        )
        for attr_name, initial_value in hyperparameters:
            setattr(self, attr_name,
                    K.variable(initial_value, name=attr_name))

    def get_updates(self, params, constraints, loss):
        """Build and return the update ops for ``params``.

        ``constraints`` is used as a mapping from a parameter to a callable
        that is applied to that parameter's new value.
        """
        grads = self.get_gradients(loss, params)
        shapes = list(map(K.get_variable_shape, params))

        # Optimizer state: all step sizes first, then all previous gradients.
        alphas = []
        for shape in shapes:
            alphas.append(K.variable(numpy.ones(shape) * self.init_alpha))
        old_grads = []
        for shape in shapes:
            old_grads.append(K.zeros(shape))
        self.weights = alphas + old_grads

        self.updates = []
        state = zip(params, grads, old_grads, alphas)
        for weight, gradient, previous_gradient, step in state:
            # Element-wise choice between the grown and the shrunk step.
            same_direction = K.greater(gradient * previous_gradient, 0)
            grown_step = K.minimum(step * self.scale_up, self.max_alpha)
            shrunk_step = K.maximum(step * self.scale_down, self.min_alpha)
            next_step = K.switch(same_direction, grown_step, shrunk_step)

            # Only the sign of the gradient decides the direction.
            next_weight = weight - K.sign(gradient) * next_step

            # Apply constraints
            if weight in constraints:
                next_weight = constraints[weight](next_weight)

            self.updates.extend([
                K.update(weight, next_weight),
                K.update(step, next_step),
                K.update(previous_gradient, gradient),
            ])
        return self.updates

    def get_config(self):
        """Return the base config extended with this class's hyperparameters.

        Every hyperparameter is read back from its backend variable and
        converted to a Python float.
        """
        names = ('init_alpha', 'scale_up', 'scale_down',
                 'min_alpha', 'max_alpha')
        config = {}
        for name in names:
            config[name] = float(K.get_value(getattr(self, name)))
        merged = dict(super(RProp, self).get_config())
        merged.update(config)
        return merged
Important notes:
- RProp is often not included in machine learning libraries for a reason: It does not work at all unless you use full-batch learning. And full-batch learning is only useful if you have a small training set.
- Adam (Keras builtin) outperforms this RProp algorithm. Maybe because that's just how it is, or maybe because I made a mistake :)
A few comments about your code (referring to your original variable names):
- `wChange` is never used across iterations, so you don't need to store those in permanent variables.
- `change > 0` does not do what you think it does because `change` is a tensor variable. What you want here is an element-wise comparison; use `K.switch()` instead.
- You used `maxStep` twice instead of using `minStep` the other time.
- The situation where `change` is zero is negligible, since that almost never happens in practice.
- `g_new = 0` and `g_new = p` are both completely bogus and should be `g_new = g` as in the first if branch.
回答2:
I'm new to keras and Python but I modified the code above for my purposes a bit.
It is an incredibly fast and simple algorithm due to using full-batch learning and partial derivatives. In my tests it outperformed all other backpropagation algorithms, including Adam. I tested it with TensorFlow and CNTK as backends.
Modified Rprop without Weight-Backtracking: https://pdfs.semanticscholar.org/df9c/6a3843d54a28138a596acc85a96367a064c2.pdf
class iRprop_(Optimizer):
    """Rprop variant without weight backtracking, one step size per element.

    Only the sign of each gradient element is used. The product of the
    current sign and the stored previous sign selects the new step size
    ``alpha`` element-wise:

    * product > 0: ``alpha * scale_up``, capped at ``max_alpha``;
    * product < 0: ``alpha * scale_down``, floored at ``min_alpha``;
    * product == 0: ``alpha`` is left unchanged.

    Where the product is negative, the sign is replaced by zero before it is
    applied and stored. Those elements do not move in this step, and their
    product is zero on the following step.

    Arguments:
        init_alpha: initial step size given to every element.
        scale_up: growth factor for the step size.
        scale_down: shrink factor for the step size.
        min_alpha: lower bound applied when shrinking.
        max_alpha: upper bound applied when growing.
        **kwargs: forwarded to the base ``Optimizer`` constructor.
    """

    def __init__(self, init_alpha=0.01, scale_up=1.2, scale_down=0.5,
                 min_alpha=0.00001, max_alpha=50., **kwargs):
        super(iRprop_, self).__init__(**kwargs)
        # Each hyperparameter becomes a backend variable stored under, and
        # named after, its argument name.
        hyperparameters = (
            ('init_alpha', init_alpha),
            ('scale_up', scale_up),
            ('scale_down', scale_down),
            ('min_alpha', min_alpha),
            ('max_alpha', max_alpha),
        )
        for attr_name, initial_value in hyperparameters:
            setattr(self, attr_name,
                    K.variable(initial_value, name=attr_name))

    def get_updates(self, params, loss):
        """Build and return the update ops for ``params``."""
        grads = self.get_gradients(loss, params)
        shapes = list(map(K.get_variable_shape, params))

        # Optimizer state: all step sizes first, then all previous signs.
        alphas = []
        for shape in shapes:
            alphas.append(K.variable(K.ones(shape) * self.init_alpha))
        old_grads = []
        for shape in shapes:
            old_grads.append(K.zeros(shape))
        self.weights = alphas + old_grads

        self.updates = []
        state = zip(params, grads, old_grads, alphas)
        for weight, gradient, previous_direction, step in state:
            direction = K.sign(gradient)
            agreement = direction * previous_direction
            sign_flipped = K.less(agreement, 0)

            # Shrink where the sign flipped, keep the step where the product
            # is zero, grow where the sign is unchanged.
            shrunk_or_kept = K.switch(
                sign_flipped,
                K.maximum(step * self.scale_down, self.min_alpha),
                step,
            )
            next_step = K.switch(
                K.greater(agreement, 0),
                K.minimum(step * self.scale_up, self.max_alpha),
                shrunk_or_kept,
            )

            # No move where the sign flipped; the zero is also what is stored
            # as the previous direction.
            applied_direction = K.switch(
                sign_flipped, K.zeros_like(direction), direction)
            next_weight = weight - applied_direction * next_step

            # Apply constraints.
            constraint = getattr(weight, 'constraint', None)
            if constraint is not None:
                next_weight = constraint(next_weight)

            self.updates.extend([
                K.update(weight, next_weight),
                K.update(step, next_step),
                K.update(previous_direction, applied_direction),
            ])
        return self.updates

    def get_config(self):
        """Return the base config extended with this class's hyperparameters.

        Every hyperparameter is read back from its backend variable and
        converted to a Python float.
        """
        names = ('init_alpha', 'scale_up', 'scale_down',
                 'min_alpha', 'max_alpha')
        config = {}
        for name in names:
            config[name] = float(K.get_value(getattr(self, name)))
        merged = dict(super(iRprop_, self).get_config())
        merged.update(config)
        return merged
来源:https://stackoverflow.com/questions/43768411/implementing-the-rprop-algorithm-in-keras