
# -*- coding: utf-8 -*-

'''This module defines gradient descent optimizers with adaptive learning rates.
'''

import numpy as np
import theano
import theano.tensor as TT

from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

from .base import Optimizer
from . import util

__all__ = ['RProp', 'RMSProp', 'ADAGRAD', 'ADADELTA', 'ESGD', 'Adam']


class RProp(Optimizer):
    r'''Resilient backpropagation optimizer.

    Parameters
    ----------
    rprop_increase: float, optional (default 1.01)
        Increase step sizes at this rate when the gradient sign stays the same.
    rprop_decrease: float, optional (default 0.99)
        Decrease step sizes at this rate when the gradient sign changes.
    rprop_min_step: float, optional (default 0)
        Minimum step size for any parameter.
    rprop_max_step: float, optional (default 100)
        Maximum step size for any parameter.
    momentum: float, optional (default 0)
        Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
        Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
        momentum.
    nesterov: bool, optional (default False)
        Set this to ``True`` to enable Nesterov-style momentum updates,
        whenever ``momentum`` is nonzero.

    Notes
    -----
    The RProp method takes small steps in parameter space using local gradient
    information. RProp is unlike "vanilla" first-order techniques like
    :class:`SGD <downhill.first_order.SGD>`, however, because only the signs
    of the gradients are taken into account when making parameter updates.
    That is, the step size for each parameter is independent of the magnitude
    of the gradient for that parameter.

    To accomplish this, RProp maintains a separate learning rate for every
    parameter in the model, and adjusts this learning rate based on the
    consistency of the sign of the gradient over time. Whenever two
    consecutive gradients for a parameter have the same sign, the learning
    rate for that parameter increases, and whenever the signs disagree, the
    learning rate decreases. This has a similar effect to momentum-based
    stochastic gradient methods but effectively maintains parameter-specific
    learning rates.

    .. math::
        \begin{eqnarray*}
        && \mbox{if } \frac{\partial\mathcal{L}}{\partial p}_{t-1}
           \frac{\partial\mathcal{L}}{\partial p} > 0 \\
        && \qquad \Delta_t = \min (\eta_+\Delta_{t-1}, \Delta_+) \\
        && \mbox{if } \frac{\partial\mathcal{L}}{\partial p}_{t-1}
           \frac{\partial\mathcal{L}}{\partial p} < 0 \\
        && \qquad \Delta_t = \max (\eta_-\Delta_{t-1}, \Delta_-) \\
        && \qquad \frac{\partial\mathcal{L}}{\partial p} = 0 \\
        && p_{t+1} = p_t - \mbox{sgn}\left(
           \frac{\partial\mathcal{L}}{\partial p}\right) \Delta_t
        \end{eqnarray*}

    Here, :math:`\mbox{sgn}(\cdot)` is the sign function (i.e., it returns -1
    if its argument is negative and 1 otherwise), :math:`\eta_-` and
    :math:`\eta_+` are the rates at which the step size decreases (increases)
    when the gradients disagree (agree) in sign, and :math:`\Delta_+` and
    :math:`\Delta_-` are the maximum and minimum step sizes.

    The implementation here is actually the "iRprop-" variant of RProp
    described in Algorithm 4 from [Igel00]_. This variant resets the running
    gradient estimates to zero in cases where the previous and current
    gradients have switched signs.

    References
    ----------
    .. [Ried92] M. Riedmiller & H. Braun. (1992) "Rprop - A Fast Adaptive
       Learning Algorithm." In Proceedings of the International Symposium on
       Computer and Information Science VII.

    .. [Igel00] C. Igel & M. Hüsken. (2000) "Improving the Rprop Learning
       Algorithm." In Proceedings of the Second International Symposium on
       Neural Computation.
       http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.17.1332
    '''

    def _prepare(self,
                 rprop_increase=1.01,
                 rprop_decrease=0.99,
                 rprop_min_step=0,
                 rprop_max_step=100,
                 **kwargs):
        self.step_increase = util.as_float(rprop_increase)
        self.step_decrease = util.as_float(rprop_decrease)
        self.min_step = util.as_float(rprop_min_step)
        self.max_step = util.as_float(rprop_max_step)
        util.log_param('rprop_increase', rprop_increase)
        util.log_param('rprop_decrease', rprop_decrease)
        util.log_param('rprop_min_step', rprop_min_step)
        util.log_param('rprop_max_step', rprop_max_step)
        super(RProp, self)._prepare(**kwargs)

    def _get_updates_for(self, param, grad):
        grad_tm1 = util.shared_like(param, 'grad')
        step_tm1 = util.shared_like(param, 'step', self.learning_rate.eval())
        test = grad * grad_tm1
        diff = TT.lt(test, 0)
        steps = step_tm1 * (TT.eq(test, 0) +
                            TT.gt(test, 0) * self.step_increase +
                            diff * self.step_decrease)
        step = TT.minimum(self.max_step, TT.maximum(self.min_step, steps))
        grad = grad - diff * grad
        yield param, TT.sgn(grad) * step
        yield grad_tm1, grad
        yield step_tm1, step
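

# A minimal NumPy sketch of the iRprop- update above, for readers who want to
# see the rule outside of Theano. It assumes a dense parameter array and
# ignores the optional momentum machinery; the function name and signature are
# illustrative only and not part of the downhill API.
def _irprop_minus_sketch(param, grad, grad_tm1, step_tm1,
                         increase=1.01, decrease=0.99,
                         min_step=0.0, max_step=100.0):
    test = grad * grad_tm1
    # Grow steps where the gradient sign agrees with the previous one, shrink
    # them where it flips, and leave them unchanged where the product is zero.
    step = step_tm1 * np.where(test > 0, increase,
                               np.where(test < 0, decrease, 1.0))
    step = np.clip(step, min_step, max_step)
    # The iRprop- twist: zero out gradient entries whose sign just flipped.
    grad = np.where(test < 0, 0.0, grad)
    return param - np.sign(grad) * step, grad, step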


class ADAGRAD(Optimizer):
    r'''ADAGRAD optimizer.

    Parameters
    ----------
    rms_regularizer: float, optional (default 1e-8)
        Regularize the learning rate scaling factor by this :math:`\epsilon`.
    momentum: float, optional (default 0)
        Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
        Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
        momentum.
    nesterov: bool, optional (default False)
        Set this to ``True`` to enable Nesterov-style momentum updates,
        whenever ``momentum`` is nonzero.

    Notes
    -----
    The ADAGRAD method uses the same general strategy as all first-order
    stochastic gradient methods, in the sense that these methods make small
    parameter adjustments iteratively using local derivative information. The
    difference with ADAGRAD is that as gradients are computed during each
    parameter update, their squares are accumulated, and this accumulated
    value is used to rescale the global learning rate :math:`\alpha`
    separately for each parameter.

    .. math::
        \begin{eqnarray*}
        g_{t+1} &=& g_t +
           \left(\frac{\partial\mathcal{L}}{\partial p}\right)^2 \\
        p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{g_{t+1} + \epsilon}}
           \frac{\partial\mathcal{L}}{\partial p}
        \end{eqnarray*}

    Like the other adaptive learning methods, this method effectively
    maintains a sort of parameter-specific learning rate. Unlike
    :class:`RMSProp` and :class:`ADADELTA`, however, in ADAGRAD the gradient
    magnitudes accumulate throughout training, which scales the learning rate
    for each parameter but also effectively anneals the learning rate overall
    as training progresses.

    In this implementation, the scale values are regularized (made less
    extreme) by :math:`\epsilon`, which is specified using the
    ``rms_regularizer`` parameter.

    References
    ----------
    .. [Duch10] J. Duchi, E. Hazan, & Y. Singer. (2010) "Adaptive subgradient
       methods for online learning and stochastic optimization." In
       Proceedings of the Conference on Learning Theory (COLT).
    '''

    def _prepare(self, rms_regularizer=1e-8, **kwargs):
        self.epsilon = util.as_float(rms_regularizer)
        util.log_param('rms_regularizer', rms_regularizer)
        super(ADAGRAD, self)._prepare(**kwargs)

    def _get_updates_for(self, param, grad):
        g2_tm1 = util.shared_like(param, 'g2_acc')
        g2_t = g2_tm1 + grad * grad
        yield g2_tm1, g2_t
        yield param, grad * self.learning_rate / TT.sqrt(g2_t + self.epsilon)
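

# A minimal NumPy sketch of one ADAGRAD update, mirroring the Theano updates
# above (epsilon added inside the square root). Illustrative only; not part of
# the downhill API.
def _adagrad_sketch(param, grad, g2_acc, learning_rate=1e-4, epsilon=1e-8):
    g2_acc = g2_acc + grad * grad  # squared gradients accumulate forever
    return param - learning_rate * grad / np.sqrt(g2_acc + epsilon), g2_acc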


class RMSProp(Optimizer):
    r'''RMSProp optimizer.

    Parameters
    ----------
    learning_rate: float, optional (default 1e-4)
        Step size to take during optimization.
    rms_halflife: float, optional (default 14)
        Compute RMS gradient values using an exponentially weighted moving
        average that decays with this halflife.
    rms_regularizer: float, optional (default 1e-8)
        Regularize RMS gradient values by this :math:`\epsilon`.
    momentum: float, optional (default 0)
        Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
        Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
        momentum.
    nesterov: bool, optional (default False)
        Set this to ``True`` to enable Nesterov-style momentum updates,
        whenever ``momentum`` is nonzero.

    Notes
    -----
    The RMSProp method uses the same general strategy as all first-order
    stochastic gradient methods, in the sense that these methods make small
    parameter adjustments iteratively using local derivative information. The
    difference here is that as gradients are computed during each parameter
    update, an exponentially weighted moving average (EWMA) of gradient
    magnitudes is maintained as well. At each update, the EWMA is used to
    compute the root-mean-square (RMS) gradient value that's been seen in the
    recent past. The actual gradient is normalized by this RMS scaling factor
    before being applied to update the parameters. Intuitively, this makes
    RMSProp take steps near 1 (scaled by the learning rate) whenever the
    gradient is of constant magnitude, and larger steps whenever the local
    scale of the gradient starts to increase.

    .. math::
        \begin{eqnarray*}
        f_{t+1} &=& \gamma f_t + (1 - \gamma)
           \frac{\partial\mathcal{L}}{\partial p} \\
        g_{t+1} &=& \gamma g_t + (1 - \gamma)
           \left(\frac{\partial\mathcal{L}}{\partial p}\right)^2 \\
        p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{g_{t+1} - f_{t+1}^2 + \epsilon}}
           \frac{\partial\mathcal{L}}{\partial p}
        \end{eqnarray*}

    Like :class:`RProp`, this learning method effectively maintains a sort of
    parameter-specific momentum value, but this method takes into account both
    the sign and the magnitude of the gradient for each parameter.

    In this algorithm, RMS values are regularized (made less extreme) by
    :math:`\epsilon`, which is specified using the ``rms_regularizer`` keyword
    argument. The weight parameter :math:`\gamma` for the EWMA window is
    computed from the ``rms_halflife`` keyword argument, such that the actual
    EWMA weight varies inversely with the halflife :math:`h`:
    :math:`\gamma = e^{\frac{-\ln 2}{h}}`.

    The implementation here is taken from [Grav13]_, equations (38)--(45).
    Graves' implementation in particular seems to have introduced the
    :math:`f_t` terms into the RMS computation; these terms appear to act as a
    sort of momentum for the RMS values.

    References
    ----------
    .. [Grav13] A. Graves. (2013) "Generating Sequences With Recurrent Neural
       Networks." http://arxiv.org/abs/1308.0850
    '''

    def _prepare(self, rms_halflife=14, rms_regularizer=1e-8, **kwargs):
        self.ewma = util.as_float(np.exp(-np.log(2) / rms_halflife))
        self.epsilon = util.as_float(rms_regularizer)
        util.log_param('rms_halflife', rms_halflife)
        util.log_param('rms_regularizer', rms_regularizer)
        super(RMSProp, self)._prepare(**kwargs)

    def _get_updates_for(self, param, grad):
        g1_tm1 = util.shared_like(param, 'g1_ewma')
        g2_tm1 = util.shared_like(param, 'g2_ewma')
        g1_t = self.ewma * g1_tm1 + (1 - self.ewma) * grad
        g2_t = self.ewma * g2_tm1 + (1 - self.ewma) * grad * grad
        rms = TT.sqrt(g2_t - g1_t * g1_t + self.epsilon)
        yield g1_tm1, g1_t
        yield g2_tm1, g2_t
        yield param, self.learning_rate * grad / rms
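

# A minimal NumPy sketch of the Graves-style RMSProp update above, with the
# EWMA weight gamma derived from a halflife exactly as in _prepare.
# Illustrative only; not part of the downhill API.
def _rmsprop_sketch(param, grad, g1, g2,
                    learning_rate=1e-4, halflife=14, epsilon=1e-8):
    gamma = np.exp(-np.log(2) / halflife)
    g1 = gamma * g1 + (1 - gamma) * grad         # EWMA of gradients
    g2 = gamma * g2 + (1 - gamma) * grad * grad  # EWMA of squared gradients
    rms = np.sqrt(g2 - g1 * g1 + epsilon)        # centered RMS scale
    return param - learning_rate * grad / rms, g1, g2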


class ADADELTA(RMSProp):
    r'''ADADELTA optimizer.

    Parameters
    ----------
    rms_halflife: float, optional (default 14)
        Compute RMS gradient values using an exponentially weighted moving
        average that decays with this halflife.
    rms_regularizer: float, optional (default 1e-8)
        Regularize RMS gradient values by this :math:`\epsilon`.
    momentum: float, optional (default 0)
        Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
        Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
        momentum.
    nesterov: bool, optional (default False)
        Set this to ``True`` to enable Nesterov-style momentum updates,
        whenever ``momentum`` is nonzero.

    Notes
    -----
    The ADADELTA method uses the same general strategy as all first-order
    stochastic gradient methods, in the sense that these methods make small
    parameter adjustments iteratively using local derivative information. The
    difference with ADADELTA is that as gradients are computed during each
    parameter update, an exponentially weighted moving average (EWMA) of
    squared gradient values, as well as an EWMA of recent squared parameter
    steps, are maintained as well. The actual gradient is normalized by the
    ratio of the root-mean-square (RMS) parameter step size to the RMS
    gradient magnitude.

    .. math::
        \begin{eqnarray*}
        g_{t+1} &=& \gamma g_t + (1 - \gamma)
           \left(\frac{\partial\mathcal{L}}{\partial p}\right)^2 \\
        v_{t+1} &=& \frac{\sqrt{x_t + \epsilon}}{\sqrt{g_{t+1} + \epsilon}}
           \frac{\partial\mathcal{L}}{\partial p} \\
        x_{t+1} &=& \gamma x_t + (1 - \gamma) v_{t+1}^2 \\
        p_{t+1} &=& p_t - v_{t+1}
        \end{eqnarray*}

    Like :class:`RProp` and the :class:`RMSProp`--:class:`ESGD` family, this
    learning method effectively maintains a sort of parameter-specific
    momentum value. The primary difference between this method and
    :class:`RMSProp` is that ADADELTA additionally incorporates a sliding
    window of RMS parameter step sizes (somewhat) obviating the need for a
    learning rate parameter.

    In this implementation, the RMS values are regularized (made less extreme)
    by :math:`\epsilon`, which is specified using the ``rms_regularizer``
    parameter. The weight parameter :math:`\gamma` for the EWMA window is
    computed from the ``rms_halflife`` keyword argument, such that the actual
    EWMA weight varies inversely with the halflife :math:`h`:
    :math:`\gamma = e^{\frac{-\ln 2}{h}}`.

    References
    ----------
    .. [Zeil12] M. Zeiler. (2012) "ADADELTA: An adaptive learning rate
       method." http://arxiv.org/abs/1212.5701
    '''

    def _get_updates_for(self, param, grad):
        x2_tm1 = util.shared_like(param, 'x2_ewma')
        g2_tm1 = util.shared_like(param, 'g2_ewma')
        g2_t = self.ewma * g2_tm1 + (1 - self.ewma) * grad * grad
        delta = grad * TT.sqrt(x2_tm1 + self.epsilon) / \
            TT.sqrt(g2_t + self.epsilon)
        x2_t = self.ewma * x2_tm1 + (1 - self.ewma) * delta * delta
        yield g2_tm1, g2_t
        yield x2_tm1, x2_t
        yield param, delta
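

# A minimal NumPy sketch of one ADADELTA update. Note that no learning rate
# appears: the EWMA of squared past steps (x2) supplies the step scale.
# Illustrative only; not part of the downhill API.
def _adadelta_sketch(param, grad, g2, x2, halflife=14, epsilon=1e-8):
    gamma = np.exp(-np.log(2) / halflife)
    g2 = gamma * g2 + (1 - gamma) * grad * grad
    delta = grad * np.sqrt(x2 + epsilon) / np.sqrt(g2 + epsilon)
    x2 = gamma * x2 + (1 - gamma) * delta * delta
    return param - delta, g2, x2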


class ESGD(RMSProp):
    r'''Equilibrated SGD computes a diagonal Hessian preconditioner.

    Parameters
    ----------
    hv_method: {'rop', 'lop', 'grad'}, optional
        The Hv (Hessian-vector) product will be computed using the given
        method. The default is to use 'rop'.
    learning_rate: float, optional (default 1e-4)
        Step size to take during optimization.
    rms_halflife: float, optional (default 14)
        Compute RMS gradient values using an exponentially weighted moving
        average that decays with this halflife.
    rms_regularizer: float, optional (default 1e-8)
        Regularize RMS gradient values by this :math:`\epsilon`.
    momentum: float, optional (default 0)
        Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
        Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
        momentum.
    nesterov: bool, optional (default False)
        Set this to ``True`` to enable Nesterov-style momentum updates,
        whenever ``momentum`` is nonzero.

    Notes
    -----
    The ESGD method uses the same general strategy as all first-order
    stochastic gradient methods, in the sense that these methods make small
    parameter adjustments iteratively using local derivative information. The
    difference here is that as gradients are computed during each parameter
    update, an exponentially weighted moving average (EWMA) of estimates of
    the diagonal of the Hessian (the matrix of second derivatives) is
    maintained as well. At each update, the EWMA is used to compute the
    root-mean-square (RMS) diagonal value that's been seen in the recent past.
    The actual gradient is scaled by the inverse of this diagonal
    preconditioner before being applied to update the parameters.
    Intuitively, this causes the algorithm to "reshape" the loss function in
    parameter space, such that directions of steep gradient (i.e., large
    diagonal values) and directions of shallow gradient (i.e., small diagonal
    values) are scaled to have approximately the same slope.

    The diagonal estimates are computed using a nice trick: a vector
    :math:`r \sim \mathcal{N}(0, 1)` of standard normal values is sampled
    randomly at each update step, and the value of :math:`Hr` is computed
    symbolically. These values, squared and averaged over time, tend to
    approximate the squared diagonal of the Hessian. Because :math:`Hr` is
    itself a vector, the full Hessian :math:`H` never needs to be computed or
    stored.

    .. math::
        \begin{eqnarray*}
        r &\sim& \mathcal{N}(0, 1) \\
        Hr &=& \frac{\partial^2 \mathcal{L}}{\partial p^2} r \\
        D_{t+1} &=& \gamma D_t + (1 - \gamma) (Hr)^2 \\
        p_{t+1} &=& p_t - \frac{\alpha}{\sqrt{D_{t+1}} + \epsilon}
           \frac{\partial\mathcal{L}}{\partial p}
        \end{eqnarray*}

    Like :class:`RProp` and the :class:`ADADELTA`--:class:`RMSProp` family,
    this learning method effectively maintains a sort of parameter-specific
    learning rate for each parameter in the loss.

    In this implementation, :math:`\epsilon` regularizes the RMS values; it is
    specified using the ``rms_regularizer`` parameter. The weight parameter
    :math:`\gamma` for the EWMA is computed from the ``rms_halflife`` keyword
    argument, such that the actual EWMA weight varies inversely with the
    halflife :math:`h`: :math:`\gamma = e^{\frac{-\ln 2}{h}}`.

    The primary difference between this implementation and the algorithm
    described in the paper (see below) is the use of an EWMA to decay the
    diagonal values over time, while in the paper the diagonal is divided by
    the training iteration. The EWMA halflife should be set to something
    reasonably large to ensure that this method emulates the method described
    in the original paper.

    References
    ----------
    .. [Daup14] Y. Dauphin, H. de Vries, J. Chung & Y. Bengio. (2014)
       "RMSProp and equilibrated adaptive learning rates for non-convex
       optimization." http://arxiv.org/abs/1502.04390
    '''

    def __init__(self, *args, **kwargs):
        self.rng = RandomStreams()
        self.hv_method = kwargs.pop('hv_method', 'rop').lower()
        assert self.hv_method in ('rop', 'lop', 'grad')
        super(ESGD, self).__init__(*args, **kwargs)

    def _get_updates_for(self, param, grad):
        D_tm1 = util.shared_like(param, 'D_ewma')
        v = self.rng.normal(param.shape)
        if self.hv_method == 'rop':
            Hv = TT.Rop(grad, param, v)
        elif self.hv_method == 'lop':
            Hv = TT.Lop(grad, param, v)
        elif self.hv_method == 'grad':
            Hv = TT.grad(TT.sum(grad * v), param)
        D_t = self.ewma * D_tm1 + (1 - self.ewma) * Hv * Hv
        denom = TT.sqrt(D_t) + self.epsilon
        yield D_tm1, D_t
        yield param, grad * self.learning_rate / denom
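

# A minimal NumPy sketch of the equilibration estimate, for the special case
# of a quadratic loss 0.5 * p.T @ A @ p, where the Hessian is simply A and the
# Hessian-vector product Hr can be formed directly rather than via Theano's
# R-operator. Illustrative only; not part of the downhill API.
def _esgd_sketch(param, A, D, learning_rate=1e-4, halflife=14, epsilon=1e-8,
                 rng=np.random):
    gamma = np.exp(-np.log(2) / halflife)
    grad = A.dot(param)                    # gradient of the quadratic loss
    r = rng.standard_normal(param.shape)   # r ~ N(0, 1)
    Hr = A.dot(r)                          # Hessian-vector product
    D = gamma * D + (1 - gamma) * Hr * Hr  # EWMA of squared Hr values
    return param - learning_rate * grad / (np.sqrt(D) + epsilon), D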


class Adam(RMSProp):
    r'''Adam optimizer using unbiased gradient moment estimates.

    Parameters
    ----------
    learning_rate: float, optional (default 1e-4)
        Step size to take during optimization.
    beta1_decay: float, optional (default 1 - 1e-6)
        Extend the :math:`\beta_1` halflife by this amount after every update.
    beta1_halflife: float, optional (default 7)
        Compute first-moment (mean) gradient estimates using an exponentially
        weighted moving average that decays with this halflife.
    beta2_halflife: float, optional (default 69)
        Compute second-moment (squared-magnitude) gradient estimates using an
        exponentially weighted moving average that decays with this halflife.
    rms_regularizer: float, optional (default 1e-8)
        Regularize RMS gradient values by this :math:`\epsilon`.
    momentum: float, optional (default 0)
        Momentum to apply to the updates, if any. Defaults to 0 (no momentum).
        Set to a value close to 1 (e.g., 1 - 1e-4) for large amounts of
        momentum.
    nesterov: bool, optional (default False)
        Set this to ``True`` to enable Nesterov-style momentum updates,
        whenever ``momentum`` is nonzero.

    Notes
    -----
    The Adam method uses the same general strategy as all first-order
    stochastic gradient methods, in the sense that these methods make small
    parameter adjustments iteratively using local derivative information. The
    difference here is that as gradients are computed during each parameter
    update, exponentially weighted moving averages (EWMAs) of (1) the first
    moment of the recent gradient values and (2) the second moment of recent
    gradient values are maintained as well. At each update, the step taken is
    proportional to the ratio of the first moment to the second moment.

    .. math::
        \begin{eqnarray*}
        \beta_1^t &=& \beta_1 \lambda^t \\
        f_{t+1} &=& \beta_1^t f_t + (1 - \beta_1^t)
           \frac{\partial\mathcal{L}}{\partial\theta} \\
        g_{t+1} &=& \beta_2 g_t + (1 - \beta_2)
           \left(\frac{\partial\mathcal{L}}{\partial\theta}\right)^2 \\
        \theta_{t+1} &=& \theta_t - \frac{f_{t+1} / (1 - \beta_1^t)}
           {\sqrt{g_{t+1} / (1 - \beta_2^t)} + \epsilon}
        \end{eqnarray*}

    Like all adaptive optimization algorithms, this optimizer effectively
    maintains a sort of parameter-specific momentum value. It shares with
    :class:`RMSProp` and :class:`ADADELTA` the idea of using an EWMA to track
    recent quantities related to the stochastic gradient during optimization.
    But the Adam method is unique in that it incorporates an explicit
    computation to remove the bias from these estimates.

    In this implementation, :math:`\epsilon` regularizes the RMS values and is
    given using the ``rms_regularizer`` keyword argument. The weight
    parameters :math:`\beta_1` and :math:`\beta_2` for the first and second
    EWMA windows are computed from the ``beta1_halflife`` and
    ``beta2_halflife`` keyword arguments, respectively, such that the actual
    EWMA weight varies inversely with the halflife :math:`h`:
    :math:`\gamma = e^{\frac{-\ln 2}{h}}`. The decay :math:`\lambda` for the
    :math:`\beta_1` EWMA is provided by the ``beta1_decay`` keyword argument.

    The implementation here is taken from Algorithm 1 of [King15]_.

    References
    ----------
    .. [King15] D. Kingma & J. Ba. (2015) "Adam: A Method for Stochastic
       Optimization." In Proceedings of the International Conference on
       Learning Representations (ICLR). http://arxiv.org/abs/1412.6980
    '''

    def _prepare(self, beta1_halflife=7, beta2_halflife=69, **kwargs):
        self.beta1 = util.as_float(np.exp(-np.log(2) / beta1_halflife))
        self.beta2 = util.as_float(np.exp(-np.log(2) / beta2_halflife))
        super(Adam, self)._prepare(**kwargs)

    def _get_updates_for(self, param, grad):
        t_tm1 = theano.shared(np.cast['float32'](0), 't')
        t_t = 1 + t_tm1
        g1_tm1 = util.shared_like(param, 'g1_ewma')
        g2_tm1 = util.shared_like(param, 'g2_ewma')
        g1_t = self.beta1 * g1_tm1 + (1 - self.beta1) * grad
        g2_t = self.beta2 * g2_tm1 + (1 - self.beta2) * grad * grad
        numer = g1_t / (1 - self.beta1 ** t_t)
        denom = TT.sqrt(g2_t / (1 - self.beta2 ** t_t))
        yield t_tm1, t_t
        yield g1_tm1, g1_t
        yield g2_tm1, g2_t
        yield param, self.learning_rate * numer / (denom + self.epsilon)
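

# A minimal NumPy sketch of one bias-corrected Adam update as implemented
# above, using the conventional beta values from the paper rather than the
# halflife parameterization, and omitting the lambda decay. Illustrative only;
# not part of the downhill API.
def _adam_sketch(param, grad, g1, g2, t, learning_rate=1e-4,
                 beta1=0.9, beta2=0.999, epsilon=1e-8):
    t = t + 1
    g1 = beta1 * g1 + (1 - beta1) * grad         # first-moment EWMA
    g2 = beta2 * g2 + (1 - beta2) * grad * grad  # second-moment EWMA
    numer = g1 / (1 - beta1 ** t)                # bias-corrected mean
    denom = np.sqrt(g2 / (1 - beta2 ** t))       # bias-corrected RMS
    return param - learning_rate * numer / (denom + epsilon), g1, g2, t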