Source code for downhill.base

# -*- coding: utf-8 -*-

'''This module defines a base class for optimization techniques.'''

import click
import collections
import numpy as np
import theano
import theano.tensor as TT
import warnings

from . import util


def build(algo, loss, params=None, inputs=None, updates=(), monitors=(),
          monitor_gradients=False):
    '''Construct an optimizer by name.

    Parameters
    ----------
    algo : str
        The name of the optimization algorithm to build.
    loss : Theano expression
        Loss function to minimize. This must be a scalar-valued expression.
    params : list of Theano variables, optional
        Symbolic variables to adjust to minimize the loss. If not given, these
        will be computed automatically by walking the computation graph.
    inputs : list of Theano variables, optional
        Symbolic variables required to compute the loss. If not given, these
        will be computed automatically by walking the computation graph.
    updates : list of update pairs, optional
        A list of pairs providing updates for the internals of the loss
        computation. Normally this is empty, but it can be provided if the
        loss, for example, requires an update to an internal random number
        generator.
    monitors : dict or sequence of (str, Theano expression) tuples, optional
        Additional values to monitor during optimization. These must be
        provided as either a sequence of (name, expression) tuples, or as a
        dictionary mapping string names to Theano expressions.
    monitor_gradients : bool, optional
        If True, add monitors to log the norms of the parameter gradients
        during optimization. Defaults to False.

    Returns
    -------
    optimizer : :class:`Optimizer`
        An optimizer instance.
    '''
    return Optimizer.build(algo, loss, params, inputs,
                           updates=updates, monitors=monitors,
                           monitor_gradients=monitor_gradients)

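# Illustrative usage sketch (not part of the original module). All names below
# are hypothetical; the "training data" is simply a list of argument tuples,
# one per mini-batch, matching the symbolic inputs of the loss:
#
#     import numpy as np
#     import theano
#     import theano.tensor as TT
#     import downhill
#
#     x = TT.dmatrix('x')
#     w = theano.shared(np.zeros((3, 1)), name='w')
#     loss = TT.sqr(TT.dot(x, w) - 1).mean()
#
#     opt = downhill.build('adam', loss=loss)
#     batches = [(np.random.randn(8, 3),) for _ in range(10)]
#     opt.minimize(batches, learning_rate=1e-2, max_updates=100)
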
class Optimizer(util.Registrar(str('Base'), (), {})):
    '''An optimizer computes gradient updates to iteratively optimize a loss.

    Attributes
    ----------
    patience : int, optional
        Number of validation "failures" that we are willing to tolerate before
        stopping the optimization process. A validation failure happens
        whenever the loss on the validation dataset decreases by less than
        ``min_improvement`` (relative) over the previous best validation loss.
        Defaults to 5.
    validate_every : int, optional
        Evaluate the loss on the validation dataset after making this many
        passes over the training data. Defaults to 10.
    min_improvement : float, optional
        Insist that the validation loss must improve by this relative amount
        before considering that the optimization has made progress. The
        optimization process halts when ``patience`` validations have failed
        to make this relative improvement. Defaults to 0; set to a larger
        value (e.g., 0.01 for 1% improvement) to halt the optimization process
        sooner.
    max_gradient_norm : float, optional
        Rescale each parameter's gradient so that it has at most this L2 norm.
        Set to 0 (the default) to disable norm rescaling. If
        ``max_gradient_elem`` is also specified, then this has no effect.
    max_gradient_elem : float, optional
        Perform elementwise clipping on the magnitude of gradient values. Set
        to 0 (the default) to disable. If elementwise clipping is enabled,
        norm rescaling (via ``max_gradient_norm``) will have no effect.
        Deprecated synonyms of this parameter are "max_gradient_clip" and
        "gradient_clip".
    learning_rate : float, optional
        Many SGD-based optimization algorithms require a learning rate
        hyperparameter that scales the gradient step. Defaults to 1e-4.
    momentum : float, optional
        Apply momentum to the parameter updates for this optimizer, with the
        given strength. Typically this value ranges from 0 (no momentum) to
        :math:`1 - \epsilon` (large momentum). Defaults to 0.
    nesterov : bool, optional
        If True, and ``momentum`` is nonzero, apply Nesterov-style momentum to
        parameter updates for this optimizer. If False, and ``momentum`` is
        nonzero, "regular" momentum is applied. Has no effect if ``momentum``
        is zero. See :class:`NAG <downhill.NAG>` for a description of Nesterov
        momentum.

    Parameters
    ----------
    loss : Theano expression
        Loss function to minimize. This must be a scalar-valued expression.
    params : list of Theano variables, optional
        Symbolic variables to adjust to minimize the loss. If not given, these
        will be computed automatically by walking the computation graph.
    inputs : list of Theano variables, optional
        Symbolic variables required to compute the loss. If not given, these
        will be computed automatically by walking the computation graph.
    updates : list of update pairs, optional
        A list of pairs providing updates for the internals of the loss
        computation. Normally this is empty, but it can be provided if the
        loss, for example, requires an update to an internal random number
        generator.
    monitors : sequence of (str, Theano expression) tuples, optional
        Additional values to monitor during optimization. These must be
        provided as a sequence of (name, expression) tuples.
    monitor_gradients : bool, optional
        If True, add monitors to log the norms of the parameter gradients
        during optimization. Defaults to False.
    '''

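    # Hyperparameter sketch (illustrative, not part of the original module):
    # the attributes documented above are set per optimization run as keyword
    # arguments, for example:
    #
    #     opt.minimize(train, valid,
    #                  learning_rate=1e-3, momentum=0.9, nesterov=True,
    #                  patience=5, min_improvement=0.01,
    #                  max_gradient_norm=1.0)
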
    def __init__(self, loss, params=None, inputs=None, updates=(),
                 monitors=(), monitor_gradients=False):
        inputs_, params_ = util.find_inputs_and_params(loss)
        self._loss = loss
        self._params = params or params_
        self._inputs = inputs or inputs_
        self._updates = updates
        self._shapes = [p.get_value(borrow=True).shape for p in self._params]
        self._counts = [np.prod(s) for s in self._shapes]
        self._starts = np.cumsum([0] + self._counts)[:-1]
        self._dtype = self._params[0].get_value().dtype
        self._curr_iter = 0
        self._best_iter = 0
        self._best_loss = 1e100
        self._best_params = [p.get_value().copy() for p in self._params]
        self._monitor_exprs = [self._loss]
        self._monitor_names = ['loss']
        for name, monitor in monitors:
            self._monitor_names.append(name)
            self._monitor_exprs.append(monitor)
        if monitor_gradients:
            unnamed = 0
            for p, g in zip(self._params, TT.grad(self._loss, self._params)):
                name = p.name
                if not name:
                    name = 'unnamed{}'.format(unnamed)
                    unnamed += 1
                    util.log('"{}" unnamed, will be "{}" internally'.format(p, name))
                self._monitor_names.append('grad({})'.format(name))
                self._monitor_exprs.append((g * g).sum())

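    # Illustrative sketch (not part of the original module): attaching extra
    # monitors at construction time. ``loss`` and ``w`` are the hypothetical
    # expressions from the sketch above ``build``; monitors are given as
    # (name, expression) pairs:
    #
    #     opt = downhill.build(
    #         'rmsprop', loss=loss,
    #         monitors=[('w_norm', TT.sqrt((w * w).sum()))],
    #         monitor_gradients=True)
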
    def _compile(self, **kwargs):
        '''Compile the Theano functions for evaluating and updating our model.
        '''
        util.log('compiling evaluation function')
        self.f_eval = theano.function(self._inputs,
                                      self._monitor_exprs,
                                      updates=self._updates,
                                      name='evaluation')
        label = self.__class__.__name__
        util.log('compiling {} optimizer'.format(click.style(label, fg='red')))
        updates = list(self._updates) + list(self.get_updates(**kwargs))
        self.f_step = theano.function(self._inputs,
                                      self._monitor_exprs,
                                      updates=updates,
                                      name=label)

    def get_updates(self, **kwargs):
        '''Get parameter update expressions for performing optimization.

        Keyword arguments can be applied here to set any of the global
        optimizer attributes.

        Yields
        ------
        updates : (parameter, expression) tuples
            A sequence of parameter updates to be applied during optimization.
        '''
        self._prepare(**kwargs)
        for param, grad in self._differentiate():
            for var, update in self._get_updates_for(param, grad):
                # For auxiliary variables, updates are meant to replace the
                # existing variable value.
                if var != param:
                    yield var, update
                    continue
                # If momentum is disabled, just apply the parameter delta.
                if self.momentum == 0:
                    yield var, param - update
                    continue
                # Momentum is enabled, so we keep track of velocity here.
                vel_tm1 = util.shared_like(param, 'vel')
                vel_t = util.as_float(self.momentum) * vel_tm1 - update
                if self.nesterov:
                    # see http://arxiv.org/pdf/1212.0901v2.pdf (eq 7) and
                    # https://github.com/lisa-lab/pylearn2/pull/136#issuecomment-10381617
                    mom_sqr = util.as_float(self.momentum ** 2)
                    mom_inc = util.as_float(1 + self.momentum)
                    vel_t = mom_sqr * vel_tm1 - mom_inc * update
                yield vel_tm1, vel_t
                yield param, param + vel_t

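    # Numerical sketch (illustrative only) of the velocity rules implemented
    # above, with mu = momentum and ``step`` standing in for the raw update
    # computed by a subclass (e.g., learning_rate * gradient):
    #
    #     import numpy as np
    #     mu, v, step = 0.9, np.array(0.0), np.array(0.2)
    #     v_classic = mu * v - step                   # -> -0.2
    #     v_nesterov = mu ** 2 * v - (1 + mu) * step  # -> -0.38
    #     # in both cases the parameter then moves by p + v.
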
    def _get_updates_for(self, param, grad):
        '''Generate some update pairs for the given model parameter.

        Yields
        ------
        updates : (parameter, expression) tuples
            A sequence of parameter updates to be applied during optimization.
        '''
        raise NotImplementedError

    def _differentiate(self, params=None):
        '''Return a sequence of gradients for our parameters.

        If this optimizer has been configured with a gradient norm limit, or
        with elementwise gradient clipping, this method applies the
        appropriate rescaling and clipping operations before returning the
        gradient.

        Parameters
        ----------
        params : list of Theano variables, optional
            Return the gradient with respect to these parameters. Defaults to
            all parameters that the optimizer knows about.

        Yields
        ------
        pairs : (param, grad) tuples
            Generates a sequence of tuples representing each of the parameters
            requested and the corresponding Theano gradient expressions.
        '''
        if params is None:
            params = self._params
        for param, grad in zip(params, TT.grad(self._loss, params)):
            if self.max_gradient_elem > 0:
                limit = util.as_float(self.max_gradient_elem)
                yield param, TT.clip(grad, -limit, limit)
            elif self.max_gradient_norm > 0:
                norm = TT.sqrt((grad * grad).sum())
                limit = util.as_float(self.max_gradient_norm)
                yield param, grad * TT.minimum(1, limit / norm)
            else:
                yield param, grad

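    # Numerical sketch (illustrative only) contrasting the two clipping modes
    # above for a gradient with L2 norm 5 and a limit of 1:
    #
    #     import numpy as np
    #     g = np.array([3.0, -4.0])
    #     np.clip(g, -1, 1)                  # elementwise  -> [ 1., -1.]
    #     g * min(1, 1 / np.linalg.norm(g))  # norm rescale -> [ 0.6, -0.8]
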
    def set_params(self, targets=None):
        '''Set the values of the parameters to the given target values.

        Parameters
        ----------
        targets : sequence of ndarray, optional
            Arrays for setting the parameters of our model. If this is not
            provided, the current best parameters for this optimizer will be
            used.
        '''
        if not isinstance(targets, (list, tuple)):
            targets = self._best_params
        for param, target in zip(self._params, targets):
            param.set_value(target)

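    # Usage note (illustrative): ``iterate`` calls ``set_params('best')`` when
    # it finishes; because a string is neither a list nor a tuple, the stored
    # best parameters are restored. Explicit arrays also work, e.g. with the
    # hypothetical (3, 1) parameter from the sketch above ``build``:
    #
    #     opt.set_params([np.zeros((3, 1))])
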
    def _log(self, monitors, iteration, label='', suffix=''):
        '''Log the state of the optimizer on the console.

        Parameters
        ----------
        monitors : OrderedDict
            A dictionary of monitor names mapped to values. These names and
            values are what is being logged.
        iteration : int
            Optimization iteration that we are logging.
        label : str, optional
            A label for the name of the optimizer creating the log line.
            Defaults to the name of the current class.
        suffix : str, optional
            A suffix to add to the end of the log line, if any.
        '''
        label = label or self.__class__.__name__
        fields = (('{}={:.6f}').format(k, v) for k, v in monitors.items())
        util.log('{} {} {}{}'.format(label, iteration, ' '.join(fields), suffix))

    def evaluate(self, dataset):
        '''Evaluate the current model parameters on a dataset.

        Parameters
        ----------
        dataset : :class:`Dataset <downhill.dataset.Dataset>`
            A set of data to use for evaluating the model.

        Returns
        -------
        monitors : OrderedDict
            A dictionary mapping monitor names to values. Monitors are
            quantities of interest during optimization---for example, loss
            function, accuracy, or whatever the optimization task requires.
        '''
        if dataset is None:
            values = [self.f_eval()]
        else:
            values = [self.f_eval(*x) for x in dataset]
        monitors = zip(self._monitor_names, np.mean(values, axis=0))
        return collections.OrderedDict(monitors)

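    # Illustrative sketch (not part of the original module): reading monitor
    # values on a held-out set, assuming the optimizer's functions have
    # already been compiled (which happens inside ``iterate``) and ``valid``
    # is a hypothetical sequence of mini-batch tuples:
    #
    #     metrics = opt.evaluate(valid)
    #     print(metrics['loss'])
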
    def _test_patience(self, monitors):
        '''Test whether our patience with optimization has elapsed.

        Parameters
        ----------
        monitors : dict
            A dictionary mapping monitor names to values. The 'loss' key from
            this dictionary will be used to evaluate optimization progress.

        Returns
        -------
        elapsed : bool
            True iff our patience has elapsed and the model is no longer
            improving.
        '''
        self._curr_iter += 1
        marker = ''
        loss = monitors['loss']
        if self._best_loss - loss > self._best_loss * self.min_improvement:
            self._best_loss = loss
            self._best_iter = self._curr_iter
            self._best_params = [p.get_value().copy() for p in self._params]
            marker = ' *'
        self._log(monitors, self._curr_iter - 1, 'validation', marker)
        return self._curr_iter - self._best_iter > self.patience

    def _prepare(self, **kwargs):
        '''Set up properties for optimization.

        This method can be overridden by subclasses to provide parameters that
        are specific to a particular optimization technique (e.g., setting up
        a learning rate value).
        '''
        self.learning_rate = util.as_float(kwargs.pop('learning_rate', 1e-4))
        self.momentum = kwargs.pop('momentum', 0)
        self.nesterov = kwargs.pop('nesterov', False)
        self.patience = kwargs.get('patience', 5)
        self.validate_every = kwargs.pop('validate_every', 10)
        self.min_improvement = kwargs.pop('min_improvement', 0)
        self.max_gradient_norm = kwargs.pop('max_gradient_norm', 0)
        self.max_gradient_elem = kwargs.pop('max_gradient_elem', 0)
        util.log_param('patience', self.patience)
        util.log_param('validate_every', self.validate_every)
        util.log_param('min_improvement', self.min_improvement)
        util.log_param('max_gradient_norm', self.max_gradient_norm)
        util.log_param('max_gradient_elem', self.max_gradient_elem)
        util.log_param('learning_rate', self.learning_rate)
        util.log_param('momentum', self.momentum)
        util.log_param('nesterov', self.nesterov)

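    # Numeric sketch of the improvement test above: with min_improvement=0.01
    # and a best loss of 2.0, a new validation loss must fall below
    # 2.0 - 2.0 * 0.01 = 1.98 to count as progress and reset patience;
    # otherwise self._curr_iter - self._best_iter keeps growing until it
    # exceeds ``patience`` and optimization stops.
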
    def iterate(self, train=None, valid=None, max_updates=None, **kwargs):
        r'''Optimize a loss iteratively using a training and validation dataset.

        This method yields a series of monitor values to the caller. After
        every optimization epoch, a pair of monitor dictionaries is generated:
        one evaluated on the training dataset during the epoch, and another
        evaluated on the validation dataset at the most recent validation
        epoch.

        The validation monitors might not be updated during every optimization
        iteration; in this case, the most recent validation monitors will be
        yielded along with the training monitors.

        Additional keyword arguments supplied here will set the global
        optimizer attributes.

        Parameters
        ----------
        train : sequence or :class:`Dataset <downhill.dataset.Dataset>`
            A set of training data for computing updates to model parameters.
        valid : sequence or :class:`Dataset <downhill.dataset.Dataset>`
            A set of validation data for computing monitor values and
            determining when the loss has stopped improving. Defaults to the
            training data.
        max_updates : int, optional
            If specified, halt optimization after this many gradient updates
            have been processed. If not provided, uses early stopping to
            decide when to halt.

        Yields
        ------
        train_monitors : dict
            A dictionary mapping monitor names to values, evaluated on the
            training dataset.
        valid_monitors : dict
            A dictionary containing monitor values evaluated on the validation
            dataset.
        '''
        self._compile(**kwargs)
        if valid is None:
            valid = train
        iteration = 0
        training = validation = None
        while max_updates is None or iteration < max_updates:
            if not iteration % self.validate_every:
                try:
                    validation = self.evaluate(valid)
                except KeyboardInterrupt:
                    util.log('interrupted!')
                    break
                if self._test_patience(validation):
                    util.log('patience elapsed!')
                    break
            try:
                training = self._step(train)
            except KeyboardInterrupt:
                util.log('interrupted!')
                break
            iteration += 1
            self._log(training, iteration)
            yield training, validation
        self.set_params('best')

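    # Illustrative sketch (not part of the original module): consuming the
    # monitor stream. ``train`` and ``valid`` are hypothetical sequences of
    # mini-batch tuples (or Dataset objects):
    #
    #     for t_monitors, v_monitors in opt.iterate(train, valid,
    #                                               patience=3,
    #                                               min_improvement=0.01):
    #         print(t_monitors['loss'], v_monitors['loss'])
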
    def minimize(self, *args, **kwargs):
        '''Optimize our loss exhaustively.

        This method is a thin wrapper over the :func:`iterate` method. It
        simply exhausts the iterative optimization process and returns the
        final monitor values.

        Returns
        -------
        train_monitors : dict
            A dictionary mapping monitor names to values, evaluated on the
            training dataset.
        valid_monitors : dict
            A dictionary containing monitor values evaluated on the validation
            dataset.
        '''
        monitors = None
        for monitors in self.iterate(*args, **kwargs):
            pass
        return monitors

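    # Illustrative one-shot call (not part of the original module), equivalent
    # to exhausting ``iterate``:
    #
    #     t_monitors, v_monitors = opt.minimize(train, valid, max_updates=1000)
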
    def _step(self, dataset):
        '''Advance the state of the optimizer by one step.

        Parameters
        ----------
        dataset : :class:`Dataset <downhill.dataset.Dataset>`
            A dataset for optimizing the model.

        Returns
        -------
        train_monitors : dict
            A dictionary mapping monitor names to values.
        '''
        if dataset is None:
            values = [self.f_step()]
        else:
            values = [self.f_step(*x) for x in dataset]
        return collections.OrderedDict(
            zip(self._monitor_names, np.mean(values, axis=0)))