Module smearn.optimization

The smearn.optimization module provides gradient-based optimizers for neural networks (stochastic gradient descent, SGD with momentum, AdaGrad, RMSProp, and Adam), together with a scheduler system for varying hyperparameters, such as the learning rate, over the course of training.
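
As a rough usage sketch (the training loop that normally drives these optimizers lives elsewhere in smearn, so the FakeSymbol class below is only a hand-rolled stand-in exposing the value, gradient and shape attributes the optimizers read):

import numpy as np
from smearn.optimization import Adam, LinearDecrease

class FakeSymbol:
    # Stand-in for a smearn symbol: the optimizers only touch
    # `value`, `gradient` (batch axis first) and `shape`.
    def __init__(self, value, gradient):
        self.value = value
        self.gradient = gradient
        self.shape = value.shape

w = FakeSymbol(np.zeros(3), np.ones((4, 3)))    # a batch of 4 gradients for a 3-vector parameter
opt = Adam(lr=LinearDecrease(1e-3, 1e-4, 50))   # learning rate decays linearly over 50 epochs
opt.initialize_symbol(w)                        # allocate per-symbol optimizer state
opt.apply_gradient(w)                           # one parameter update from the batch-averaged gradient
opt.update_hyperparameters(1)                   # advance the schedules after epoch 1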

'''
The `smearn.optimization` module provides gradient-based optimizers for neural networks (stochastic gradient descent, SGD with momentum, AdaGrad, RMSProp, and Adam), together with a scheduler system for varying hyperparameters, such as the learning rate, over the course of training.
'''

from .symbolic import *

# Schedulers

def wrap_in_scheduler(x):
    return x if isinstance(x, Scheduler) else Scheduler(x)

class Scheduler:
    '''
    The base class for schedulers. The `update` method is called after each epoch during training and updates the attribute `value`, which is read as the hyperparameter's value (for example, the learning rate) during the next epoch.
    '''
    def __init__(self, value):
        self.value = value

    def update(self, step):
        pass

class LinearDecrease(Scheduler):
    '''
    Scheduler for a linear decrease.
    '''
    def __init__(self, start, end, steps):
        self.start = start
        self.end = end
        self.steps = steps
        self.value = self.start

    def update(self, step):
        if step > self.steps:
            self.value = self.end
        else:
            t = step / self.steps
            self.value = (1-t)*self.start + t*self.end


# Gradient-based optimization methods

class SGD:
    '''
    SGD optimizer
    '''
    def __init__(self, lr=0.001):
        self.lr = wrap_in_scheduler(lr)

    def initialize_symbol(self, symbol):
        pass

    def apply_gradient(self, symbol):
        symbol.value = symbol.value - self.lr.value * np.mean(symbol.gradient, axis=0)

    def update_hyperparameters(self, epoch):
        self.lr.update(epoch)

        
class SGDMomentum:
    '''
    SGD with momentum optimizer
    '''
    def __init__(self, lr=0.001, beta=0.5):
        self.lr = wrap_in_scheduler(lr)
        self.lr.value *= (1-beta)
        self.beta = wrap_in_scheduler(beta)
        self.momentums = {}

    def initialize_symbol(self, symbol):
        if self.momentums.get(symbol) is not None:
            return False
        self.momentums[symbol] = np.zeros(symbol.shape)
        return True

    def apply_gradient(self, symbol):
        self.momentums[symbol] = self.beta.value * self.momentums[symbol] - self.lr.value * np.mean(symbol.gradient, axis=0)
        symbol.value = symbol.value + self.momentums[symbol]

    def update_hyperparameters(self, epoch):
        self.lr.update(epoch)
        self.beta.update(epoch)

class AdaGrad:
    '''
    AdaGrad optimizer
    '''
    def __init__(self, lr=0.001, delta=1e-7):
        self.lr = wrap_in_scheduler(lr)
        self.delta = wrap_in_scheduler(delta)
        self.accumulated_square_gradients = {}

    def initialize_symbol(self, symbol):
        if self.accumulated_square_gradients.get(symbol) is not None:
            return False
        self.accumulated_square_gradients[symbol] = np.zeros(symbol.shape)
        return True

    def apply_gradient(self, symbol):
        gradient = np.mean(symbol.gradient, axis=0)
        self.accumulated_square_gradients[symbol] += np.multiply(gradient, gradient)
        symbol.value = symbol.value - self.lr.value / (self.delta.value + np.sqrt(self.accumulated_square_gradients[symbol])) * gradient

    def update_hyperparameters(self, epoch):
        self.lr.update(epoch)
        self.delta.update(epoch)

class RMSProp:
    '''
    RMSProp optimizer
    '''
    def __init__(self, lr=0.001, beta=0.5, delta=1e-6):
        self.lr = wrap_in_scheduler(lr)
        self.beta = wrap_in_scheduler(beta)
        self.delta = wrap_in_scheduler(delta)
        self.accumulations = {}

    def initialize_symbol(self, symbol):
        if self.accumulations.get(symbol) is not None:
            return False
        self.accumulations[symbol] = np.zeros(symbol.shape)
        return True

    def apply_gradient(self, symbol):
        gradient = np.mean(symbol.gradient, axis=0)
        self.accumulations[symbol] = self.beta.value * self.accumulations[symbol] + (1 - self.beta.value) * np.multiply(gradient, gradient)
        symbol.value = symbol.value - self.lr.value / (np.sqrt(self.delta.value + self.accumulations[symbol])) * gradient

    def update_hyperparameters(self, epoch):
        self.lr.update(epoch)
        self.beta.update(epoch)
        self.delta.update(epoch)

class Adam:
    '''
    Adam optimizer
    '''
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, delta=1e-8):
        self.lr = wrap_in_scheduler(lr)
        self.beta_1 = wrap_in_scheduler(beta_1)
        self.beta_2 = wrap_in_scheduler(beta_2)
        self.delta = wrap_in_scheduler(delta)
        self.first_moments = {}
        self.second_moments = {}
        self.beta_t = {}

    def initialize_symbol(self, symbol):
        if self.first_moments.get(symbol) is not None:
            return False
        self.first_moments[symbol] = np.zeros(symbol.shape)
        self.second_moments[symbol] = np.zeros(symbol.shape)
        self.beta_t[symbol] = [1,1]
        return True

    def apply_gradient(self, symbol):
        gradient = np.mean(symbol.gradient, axis=0)
        self.beta_t[symbol][0] *= self.beta_1.value
        self.beta_t[symbol][1] *= self.beta_2.value
        self.first_moments[symbol] = self.beta_1.value * self.first_moments[symbol] + (1 - self.beta_1.value) * gradient
        self.second_moments[symbol] = self.beta_2.value * self.second_moments[symbol] + (1 - self.beta_2.value) * np.multiply(gradient, gradient)
        unbiased_first_moment = self.first_moments[symbol] / (1 - self.beta_t[symbol][0])
        unbiased_second_moment = self.second_moments[symbol] / (1 - self.beta_t[symbol][1])
        symbol.value = symbol.value - self.lr.value * unbiased_first_moment / (self.delta.value + np.sqrt(unbiased_second_moment))

    def update_hyperparameters(self, epoch):
        self.lr.update(epoch)
        self.beta_1.update(epoch)
        self.beta_2.update(epoch)
        self.delta.update(epoch)

Functions

def wrap_in_scheduler(x)
def wrap_in_scheduler(x):
    return x if isinstance(x, Scheduler) else Scheduler(x)
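
A small illustration of the wrapping behaviour: plain numbers become constant schedulers, while objects that are already schedulers pass through unchanged.

from smearn.optimization import wrap_in_scheduler, Scheduler, LinearDecrease

lr = wrap_in_scheduler(0.01)
print(isinstance(lr, Scheduler), lr.value)   # True 0.01

sched = LinearDecrease(0.1, 0.01, 10)
print(wrap_in_scheduler(sched) is sched)     # True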

Classes

class AdaGrad (lr=0.001, delta=1e-07)

AdaGrad optimizer

class AdaGrad:
    '''
    AdaGrad optimizer
    '''
    def __init__(self, lr=0.001, delta=1e-7):
        self.lr = wrap_in_scheduler(lr)
        self.delta = wrap_in_scheduler(delta)
        self.accumulated_square_gradients = {}

    def initialize_symbol(self, symbol):
        if self.accumulated_square_gradients.get(symbol) is not None:
            return False
        self.accumulated_square_gradients[symbol] = np.zeros(symbol.shape)
        return True

    def apply_gradient(self, symbol):
        gradient = np.mean(symbol.gradient, axis=0)
        self.accumulated_square_gradients[symbol] += np.multiply(gradient, gradient)
        symbol.value = symbol.value - self.lr.value / (self.delta.value + np.sqrt(self.accumulated_square_gradients[symbol])) * gradient

    def update_hyperparameters(self, epoch):
        self.lr.update(epoch)
        self.delta.update(epoch)

Methods

def apply_gradient(self, symbol)
def apply_gradient(self, symbol):
    gradient = np.mean(symbol.gradient, axis=0)
    self.accumulated_square_gradients[symbol] += np.multiply(gradient, gradient)
    symbol.value = symbol.value - self.lr.value / (self.delta.value + np.sqrt(self.accumulated_square_gradients[symbol])) * gradient
def initialize_symbol(self, symbol)
def initialize_symbol(self, symbol):
    if self.accumulated_square_gradients.get(symbol) is not None:
        return False
    self.accumulated_square_gradients[symbol] = np.zeros(symbol.shape)
    return True
def update_hyperparameters(self, epoch)
def update_hyperparameters(self, epoch):
    self.lr.update(epoch)
    self.delta.update(epoch)
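
A hand-worked single step with illustrative numbers, mirroring apply_gradient above: dividing by the square root of the accumulated squared gradients gives each parameter its own effective learning rate, so large and small gradient components end up taking comparable steps.

import numpy as np

lr, delta = 0.1, 1e-7
g = np.array([3.0, 0.1])                 # batch-averaged gradient for this step
acc = np.zeros(2) + g * g                # accumulated squared gradients after the first step
step = lr / (delta + np.sqrt(acc)) * g   # per-parameter scaled step
print(step)                              # ~[0.1 0.1]
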
class Adam (lr=0.001, beta_1=0.9, beta_2=0.999, delta=1e-08)

Adam optimizer

class Adam:
    '''
    Adam optimizer
    '''
    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, delta=1e-8):
        self.lr = wrap_in_scheduler(lr)
        self.beta_1 = wrap_in_scheduler(beta_1)
        self.beta_2 = wrap_in_scheduler(beta_2)
        self.delta = wrap_in_scheduler(delta)
        self.first_moments = {}
        self.second_moments = {}
        self.beta_t = {}

    def initialize_symbol(self, symbol):
        if self.first_moments.get(symbol) is not None:
            return False
        self.first_moments[symbol] = np.zeros(symbol.shape)
        self.second_moments[symbol] = np.zeros(symbol.shape)
        self.beta_t[symbol] = [1,1]
        return True

    def apply_gradient(self, symbol):
        gradient = np.mean(symbol.gradient, axis=0)
        self.beta_t[symbol][0] *= self.beta_1.value
        self.beta_t[symbol][1] *= self.beta_2.value
        self.first_moments[symbol] = self.beta_1.value * self.first_moments[symbol] + (1 - self.beta_1.value) * gradient
        self.second_moments[symbol] = self.beta_2.value * self.second_moments[symbol] + (1 - self.beta_2.value) * np.multiply(gradient, gradient)
        unbiased_first_moment = self.first_moments[symbol] / (1 - self.beta_t[symbol][0])
        unbiased_second_moment = self.second_moments[symbol] / (1 - self.beta_t[symbol][1])
        symbol.value = symbol.value - self.lr.value * unbiased_first_moment / (self.delta.value + np.sqrt(unbiased_second_moment))

    def update_hyperparameters(self, epoch):
        self.lr.update(epoch)
        self.beta_1.update(epoch)
        self.beta_2.update(epoch)
        self.delta.update(epoch)

Methods

def apply_gradient(self, symbol)
def apply_gradient(self, symbol):
    gradient = np.mean(symbol.gradient, axis=0)
    self.beta_t[symbol][0] *= self.beta_1.value
    self.beta_t[symbol][1] *= self.beta_2.value
    self.first_moments[symbol] = self.beta_1.value * self.first_moments[symbol] + (1 - self.beta_1.value) * gradient
    self.second_moments[symbol] = self.beta_2.value * self.second_moments[symbol] + (1 - self.beta_2.value) * np.multiply(gradient, gradient)
    unbiased_first_moment = self.first_moments[symbol] / (1 - self.beta_t[symbol][0])
    unbiased_second_moment = self.second_moments[symbol] / (1 - self.beta_t[symbol][1])
    symbol.value = symbol.value - self.lr.value * unbiased_first_moment / (self.delta.value + np.sqrt(unbiased_second_moment))
def initialize_symbol(self, symbol)
def initialize_symbol(self, symbol):
    if self.first_moments.get(symbol) is not None:
        return False
    self.first_moments[symbol] = np.zeros(symbol.shape)
    self.second_moments[symbol] = np.zeros(symbol.shape)
    self.beta_t[symbol] = [1,1]
    return True
def update_hyperparameters(self, epoch)
def update_hyperparameters(self, epoch):
    self.lr.update(epoch)
    self.beta_1.update(epoch)
    self.beta_2.update(epoch)
    self.delta.update(epoch)
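
A hand-worked first step with illustrative numbers, mirroring apply_gradient above: the bias correction cancels the (1 - beta) factors, so the first step is roughly lr times the sign of the gradient, whatever the gradient's scale.

import numpy as np

lr, b1, b2, delta = 0.001, 0.9, 0.999, 1e-8
g = np.array([10.0, -0.01])                  # batch-averaged gradient, very different scales
m = b1 * 0 + (1 - b1) * g                    # first moment after one step
v = b2 * 0 + (1 - b2) * g * g                # second moment after one step
m_hat = m / (1 - b1)                         # bias-corrected moments
v_hat = v / (1 - b2)
step = lr * m_hat / (delta + np.sqrt(v_hat))
print(step)                                  # ~[ 0.001 -0.001]
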
class LinearDecrease (start, end, steps)

Scheduler for a linear decrease.

class LinearDecrease(Scheduler):
    '''
    Scheduler for a linear decrease.
    '''
    def __init__(self, start, end, steps):
        self.start = start
        self.end = end
        self.steps = steps
        self.value = self.start

    def update(self, step):
        if step > self.steps:
            self.value = self.end
        else:
            t = step / self.steps
            self.value = (1-t)*self.start + t*self.end

Ancestors

Scheduler

Methods

def update(self, step)
def update(self, step):
    if step > self.steps:
        self.value = self.end
    else:
        t = step / self.steps
        self.value = (1-t)*self.start + t*self.end
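
For example, with start=0.1, end=0.01 and steps=10 the value interpolates linearly and is then held at end:

from smearn.optimization import LinearDecrease

sched = LinearDecrease(0.1, 0.01, 10)
sched.update(5)
print(sched.value)   # 0.055 (halfway between start and end)
sched.update(20)
print(sched.value)   # 0.01 (held at `end` once step > steps)
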
class RMSProp (lr=0.001, beta=0.5, delta=1e-06)

RMSProp optimizer

class RMSProp:
    '''
    RMSProp optimizer
    '''
    def __init__(self, lr=0.001, beta=0.5, delta=1e-6):
        self.lr = wrap_in_scheduler(lr)
        self.beta = wrap_in_scheduler(beta)
        self.delta = wrap_in_scheduler(delta)
        self.accumulations = {}

    def initialize_symbol(self, symbol):
        if self.accumulations.get(symbol) is not None:
            return False
        self.accumulations[symbol] = np.zeros(symbol.shape)
        return True

    def apply_gradient(self, symbol):
        gradient = np.mean(symbol.gradient, axis=0)
        self.accumulations[symbol] = self.beta.value * self.accumulations[symbol] + (1 - self.beta.value) * np.multiply(gradient, gradient)
        symbol.value = symbol.value - self.lr.value / (np.sqrt(self.delta.value + self.accumulations[symbol])) * gradient

    def update_hyperparameters(self, epoch):
        self.lr.update(epoch)
        self.beta.update(epoch)
        self.delta.update(epoch)

Methods

def apply_gradient(self, symbol)
def apply_gradient(self, symbol):
    gradient = np.mean(symbol.gradient, axis=0)
    self.accumulations[symbol] = self.beta.value * self.accumulations[symbol] + (1 - self.beta.value) * np.multiply(gradient, gradient)
    symbol.value = symbol.value - self.lr.value / (np.sqrt(self.delta.value + self.accumulations[symbol])) * gradient
def initialize_symbol(self, symbol)
def initialize_symbol(self, symbol):
    if self.accumulations.get(symbol) is not None:
        return False
    self.accumulations[symbol] = np.zeros(symbol.shape)
    return True
def update_hyperparameters(self, epoch)
def update_hyperparameters(self, epoch):
    self.lr.update(epoch)
    self.beta.update(epoch)
    self.delta.update(epoch)
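
Unlike AdaGrad's ever-growing sum, the accumulator here is an exponential moving average, so it decays once gradients shrink and the effective learning rate can recover. A scalar sketch of the accumulator alone, with illustrative numbers:

beta = 0.5
acc = 0.0
for g in [4.0, 4.0, 0.0, 0.0, 0.0]:
    acc = beta * acc + (1 - beta) * g * g
    print(acc)   # 8.0, 12.0, 6.0, 3.0, 1.5 -- older gradients are forgotten geometrically
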
class SGD (lr=0.001)

SGD optimizer

class SGD:
    '''
    SGD optimizer
    '''
    def __init__(self, lr=0.001):
        self.lr = wrap_in_scheduler(lr)

    def initialize_symbol(self, symbol):
        pass

    def apply_gradient(self, symbol):
        symbol.value = symbol.value - self.lr.value * np.mean(symbol.gradient, axis=0)

    def update_hyperparameters(self, epoch):
        self.lr.update(epoch)

Methods

def apply_gradient(self, symbol)
def apply_gradient(self, symbol):
    symbol.value = symbol.value - self.lr.value * np.mean(symbol.gradient, axis=0)
def initialize_symbol(self, symbol)
def initialize_symbol(self, symbol):
    pass
def update_hyperparameters(self, epoch)
def update_hyperparameters(self, epoch):
    self.lr.update(epoch)
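
The update rule written out with illustrative numbers: the per-example gradients are averaged over the batch axis and the result is scaled by the learning rate.

import numpy as np

lr = 0.1
value = np.array([1.0, -2.0])
gradient = np.array([[0.2, 0.4],
                     [0.6, 0.0]])                  # one gradient row per example in the batch
value = value - lr * np.mean(gradient, axis=0)     # same rule as apply_gradient above
print(value)                                       # [ 0.96 -2.02]
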
class SGDMomentum (lr=0.001, beta=0.5)

SGD with momentum optimizer

class SGDMomentum:
    '''
    SGD with momentum optimizer
    '''
    def __init__(self, lr=0.001, beta=0.5):
        self.lr = wrap_in_scheduler(lr)
        self.lr.value *= (1-beta)
        self.beta = wrap_in_scheduler(beta)
        self.momentums = {}

    def initialize_symbol(self, symbol):
        if self.momentums.get(symbol) is not None:
            return False
        self.momentums[symbol] = np.zeros(symbol.shape)
        return True

    def apply_gradient(self, symbol):
        self.momentums[symbol] = self.beta.value * self.momentums[symbol] - self.lr.value * np.mean(symbol.gradient, axis=0)
        symbol.value = symbol.value + self.momentums[symbol]

    def update_hyperparameters(self, epoch):
        self.lr.update(epoch)
        self.beta.update(epoch)

Methods

def apply_gradient(self, symbol)
def apply_gradient(self, symbol):
    self.momentums[symbol] = self.beta.value * self.momentums[symbol] - self.lr.value * np.mean(symbol.gradient, axis=0)
    symbol.value = symbol.value + self.momentums[symbol]
def initialize_symbol(self, symbol)
def initialize_symbol(self, symbol):
    if self.momentums.get(symbol) is not None:
        return False
    self.momentums[symbol] = np.zeros(symbol.shape)
    return True
def update_hyperparameters(self, epoch)
def update_hyperparameters(self, epoch):
    self.lr.update(epoch)
    self.beta.update(epoch)
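
A scalar sketch of the velocity accumulation, with illustrative numbers. Because the constructor pre-scales the learning rate by (1 - beta), a persistent gradient drives the velocity towards -lr * g rather than -lr * g / (1 - beta):

lr, beta = 0.1, 0.5
lr_scaled = lr * (1 - beta)    # what the constructor does to the learning rate
m = 0.0
for g in [1.0, 1.0, 1.0, 1.0]:
    m = beta * m - lr_scaled * g
    print(m)                   # -0.05, -0.075, -0.0875, -0.09375 -> approaches -lr * g = -0.1
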
class Scheduler (value)

The base class for schedulers. The update method is called after each epoch during training and updates the attribute value, which is read as the hyperparameter's value (for example, the learning rate) during the next epoch.

class Scheduler:
    '''
    The base class for schedulers. The `update` method is called after each epoch during training and updates the attribute `value`, which is read as the hyperparameter's value (for example, the learning rate) during the next epoch.
    '''
    def __init__(self, value):
        self.value = value

    def update(self, step):
        pass

Subclasses

LinearDecrease

Methods

def update(self, step)
def update(self, step):
    pass
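
Custom schedules can be written by subclassing Scheduler and overriding update. A minimal sketch; the ExponentialDecay class is hypothetical and not part of the module:

from smearn.optimization import Scheduler, SGD

class ExponentialDecay(Scheduler):
    '''
    Hypothetical scheduler: value = start * rate ** step.
    '''
    def __init__(self, start, rate):
        self.start = start
        self.rate = rate
        self.value = start

    def update(self, step):
        self.value = self.start * self.rate ** step

opt = SGD(lr=ExponentialDecay(0.1, 0.9))
opt.update_hyperparameters(3)
print(opt.lr.value)   # 0.1 * 0.9**3 ≈ 0.0729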