Module smearn.optimization
The smearn.optimization module provides gradient-based optimization methods for neural networks (stochastic gradient descent, stochastic gradient descent with momentum, AdaGrad, RMSProp and Adam), as well as a system for scheduling hyperparameters during training, such as the learning rate.
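A minimal usage sketch of the optimizer protocol (initialize_symbol, apply_gradient, update_hyperparameters). The trainable symbols normally come from smearn.symbolic; FakeSymbol below is a hypothetical stand-in used only for illustration, exposing the value, shape and gradient attributes the optimizers rely on (the gradient carries a leading batch axis).

import numpy as np
from smearn.optimization import SGDMomentum, LinearDecrease

class FakeSymbol:
    # Hypothetical stand-in for a trainable symbol from smearn.symbolic.
    def __init__(self, shape):
        self.shape = shape
        self.value = np.zeros(shape)
        self.gradient = np.zeros((1,) + shape)

weights = FakeSymbol((3, 2))
optimizer = SGDMomentum(lr=LinearDecrease(0.01, 0.001, 50), beta=0.9)
optimizer.initialize_symbol(weights)

for epoch in range(100):
    # A real training loop would fill weights.gradient via backpropagation;
    # here a random batch of 32 per-example gradients stands in for it.
    weights.gradient = np.random.randn(32, 3, 2)
    optimizer.apply_gradient(weights)
    optimizer.update_hyperparameters(epoch)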
Source code
'''
The `smearn.optimization` module provides gradient-based optimization methods for neural networks (stochastic gradient descent, stochastic gradient descent with momentum, AdaGrad, RMSProp and Adam), as well as a system for scheduling hyperparameters during training, such as the learning rate.
'''
from .symbolic import *
import numpy as np
# Schedulers
def wrap_in_scheduler(x):
return x if isinstance(x, Scheduler) else Scheduler(x)
class Scheduler:
'''
The base class for schedulers. The `update` method is called after each epoch during training and updates the attribute `value`, which is used as the learning rate during the next epoch.
'''
def __init__(self, value):
self.value = value
def update(self, step):
pass
class LinearDecrease(Scheduler):
'''
Scheduler for a linear decrease.
'''
def __init__(self, start, end, steps):
self.start = start
self.end = end
self.steps = steps
self.value = self.start
def update(self, step):
if step > self.steps:
self.value = self.end
else:
t = step / self.steps
self.value = (1-t)*self.start + t*self.end
# Gradient-based optimization methods
class SGD:
'''
SGD optimizer
'''
def __init__(self, lr=0.001):
self.lr = wrap_in_scheduler(lr)
def initialize_symbol(self, symbol):
pass
def apply_gradient(self, symbol):
symbol.value = symbol.value - self.lr.value * np.mean(symbol.gradient, axis=0)
def update_hyperparameters(self, epoch):
self.lr.update(epoch)
class SGDMomentum:
'''
SGD with momentum optimizer
'''
def __init__(self, lr=0.001, beta=0.5):
self.lr = wrap_in_scheduler(lr)
self.lr.value *= (1-beta)
self.beta = wrap_in_scheduler(beta)
self.momentums = {}
def initialize_symbol(self, symbol):
if self.momentums.get(symbol) is not None:
return False
self.momentums[symbol] = np.zeros(symbol.shape)
return True
def apply_gradient(self, symbol):
self.momentums[symbol] = self.beta.value * self.momentums[symbol] - self.lr.value * np.mean(symbol.gradient, axis=0)
symbol.value = symbol.value + self.momentums[symbol]
def update_hyperparameters(self, epoch):
self.lr.update(epoch)
self.beta.update(epoch)
class AdaGrad:
'''
AdaGrad optimizer
'''
def __init__(self, lr=0.001, delta=1e-7):
self.lr = wrap_in_scheduler(lr)
self.delta = wrap_in_scheduler(delta)
self.accumulated_square_gradients = {}
def initialize_symbol(self, symbol):
if self.accumulated_square_gradients.get(symbol) is not None:
return False
self.accumulated_square_gradients[symbol] = np.zeros(symbol.shape)
return True
def apply_gradient(self, symbol):
gradient = np.mean(symbol.gradient, axis=0)
self.accumulated_square_gradients[symbol] += np.multiply(gradient, gradient)
symbol.value = symbol.value - self.lr.value / (self.delta.value + np.sqrt(self.accumulated_square_gradients[symbol])) * gradient
def update_hyperparameters(self, epoch):
self.lr.update(epoch)
self.delta.update(epoch)
class RMSProp:
'''
RMSProp optimizer
'''
def __init__(self, lr=0.001, beta=0.5, delta=1e-6):
self.lr = wrap_in_scheduler(lr)
self.beta = wrap_in_scheduler(beta)
self.delta = wrap_in_scheduler(delta)
self.accumulations = {}
def initialize_symbol(self, symbol):
if self.accumulations.get(symbol) is not None:
return False
self.accumulations[symbol] = np.zeros(symbol.shape)
return True
def apply_gradient(self, symbol):
gradient = np.mean(symbol.gradient, axis=0)
self.accumulations[symbol] = self.beta.value * self.accumulations[symbol] + (1 - self.beta.value) * np.multiply(gradient, gradient)
symbol.value = symbol.value - self.lr.value / (np.sqrt(self.delta.value + self.accumulations[symbol])) * gradient
def update_hyperparameters(self, epoch):
self.lr.update(epoch)
self.beta.update(epoch)
self.delta.update(epoch)
class Adam:
'''
Adam optimizer
'''
def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, delta=1e-8):
self.lr = wrap_in_scheduler(lr)
self.beta_1 = wrap_in_scheduler(beta_1)
self.beta_2 = wrap_in_scheduler(beta_2)
self.delta = wrap_in_scheduler(delta)
self.first_moments = {}
self.second_moments = {}
self.beta_t = {}
def initialize_symbol(self, symbol):
if self.first_moments.get(symbol) is not None:
return False
self.first_moments[symbol] = np.zeros(symbol.shape)
self.second_moments[symbol] = np.zeros(symbol.shape)
self.beta_t[symbol] = [1,1]
return True
def apply_gradient(self, symbol):
gradient = np.mean(symbol.gradient, axis=0)
self.beta_t[symbol][0] *= self.beta_1.value
self.beta_t[symbol][1] *= self.beta_2.value
self.first_moments[symbol] = self.beta_1.value * self.first_moments[symbol] + (1 - self.beta_1.value) * gradient
self.second_moments[symbol] = self.beta_2.value * self.second_moments[symbol] + (1 - self.beta_2.value) * np.multiply(gradient, gradient)
unbiased_first_moment = self.first_moments[symbol] / (1 - self.beta_t[symbol][0])
unbiased_second_moment = self.second_moments[symbol] / (1 - self.beta_t[symbol][1])
symbol.value = symbol.value - self.lr.value * unbiased_first_moment / (self.delta.value + np.sqrt(unbiased_second_moment))
def update_hyperparameters(self, epoch):
self.lr.update(epoch)
self.beta_1.update(epoch)
self.beta_2.update(epoch)
self.delta.update(epoch)
Functions
def wrap_in_scheduler(x)
Wraps x in a constant Scheduler, unless it already is a Scheduler, in which case it is returned unchanged.
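For illustration, a scalar is wrapped in a constant Scheduler while an existing scheduler passes through unchanged:

from smearn.optimization import LinearDecrease, Scheduler, wrap_in_scheduler

constant = wrap_in_scheduler(0.01)
print(isinstance(constant, Scheduler), constant.value)   # True 0.01
sched = LinearDecrease(0.1, 0.01, 10)
print(wrap_in_scheduler(sched) is sched)                 # True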
Classes
class AdaGrad (lr=0.001, delta=1e-07)
AdaGrad optimizer
Methods
def apply_gradient(self, symbol)
Applies one AdaGrad step to symbol.value: adds the squared, batch-averaged gradient to the accumulator and scales the update by lr / (delta + sqrt(accumulated)).
def initialize_symbol(self, symbol)
Allocates the accumulator of squared gradients for symbol. Returns True if new state was created, False if the symbol was already registered.
def update_hyperparameters(self, epoch)
Advances the lr and delta schedulers to the given epoch.
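As a plain-NumPy sketch of the arithmetic inside apply_gradient, after the gradient has been averaged over its batch axis:

import numpy as np

lr, delta = 0.001, 1e-7
theta = np.zeros(3)                      # parameter values
g = np.array([0.1, -0.2, 0.3])           # batch-averaged gradient
r = np.zeros(3)                          # accumulated squared gradients

r += g * g                               # the running sum only grows over steps
theta -= lr / (delta + np.sqrt(r)) * g   # so the per-coordinate step keeps shrinking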
class Adam (lr=0.001, beta_1=0.9, beta_2=0.999, delta=1e-08)
Adam optimizer
Methods
def apply_gradient(self, symbol)
Applies one Adam step to symbol.value: updates the exponential moving averages of the batch-averaged gradient and of its square, applies bias correction, and scales the step by lr / (delta + sqrt(corrected second moment)).
def initialize_symbol(self, symbol)
Allocates the first- and second-moment accumulators and the running powers of beta_1 and beta_2 for symbol. Returns True if new state was created, False if the symbol was already registered.
def update_hyperparameters(self, epoch)
Advances the lr, beta_1, beta_2 and delta schedulers to the given epoch.
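A sketch of a single Adam step as performed by apply_gradient, with the bias correction that the class tracks incrementally through beta_t:

import numpy as np

lr, beta_1, beta_2, delta = 0.001, 0.9, 0.999, 1e-8
theta = np.zeros(3)
g = np.array([0.1, -0.2, 0.3])           # batch-averaged gradient
m, v, t = np.zeros(3), np.zeros(3), 1    # moment estimates and step counter

m = beta_1 * m + (1 - beta_1) * g        # first moment: moving average of gradients
v = beta_2 * v + (1 - beta_2) * g * g    # second moment: moving average of squares
m_hat = m / (1 - beta_1 ** t)            # correct the bias from zero initialization
v_hat = v / (1 - beta_2 ** t)
theta -= lr * m_hat / (delta + np.sqrt(v_hat))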
class LinearDecrease (start, end, steps)
Scheduler that decreases the value linearly from start to end over steps epochs, then holds it at end.
Ancestors
Scheduler
Methods
def update(self, step)
Sets value to the linear interpolation between start and end at step / steps, or to end once step exceeds steps.
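For example:

from smearn.optimization import LinearDecrease

sched = LinearDecrease(0.1, 0.01, 10)
sched.update(0);  print(sched.value)   # 0.1
sched.update(5);  print(sched.value)   # 0.055, halfway between start and end
sched.update(12); print(sched.value)   # 0.01, clamped to end once step > steps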
class RMSProp (lr=0.001, beta=0.5, delta=1e-06)
RMSProp optimizer
Methods
def apply_gradient(self, symbol)
Applies one RMSProp step to symbol.value: updates an exponential moving average of the squared, batch-averaged gradient and scales the update by lr / sqrt(delta + average).
def initialize_symbol(self, symbol)
Allocates the squared-gradient accumulator for symbol. Returns True if new state was created, False if the symbol was already registered.
def update_hyperparameters(self, epoch)
Advances the lr, beta and delta schedulers to the given epoch.
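A sketch of the arithmetic inside apply_gradient; unlike AdaGrad's ever-growing sum, the squared gradients enter an exponential moving average:

import numpy as np

lr, beta, delta = 0.001, 0.5, 1e-6
theta = np.zeros(3)
g = np.array([0.1, -0.2, 0.3])           # batch-averaged gradient
r = np.zeros(3)                          # moving average of squared gradients

r = beta * r + (1 - beta) * g * g
theta -= lr / np.sqrt(delta + r) * g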
class SGD (lr=0.001)
SGD optimizer
Methods
def apply_gradient(self, symbol)
Subtracts lr times the batch-averaged gradient from symbol.value.
def initialize_symbol(self, symbol)
Does nothing; plain SGD keeps no per-symbol state.
def update_hyperparameters(self, epoch)
Advances the lr scheduler to the given epoch.
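The one convention worth noting is that symbol.gradient is expected to carry a leading batch axis, which apply_gradient averages away before stepping. As a sketch:

import numpy as np

lr = 0.001
theta = np.zeros((3, 2))                       # parameter values
batch_grads = np.random.randn(32, 3, 2)        # one gradient per example in the batch
theta -= lr * np.mean(batch_grads, axis=0)     # average over the batch, then step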
class SGDMomentum (lr=0.001, beta=0.5)
SGD with momentum optimizer
Methods
def apply_gradient(self, symbol)
Updates the velocity for symbol (beta times the previous velocity minus lr times the batch-averaged gradient) and adds it to symbol.value.
def initialize_symbol(self, symbol)
Allocates the velocity buffer for symbol. Returns True if new state was created, False if the symbol was already registered.
def update_hyperparameters(self, epoch)
Advances the lr and beta schedulers to the given epoch.
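A sketch of a single momentum step as performed by apply_gradient; note that __init__ pre-scales the learning rate by (1 - beta), so the step below uses that effective rate:

import numpy as np

beta = 0.5
lr = 0.001 * (1 - beta)                  # effective rate after the (1 - beta) scaling
theta = np.zeros(3)
g = np.array([0.1, -0.2, 0.3])           # batch-averaged gradient
velocity = np.zeros(3)

velocity = beta * velocity - lr * g      # decay the old velocity, add the new step
theta += velocity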
class Scheduler (value)
The base class for schedulers. The update method is called after each epoch during training and updates the attribute value, which is used as the learning rate during the next epoch.
Subclasses
LinearDecrease
Methods
def update(self, step)
Does nothing in the base class; subclasses override it to recompute value for the given step.
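Any schedule can be expressed by subclassing Scheduler and setting value in update; for illustration (ExponentialDecay is not part of the module):

from smearn.optimization import SGD, Scheduler

class ExponentialDecay(Scheduler):
    # Illustrative schedule: value = start * rate ** step.
    def __init__(self, start, rate):
        self.value = start
        self.start = start
        self.rate = rate
    def update(self, step):
        self.value = self.start * self.rate ** step

optimizer = SGD(lr=ExponentialDecay(0.01, 0.95))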