#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# networkqit -- a python module for manipulations of spectral entropies framework
#
# Copyright (C) 2017-2018 Carlo Nicolini <carlo.nicolini@iit.it>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Define the base and inherited classes for model optimization, both in the continuous approximation
and for the stochastic optimization.
The `ModelOptimizer` class defines the base class where all the other optimization classes must inherit.
The most important class to optimize an expected adjacency model is `ContinuousOptimizer`.
In this class the gradients are defined as:
.. math::
\\frac{\\partial S(\\boldsymbol \\rho \\| \\boldsymbol \\sigma(\\mathbb{E}_{\\boldsymbol \\theta}[\\mathbf{L}]))}{\\partial \\boldsymbol \\theta_k} = \\beta \\textrm{Tr}\\biggl \\lbrack \\left(\\boldsymbol \\rho - \\boldsymbol \\sigma(\\mathbb{E}_{\\boldsymbol \\theta}[L])\\right)\\frac{\\partial \\mathbb{E}_{\\boldsymbol \\theta}[\mathbf{L}]}{\\partial \\theta_k} \\biggr \\rbrack
In the `StochasticOptimizer` class we instead address the issue to implement stochastic gradient descent methods.
In these methods the gradients are defined as:
.. math::
\\frac{\\partial \\mathbb{E}_{\\boldsymbol \\theta}[S(\\boldsymbol \\rho \\| \\boldsymbol \\sigma)]}{\\partial \\theta_k} = \\beta \\textrm{Tr}\\biggl \\lbrack \\boldsymbol \\rho \\frac{\\partial \\mathbb{E}_{\\boldsymbol \\theta}[L]}{\\partial \\boldsymbol \\theta_k}\\biggr \\rbrack + \\frac{\\partial}{\\partial \\theta_k}\\mathbb{E}_{\\boldsymbol \\theta}\\biggl \\lbrack \\log \\left( \\textrm{Tr}\\left \\lbrack e^{-\\beta L(\\boldsymbol \\theta)} \\right \\rbrack \\right) \\biggr \\rbrack
The stochastic optimizer is the **correct** optimizer, as it makes no approximation on the Laplacian eigenvalues.
It is more suitable for small graphs and intermediate $\\beta$, where the differences between the random matrix s
pectrum and its expected counterpart are non-neglibile.
For large and dense enough graphs however the `ContinuousOptimizer` works well and yields deterministic results,
as the optimization landscape is smooth.
In order to minimize the expected relative entropy then, we need both the expected Laplacian formula, which is simple
to get, and a way to estimate the second summand in the gradient, that involves averaging over different realizations
of the log trace of $e^{-\\beta L(\\boldsymbol \\theta)}$.
A good the approximation to the expected logtrace $\\mathbb{E}_{\\boldsymbol \\theta}[\\log \\textrm{Tr}[\\exp{(-\\beta L)}]]$ is,
makes a better is the estimate of the gradients.
Finally, the `MLEOptimizer` maximizes the standard likelihood of a model and it is not related to the spectral entropies
framework introduced in the paper on which **networkqit** is based.
"""
# Copyright (C) 2018 by
# Carlo Nicolini <carlo.nicolini@iit.it>
# All rights reserved.
# BSD license.
import autograd
import autograd.numpy as np
from autograd.numpy.linalg import eigh
from autograd.scipy.misc import logsumexp
from networkqit.graphtheory.matrices import softmax
from scipy.optimize import minimize, least_squares, OptimizeResult
from networkqit.graphtheory import * # imports GraphModel
from networkqit.infotheory.density import *
__all__ = ['MLEOptimizer',
'ContinuousOptimizer',
'StochasticOptimizer',
'Adam']
[docs]class MLEOptimizer:
"""
This class, inheriting from the model optimizer class solves the problem of
maximum likelihood parameters estimation in the classical settings.
"""
[docs] def __init__(self, G: np.array, x0: np.array, model: GraphModel, **kwargs):
"""
Initialize the optimizer with the observed graph, an initial guess and the
model to optimize.
Args:
G (numpy.array) :is the empirical network to study. A N x N adjacency matrix as numpy.array.
x0 (numpy.array): is the k-element array of initial parameters estimates. Typically set as random.
model (nq.GraphModel): the graph model to optimize the likelihood of.
"""
self.G = G
self.x0 = x0
self.model = model
self.sol = None
super().__init__(**kwargs)
def run(self, method, ftol=1E-10, gtol=1E-5, xtol=1E-8, maxiter=1E4, **kwargs):
"""
Maximimize the likelihood of the model given the observed network G.
Args:
method (str): optimization method to use. Can be either 'MLE' or 'saddle_point'.
'MLE' uses L-BFGS-B method to optimize the likelihood of the model
'saddle_point' finds the saddle point solution solving a system of
nonlinear equation. This method finds the parameters such that the
gradients are zero. Here we adopt two different methods.
Models leading to convex likelihood are solved by a direct call to
the least-square optimizer, where we minimize the square error cost:
$sum_i (c_i^* - <c_i>)^2$
where c_i is the empirical value of the specific constraint from MaxEnt
and <c_i> is its ensemble average as defined from the model.
Empirically we found that the 'dogbox' method works best in most cases
with a nice tradeoff between speed and precision.
xtol: (float) set the termination tolerance on parameters
(default 1E-10)
gtol: (float) set the termination tolerance on gradients
(default 1E-10)
maxiter: (int) set the maximum number of iteration. Default 10.000 iterations
Kwargs:
maxfun: (int) set the maximum number of evaluations of the cost.
Default value is very large ten thousand (1E4)
verbose: (int) Verbosity level. No output=0, 2=iterations output.
basin_hopping: (bool) whether to run global optimization with local
optimization by least_squares (only for saddle point method)
basin_hopping_niter: (int) number of basin hopping repetitions
Outputs:
sol: (scipy.optimize.OptimizeResult) parameters at optimal likelihood
"""
opts = {'ftol': ftol,
'gtol': gtol, # as default of LBFGSB
'xtol': xtol,
'maxiter': maxiter,
'eps': kwargs.get('eps', 1E-8), # as default of LBFGSB
'maxfun': kwargs.get('maxfun', 1E10),
'verbose' : kwargs.get('verbose', 0),
'disp': bool(kwargs.get('verbose', 0)),
'iprint': kwargs.get('verbose', 0),
'maxls' : 100
}
if method is 'MLE':
# If the model has non-linear constraints, must use Sequential Linear Square Programming SLSQP
from autograd import jacobian # using automatic jacobian from autograd
J = jacobian(lambda z : -self.model.loglikelihood(self.G,z))
if hasattr(self.model, 'constraints'):
#H=hessian(lambda z : -self.model.loglikelihood(self.G,z))
# remove some options to avoid warnings
opts.pop('gtol'), opts.pop('verbose'), opts.pop('xtol'),opts.pop('maxfun')
self.sol = minimize(fun=lambda z: -self.model.loglikelihood(self.G, z),
x0=np.squeeze(self.x0),
method='SLSQP',
constraints={'fun':self.model.constraints, 'type':'ineq'},
bounds=self.model.bounds,
jac=J,
tol=opts['ftol'],
options=opts)
else: # the model has only bound-constraints, hence use L-BFGS-B
# Optimize using L-BFGS-B which typically returns good results
opts.pop('xtol'), opts.pop('verbose')
self.sol = minimize(fun=lambda z: -self.model.loglikelihood(self.G, z),
x0=np.squeeze(self.x0),
method='L-BFGS-B',
jac=J,
bounds=self.model.bounds,
options=opts)
if kwargs.get('verbose',0)>0:
print(self.sol['message'])
if self.sol['status'] != 0:
RuntimeWarning(self.sol['message'])
#raise Exception('Method did not converge to maximum likelihood: ')
elif method is 'saddle_point':
if hasattr(self.model, 'constraints'):
raise RuntimeError('Cannot solve saddle_point_equations with non-linear constraints')
# Use the Dogbox method to optimize likelihood
#ls_bounds = [np.min(np.ravel(self.model.bounds)), np.max(np.ravel(self.model.bounds))]
ls_bounds = np.ravel(self.model.bounds).astype(float)
ls_bounds[np.isnan(ls_bounds)] = np.inf
ls_bounds = np.reshape(ls_bounds,[len(self.x0),2]).T.tolist() # it is required in a different format
def basin_opt(*basin_opt_args, **basin_opt_kwargs):
opt_result = least_squares(fun=basin_opt_args[0],
x0=np.squeeze(basin_opt_args[1]),
bounds=ls_bounds,
method='trf',
xtol=opts['xtol'],
gtol=opts['gtol'],
tr_solver='lsmr',
max_nfev=opts['maxiter'],
verbose=opts['verbose'])
# use this form with linear loss as in the basinhopping
# func argument to be consistent
opt_result['fun'] = 0.5 * np.sum(opt_result['fun'] ** 2)
return opt_result
# If the model is convex, we simply run least_squares
if not kwargs.get('basinhopping', False):
self.sol = basin_opt(lambda z: self.model.saddle_point(self.G, z), self.x0)
else:
# otherwise combine local and global optimization with basinhopping
nlineq = lambda z: self.model.saddle_point(self.G, z)
from .basinhoppingmod import basinhopping, BHBounds, BHRandStepBounded
bounds = BHBounds(xmin=ls_bounds[0], xmax=ls_bounds[1])
bounded_step = BHRandStepBounded(ls_bounds[0], ls_bounds[1], stepsize=0.5)
self.sol = basinhopping(func=lambda z: (nlineq(z)**2).sum(),
x0=np.squeeze(self.x0),
T=kwargs.get('T', 1),
minimize_routine=basin_opt,
minimizer_kwargs={'saddle_point_equations': nlineq },
accept_test=bounds,
take_step=bounded_step,
niter=kwargs.get('basin_hopping_niter', 5),
disp=bool(kwargs.get('verbose')))
else:
raise RuntimeError('Only MLE and saddle_point methods are supported')
# Compute the corrected Akaike information and Bayes information criteria
# http://downloads.hindawi.com/journals/complexity/2019/5120581.pdf
K = len(self.sol['x'])
N = len(self.G)
n = N*(N-1)/2 # n is the sample size
# Both AIC and BIC are minimum for the best explanatory model
L = self.model.loglikelihood(self.G,self.sol['x'])
self.sol['AIC'] = -2*L + 2*K + (2*K*(K+1)) / (n-K+1)
self.sol['BIC'] = -2*L + K*np.log(n)
return self.sol
[docs]class ContinuousOptimizer:
"""
Continuos optimization method of spectral entropy in the continuous approximation S(rho, sigma(E[L])))
"""
[docs] def __init__(self, A, x0, beta_range, model, **kwargs):
"""
Initialization method, must provide the observed network in form of adjacency matrix,
the initial optimization parameters and the range over which to span $\beta$.
args:
A (numpy.array): The observed adjacency matrix
x0 (numpy.array): The initial value of the optimization parameters (also called θ_0)
beta_range (numpy.array, list): The values for which to run optimization
"""
self.A = A
self.x0 = x0
self.beta_range = beta_range
self.L = graph_laplacian(A)
self.model = model
self.step_callback = None
super().__init__(**kwargs)
def gradient(self, x, rho, beta):
"""
This method computes the gradient as
:math:`\\frac{s(\\boldsymbol \\rho \| \\boldsymbol \\sigma)}{\\partial \\boldsymbol \\theta_k} = \\beta \textrm{Tr}\left \lbrack \left( \rho - \sigma(\theta)\right) \frac{\mathbb{E}\mathbf{L}(\theta)}{\partial \theta_k} \right \rbrack`
args:
x (numpy.array): the current parameters
rho (numpy.array): the observed density matrix
beta (float): the beta, a positive real.
Returns:
the gradient as a three index numpy array. The last index is the one pertaining to the k-th component of x
"""
sigma = compute_vonneuman_density(graph_laplacian(self.model.expected_adjacency(x)), beta)
# Here instead of tracing over the matrix product, we just sum over the entrywise product of the two matrices
# (rho-sigma) and the ∂E_θ[L]/.
return np.array([np.sum(np.multiply(rho - sigma, self.model.expected_laplaplacian_grad(x)[:, i, :].T)) for i in
range(0, len(self.x0))])
def run(self, **kwargs):
"""
Starts the optimization.
Args:
method (string): 'BFGS'
if the optimization problem is bounded, instead use one of the scipy constrained optimizer,
which are 'L-BFGS-B', 'TNC', 'SLSQP' or 'least_squares'
Kwargs:
gtol: (float)
gradient tolerance to be used in optimization (default 1E-12)
maxfun (int):
maximum number of function evaluations
maxiter (int):
maximum number of iterations of gradient descent
xtol (float):
tolerance in the solution change
Output:
sol: (scipy.optimize.OptimizeResult) parameters at optimal likelihood
"""
self.method = kwargs.get('method', 'BFGS')
if self.model.bounds is not None and self.method not in ['L-BFGS-B', 'TNC', 'SLSQP', 'least_squares']:
raise RuntimeWarning('This model has bounds. BFGS cannot handle constraints nor bounds, switch to either '
'L-BFGS-B, TNC or SLSQP')
# Populate the solution list as function of beta
# the list sol contains all optimization points
sol = []
# Iterate over all beta provided by the user
for beta in self.beta_range:
# define the relative entropy function, dependent on current beta
self.rel_entropy_fun = lambda x: SpectralDivergence(Lobs=self.L,
Lmodel=graph_laplacian(self.model.expected_adjacency(x)),
beta=beta).rel_entropy
# Define initially fgrad as None, relying on numerical computation of it
fgrad = None
# Define observed density rho as none initially, if necessary it is computed
rho = None
# If user provides gradients of the model, use them, redefyining fgrad to pass to solver
if self.model.expected_laplaplacian_grad is not None:
# compute rho for each beta, just once
rho = compute_vonneuman_density(L=self.L, beta=beta)
# define the gradient of the Dkl, given rho at current beta
fgrad = lambda x: self.gradient(x, rho, beta)
# Append the solutions
# Here we adopt two different strategies, either minimizing the residual of gradients of Dkl
# using least_squares or minimizing the Dkl itself.
# The least_squares approach requires the gradients of the model, if they are not implemented
# in the model, the algorithmic differentiation is used, which is slower
sol.append(minimize(fun=self.rel_entropy_fun,
x0=self.x0,
jac=fgrad,
method=self.method,
options={'disp': kwargs.get('disp', False),
'gtol': kwargs.get('gtol', 1E-12),
'maxiter': kwargs.get('maxiter', 5E4),
'maxfun': kwargs.get('maxfun', 5E4)},
bounds=self.model.bounds))
# important to reinitialize from the last solution, solution is restarted at every step otherwise
if kwargs.get('reinitialize', True):
self.x0 = sol[-1].x
# Call the step_callback function to print or display current solution
if self.step_callback is not None:
self.step_callback(beta, sol[-1].x)
# Here creates the output data structure as a dictionary of the optimization parameters and variables
spect_div = SpectralDivergence(Lobs=self.L, Lmodel=graph_laplacian(self.model.expected_adjacency(sol[-1].x)),
beta=beta)
sol[-1]['DeltaL'] = (np.trace(self.L) - np.trace(graph_laplacian(self.model.expected_adjacency(sol[-1].x)))) / 2
if kwargs.get('compute_sigma', False):
Lmodel = graph_laplacian(self.model.expected_adjacency(sol[-1].x))
rho = compute_vonneuman_density(L=self.L, beta=beta)
sigma = compute_vonneuman_density(L=Lmodel, beta=beta)
sol[-1]['<DeltaL>'] = np.sum((rho-sigma)*Lmodel)
sol[-1]['T'] = 1 / beta
sol[-1]['beta'] = beta
sol[-1]['loglike'] = spect_div.loglike
sol[-1]['rel_entropy'] = spect_div.rel_entropy
sol[-1]['entropy'] = spect_div.entropy
sol[-1]['AIC'] = 2 * len(self.model.expected_adjacency.args_mapping) - 2 * sol[-1]['loglike']
for i in range(0, len(self.model.expected_adjacency.args_mapping)):
sol[-1][self.model.expected_adjacency.args_mapping[i]] = sol[-1].x[i]
self.sol = sol
return sol
def summary(self, to_dataframe=False):
"""
A convenience function to summarize all the optimization process, with results of optimization.
args:
to_dataframe (bool): if True, returns a pandas dataframe, otherwise returns a list of dictionaries
"""
if to_dataframe:
import pandas as pd
return pd.DataFrame(self.sol).set_index('T') # it's 1/beta
else:
s = "{:>20} " * (len(self.model.expected_adjacency.args_mapping) + 1)
print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
print('Summary:')
print('Model:\t' + str(self.model.expected_adjacency.formula))
print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
print('Optimization method: ' + self.method)
print('Variables bounds: ')
for i, b in enumerate(self.bounds):
left = '-∞' if self.bounds[i][0] is None else str(
self.bounds[i][0])
right = '+∞' if self.bounds[i][1] is None else str(
self.bounds[i][1])
print("{: >1} {:} {: >10} {:} {: >1}".format(
left, '<=', self.model.expected_adjacency.args_mapping[i], '<=', right))
print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
print('Results:')
print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
print('Von Neumann Log Likelihood:\t' +
str(self.sol[-1]['loglike']))
print('Von Neumann Entropy:\t\t' + str(self.sol[-1]['entropy']))
print('Von Neumann Relative entropy:\t' +
str(self.sol[-1]['rel_entropy']))
print('AIC:\t\t\t\t' + str(self.sol[-1]['AIC']))
print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
print('Estimate:')
print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
print(s.format('beta', *self.model.expected_adjacency.args_mapping))
for i in range(0, len(self.sol)):
row = [str(x) for x in self.sol[i].x]
print(s.format(self.sol[i]['beta'], *row))
[docs]class StochasticOptimizer:
"""
This class is at the base of implementation of methods based on stochastic gradient descent.
The idea behind this class is to help the user in designing a nice stochastic gradient descent method,
such as ADAM, AdaGrad or older methods, like the Munro-Robbins stochastic gradients optimizer.
Working out the expression for the gradients of the relative entropy, one remains with the following:
:math: `\nabla_{\theta}S(\rho \| \sigma) = \beta \textrm\biggl \lbrack \rho \nabla_{\theta}\mathbb{E}_{\theta}[L]} \biggr \rbrack`
:math: `\frac{\partial S(\rho \| \sigma)}{\partial \theta_k} = \beta \textrm{Tr}\lbrack \rho \frac{\partial}{\partial \theta_k} \rbrack + \frac{\partial}{\partial \theta_k}\mathbb{E}_{\theta}\log \textrm{Tr} e^{-\beta L(\theta)}\lbrack \rbrack`
This class requires either Tensorflow or Pytorch to support backpropagation in the eigenvalues routines.
Alternatively you can use github.com/HIPS/autograd method for full CPU support.
"""
[docs] def __init__(self, G: np.array, x0 : np.array, model : GraphModel, **kwargs):
"""
Initialize the stochastic optimizer with the observed graph, an initial guess and the
model to optimize.
Args:
G (numpy.array) :is the empirical network to study. A N x N adjacency matrix as numpy.array.
x0 (numpy.array): is the k-element array of initial parameters estimates. Typically set as random.
model (nq.GraphModel): the graph model to optimize the likelihood of.
"""
self.G = G
self.L = graph_laplacian(self.G) # compute graph laplacian
self.x0 = x0
self.model = model
self.sol = OptimizeResult(x=self.x0,
success=False,
status=None,
message='',
fun=None,
jac=None,
hess=None,
hess_inv=None,
nfev=0, # number of function evaluations
njev=0, # number of jacobian evaluations
nhev=0, # number of hessian evaluations
nit=0, # number of iterations
maxcv=-1) # maximum constraint violation
self.model.ls_bounds = np.ravel(self.model.bounds).astype(float)
self.model.ls_bounds[np.isnan(self.model.ls_bounds)] = np.inf
self.model.min_bounds = self.model.ls_bounds.reshape([len(self.x0),2])[:,0]
self.model.max_bounds = self.model.ls_bounds.reshape([len(self.x0),2])[:,1]
def gradient(self, x, rho, beta, batch_size=1):
# Compute the relative entropy between rho and sigma, using tensors with autograd support
# Entropy of rho is constant over the iterations
# if beta is kept constant, otherwise a specific rho can be passed
#lambd_rho = eigh(rho)[0]
#entropy = -np.sum(lambd_rho * np.log(lambd_rho))
def log_likelihood(z):
N = len(self.G)
# advanced broadcasting here!
# Sample 'batch_size' adjacency matrices shape=[batch_size,N,N]
Amodel = self.model.sample_adjacency(theta=z, batch_size=batch_size, with_grads=True)
#print('Amodel nan?:', np.any(np.isnan(Amodel.ravel())))
# Here exploit broadcasting to create batch_size diagonal matrices with the degrees
Dmodel = np.eye(N) * np.transpose(np.zeros([1, 1, N]) + np.einsum('ijk->ik', Amodel, optimize=True), [1, 0, 2])
Lmodel = Dmodel - Amodel # returns a batch_size x N x N tensor
# do average over batches of the sum of product of matrix elements (done with * operator)
Emodel = np.mean(np.sum(np.sum(Lmodel * rho, axis=2), axis=1))
lambd_model = eigh(Lmodel)[0] # eigh is a batched operation, take the eigenvalues only
Fmodel = - np.mean(logsumexp(-beta * lambd_model, axis=1) / beta)
loglike = -beta * Emodel + Fmodel # - Tr[rho log(sigma)]
#loglike #- entropy # Tr[rho log(rho)] - Tr[rho log(sigma)]
# Add a penalty term accouting for boundaries violation
# Follows these ideas https://www.cs.jhu.edu/~ijwang/pub/01271742.pdf
def quadratic_penalty(theta):
z = np.zeros(theta.shape)
# compute the penalty with respect to low bounds violation
penalty_low = np.sum(np.max(np.hstack([z, -theta + self.model.min_bounds]))**2)
# compute the penalty with respect to high bounds violation
penalty_high = np.sum(np.max(np.hstack([z, theta - self.model.max_bounds]))**2)
return 0.5 * (penalty_low + penalty_high)
def absolute_value_penalty(theta):
z = np.zeros(theta.shape)
# compute the penalty with respect to low bounds violation
penalty_low = np.max(np.hstack([z, -theta + self.model.min_bounds]))
# compute the penalty with respect to high bounds violation
penalty_high = np.max(np.hstack([z, theta - self.model.max_bounds]))
return np.max(np.hstack(penalty_low,penalty_high))
penalty = quadratic_penalty
cost = -loglike + beta*penalty(z)
return cost
# value and gradient of relative entropy as a function
dkl_and_dkldx = autograd.value_and_grad(log_likelihood)
return dkl_and_dkldx(x)
[docs]class Adam(StochasticOptimizer):
"""
Implements the ADAM stochastic gradient descent.
Adam: A Method for Stochastic Optimization
Diederik P. Kingma, Jimmy Ba
https://arxiv.org/abs/1412.6980
However here we use quasi-hyperbolic adam by default, with parameters nu1,nu2
https://arxiv.org/pdf/1810.06801.pdf
"""
[docs] def __init__(self, G: np.array, x0 : np.array, model : GraphModel, **kwargs):
super().__init__(G, x0, model, **kwargs)
self.logfile = open('adam.log','w')
self.mt, self.vt = np.zeros(self.sol.x.shape), np.zeros(self.sol.x.shape)
def run(self,
beta,
rho=None,
maxiter=1E4,
learning_rate=1E-3,
beta1=0.9,
beta2=0.999,
nu1=0.7,
nu2=1.0,
epsilon=1E-8,
batch_size=64,
ftol=1E-10,
gtol=1E-5,
xtol=1E-8,
last_iters = 100,
quasi_hyperbolic = True,
callback=None,
**kwargs):
opts = {'ftol': ftol,
'gtol': gtol, # as default of LBFGSB
'xtol': xtol,
'maxiter': maxiter,
'eps': kwargs.get('eps', 1E-8), # as default of LBFGSB
'maxfun': kwargs.get('maxfun', 1E10),
'verbose' : kwargs.get('verbose', 0),
'disp': bool(kwargs.get('verbose', 0)),
'iprint': kwargs.get('verbose', 0)
}
# if rho is provided, user rho is used, otherwise is computed at every beta
if rho is None:
rho = compute_vonneuman_density(L=self.L, beta=beta)
#logging.basicConfig(stream=sys.stderr, level=logging.INFO)
#import logging
#adam_logger = logging.getLogger('ADAM')
#adam_logger.setLevel(logging.INFO)
#all_dkl = []
#from scipy.stats import linregress
for t in range(1,int(maxiter)+1):
self.sol.nit += 1
# get the relative entropy value and its gradient w.r.t. variables
dkl, grad_t = self.gradient(self.sol.x, rho, beta, batch_size=batch_size)
print('\rIteration %d beta=%.3f' % (t,beta),end='')
# Convergence status
if np.linalg.norm(grad_t) < gtol:
self.sol.success = True
self.sol.message = 'Exceeded minimum gradient |grad| < %g' % gtol
break
#all_dkl.append(dkl)
# check the convergence based on the slope of dkl of the last 10 iterations
#if t > last_iters:
# dkl_iters = np.arange(last_iters)
# slope, intercept, r_value, p_value, std_err = linregress(x=dkl_iters, y=all_dkl[-last_iters:])
# if np.abs(slope - 1E-3) < 0:
# converged = True
# print('Optimization terminated for flatness')
self.mt = beta1 * self.mt + (1.0 - beta1) * grad_t
self.vt = beta2 * self.vt + (1.0 - beta2) * (grad_t * grad_t)
mttilde = self.mt / (1.0 - (beta1 ** t)) # compute bias corrected first moment estimate
vttilde = self.vt / (1.0 - (beta2 ** t)) # compute bias-corrected second raw moment estimate
if quasi_hyperbolic: # quasi hyperbolic adam
deltax = ((1-nu1) * grad_t + nu1 * mttilde)/(np.sqrt((1-nu2) * (grad_t**2) + nu2 * vttilde ) + epsilon)
else: # vanilla Adam
deltax = mttilde / np.sqrt(vttilde + epsilon)
self.sol.x -= learning_rate * deltax
self.logfile.write( u"%g\t%g\t%g\n" % (self.sol.x,beta,dkl) )
if t % 50:
self.logfile.flush()
return self.sol
# def run(self, **kwargs):
# x = self.x0
# batch_size = kwargs.get('batch_size', 1)
# max_iters = kwargs.get('max_iters', 1000)
# # ADAM parameters
# eta = kwargs.get('eta', 1E-3)
# gtol = kwargs.get('gtol', 1E-4)
# xtol = kwargs.get('xtol', 1E-3)
# beta1 = 0.9
# beta2 = 0.999
# epsilon = 1E-8 # avoid nan with large learning rates
# # Use quasi-hyperbolic adam by default
# # https://arxiv.org/pdf/1810.06801.pdf
# quasi_hyperbolic = kwargs.get('quasi_hyperbolic', True)
# nu1 = 0.7 # for quasi-hyperbolic adam
# nu2 = 1.0 # for quasi-hyperbolic adam
# # visualization options
# refresh_frames = kwargs.get('refresh_frames', 100)
# from drawnow import drawnow, figure
# figure(figsize=(8,8))
# import matplotlib.pyplot as plt
# #plt.figure(figsize=(8, 8))
# # Populate the solution list as function of beta
# # the list sol contains all optimization points
# sol = []
# # Iterate over all beta provided by the user
# mt, vt = np.zeros(self.x0.shape), np.zeros(self.x0.shape)
# all_dkl = []
# # TODO implement model boundaries in Adam
# frames = 0
# filename = 'adam.log'
# logfile = open(filename,'w')
# from scipy.stats import linregress
# import logging
# import sys
# logging.basicConfig(stream=sys.stderr, level=logging.INFO)
# adam_logger = logging.getLogger('ADAM')
# adam_logger.setLevel(logging.INFO)
# for beta in self.beta_range:
# adam_logger.info('Changed beta to %g' % beta)
# # if rho is provided, user rho is used, otherwise is computed at every beta
# rho = kwargs.get('rho', compute_vonneuman_density(L=self.L, beta=beta))
# # initialize at x0
# x = self.x0
# converged = False
# t = 0 # t is the iteration number
# while not converged:
# t += 1
# # get the relative entropy value and its gradient w.r.t. variables
# dkl, grad_t = self.gradient(x, rho, beta, batch_size=batch_size)
# # Convergence status
# if np.linalg.norm(grad_t) < gtol:
# converged = True
# adam_logger.info('Exceeded minimum gradient |grad|<%g' % gtol)
# if t > max_iters:
# adam_logger.info('Exceeded maximum iterations')
# converged = True
# # check the convergence based on the slope of dkl of the last 10 iterations
# last_iters = 100
# if len(all_dkl) > last_iters:
# dkl_iters = np.arange(last_iters)
# slope, intercept, r_value, p_value, std_err = linregress(x=dkl_iters, y=all_dkl[-last_iters:])
# if np.abs(slope - 1E-3) < 0:
# converged = True
# adam_logger.info('Optimization terminated for flatness')
# # TODO implement check boundaries in Adam
# # if np.any(np.ravel(self.model.bounds)):
# # raise RuntimeError('variable bounds exceeded')
# mt = beta1 * mt + (1.0 - beta1) * grad_t
# vt = beta2 * vt + (1.0 - beta2) * (grad_t * grad_t)
# mttilde = mt / (1.0 - (beta1 ** t)) # compute bias corrected first moment estimate
# vttilde = vt / (1.0 - (beta2 ** t)) # compute bias-corrected second raw moment estimate
# if quasi_hyperbolic: # quasi hyperbolic adam
# deltax = ((1-nu1) * grad_t + nu1 * mttilde)/(np.sqrt((1-nu2) * (grad_t**2) + nu2 * vttilde ) + epsilon)
# else: # vanilla Adam
# deltax = mttilde / np.sqrt(vttilde + epsilon)
# x -= eta * deltax
# #all_dkl.append(dkl)
# #print(np.hstack([t, x,beta,dkl]))
# #logfile.write( u"%g\t%g\t%g\n" % (x,beta,dkl) )
# # if t % refresh_frames == 0:
# # frames += 1
# # def draw_fig():
# # plot_beta_range = np.logspace(-3,3,100)
# # sol.append({'x': x.copy()})
# # #plt.figure(figsize=(8, 8))
# # A0 = np.mean(self.model.sample_adjacency(theta=x, batch_size=batch_size), axis=0)
# # plt.subplot(2, 2, 1)
# # im = plt.imshow(self.A)
# # plt.colorbar(im)
# # plt.title('Data')
# # plt.subplot(2, 2, 2)
# # im = plt.imshow(A0)
# # plt.colorbar(im)
# # plt.title('<Model>')
# # plt.subplot(2, 2, 3)
# # plt.plot(all_dkl)
# # plt.xlabel('iteration')
# # plt.ylabel('$S(\\boldsymbol \\rho,\\boldsymbol \\sigma)$')
# # plt.subplot(2, 2, 4)
# # plt.semilogx(plot_beta_range, batch_compute_vonneumann_entropy(self.L, plot_beta_range), '.-', label='data')
# # plt.semilogx(plot_beta_range, batch_compute_vonneumann_entropy(graph_laplacian(A0), plot_beta_range), '.-', label='model')
# # plt.plot(beta, batch_compute_vonneumann_entropy(graph_laplacian(A0), [beta]), 'ko', label='model')
# # plt.xlabel('$\\beta$')
# # plt.ylabel('$S$')
# # plt.title('Entropy')
# # plt.legend(loc='best')
# # plt.suptitle('$\\beta=$' + '{0:0>3}'.format(beta))
# # #plt.tight_layout()
# # drawnow(draw_fig)
# self.sol = sol
# adam_logger.info('Optimization finished\n' + str(self.sol))
# return sol