Source code for networkqit.algorithms.optimize

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# networkqit -- a python module for manipulations of spectral entropies framework
#
# Copyright (C) 2017-2018 Carlo Nicolini <carlo.nicolini@iit.it>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""

Define the base and inherited classes for model optimization, both in the continuous approximation
and for the stochastic optimization.

The `ModelOptimizer` class defines the base class where all the other optimization classes must inherit.
The most important class to optimize an expected adjacency model is `ContinuousOptimizer`.
In this class the gradients are defined as:

.. math::
    
        \\frac{\\partial S(\\boldsymbol \\rho \\| \\boldsymbol \\sigma(\\mathbb{E}_{\\boldsymbol \\theta}[\\mathbf{L}]))}{\\partial \\boldsymbol \\theta_k} = \\beta \\textrm{Tr}\\biggl \\lbrack \\left(\\boldsymbol \\rho - \\boldsymbol \\sigma(\\mathbb{E}_{\\boldsymbol \\theta}[L])\\right)\\frac{\\partial \\mathbb{E}_{\\boldsymbol \\theta}[\mathbf{L}]}{\\partial \\theta_k} \\biggr \\rbrack

In the `StochasticOptimizer` class we instead address the issue to implement stochastic gradient descent methods.
In these methods the gradients are defined as:

.. math::
   
   \\frac{\\partial \\mathbb{E}_{\\boldsymbol \\theta}[S(\\boldsymbol \\rho \\| \\boldsymbol \\sigma)]}{\\partial \\theta_k} = \\beta \\textrm{Tr}\\biggl \\lbrack \\boldsymbol \\rho \\frac{\\partial  \\mathbb{E}_{\\boldsymbol \\theta}[L]}{\\partial \\boldsymbol \\theta_k}\\biggr \\rbrack + \\frac{\\partial}{\\partial \\theta_k}\\mathbb{E}_{\\boldsymbol \\theta}\\biggl \\lbrack \\log \\left( \\textrm{Tr}\\left \\lbrack e^{-\\beta L(\\boldsymbol \\theta)} \\right \\rbrack \\right) \\biggr \\rbrack

The stochastic optimizer is the **correct** optimizer, as it makes no approximation on the Laplacian eigenvalues.
It is more suitable for small graphs and intermediate $\\beta$, where the differences between the random matrix s
pectrum and its expected counterpart are non-neglibile.
For large and dense enough graphs however the `ContinuousOptimizer` works well and yields deterministic results,
as the optimization landscape is smooth.

In order to minimize the expected relative entropy then, we need both the expected Laplacian formula, which is simple
to get, and a way to estimate the second summand in the gradient, that involves averaging over different realizations
of the log trace of  $e^{-\\beta L(\\boldsymbol \\theta)}$.
A good the approximation to the expected logtrace $\\mathbb{E}_{\\boldsymbol \\theta}[\\log \\textrm{Tr}[\\exp{(-\\beta L)}]]$ is,
makes a better is the estimate of the gradients.

Finally, the `MLEOptimizer` maximizes the standard likelihood of a model and it is not related to the spectral entropies
framework introduced in the paper on which **networkqit** is based.

"""
#    Copyright (C) 2018 by
#    Carlo Nicolini <carlo.nicolini@iit.it>
#    All rights reserved.
#    BSD license.

import autograd
import autograd.numpy as np
from autograd.numpy.linalg import eigh
from autograd.scipy.misc import logsumexp
from networkqit.graphtheory.matrices import softmax
from scipy.optimize import minimize, least_squares, OptimizeResult
from networkqit.graphtheory import * # imports GraphModel
from networkqit.infotheory.density import *

__all__ = ['MLEOptimizer',
           'ContinuousOptimizer',
           'StochasticOptimizer',
           'Adam']

[docs]class MLEOptimizer:
    """
    This class, inheriting from the model optimizer class solves the problem of 
    maximum likelihood parameters estimation in the classical settings.
    """

[docs]    def __init__(self, G: np.array, x0: np.array, model: GraphModel, **kwargs):
        """
        Initialize the optimizer with the observed graph, an initial guess and the 
        model to optimize.

        Args:
            G (numpy.array) :is the empirical network to study. A N x N adjacency matrix as numpy.array.
            x0 (numpy.array): is the k-element array of initial parameters estimates. Typically set as random.
            model (nq.GraphModel): the graph model to optimize the likelihood of.
        """
        self.G = G
        self.x0 = x0
        self.model = model
        self.sol = None
        super().__init__(**kwargs)

    def run(self, method, ftol=1E-10, gtol=1E-5, xtol=1E-8, maxiter=1E4, **kwargs):
        """
        Maximimize the likelihood of the model given the observed network G.

        Args:
            method (str): optimization method to use. Can be either 'MLE' or 'saddle_point'.
                          'MLE' uses L-BFGS-B method to optimize the likelihood of the model
                          'saddle_point' finds the saddle point solution solving a system of
                          nonlinear equation. This method finds the parameters such that the
                          gradients are zero. Here we adopt two different methods.
                          Models leading to convex likelihood are solved by a direct call to
                          the least-square optimizer, where we minimize the square error cost:
                          $sum_i (c_i^* - <c_i>)^2$
                          where c_i is the empirical value of the specific constraint from MaxEnt
                          and <c_i> is its ensemble average as defined from the model.
                          Empirically we found that the 'dogbox' method works best in most cases
                          with a nice tradeoff between speed and precision.

            xtol:       (float) set the termination tolerance on parameters
                        (default 1E-10)

            gtol:       (float) set the termination tolerance on gradients
                        (default 1E-10)

            maxiter:    (int) set the maximum number of iteration. Default 10.000 iterations

        Kwargs:
            maxfun:     (int) set the maximum number of evaluations of the cost.
                        Default value is very large ten thousand (1E4)

            verbose:    (int) Verbosity level. No output=0, 2=iterations output.

            basin_hopping: (bool) whether to run global optimization with local
                optimization by least_squares (only for saddle point method)

            basin_hopping_niter: (int) number of basin hopping repetitions

        Outputs:
            sol: (scipy.optimize.OptimizeResult) parameters at optimal likelihood
        """

        opts = {'ftol': ftol,
                'gtol': gtol, # as default of LBFGSB
                'xtol': xtol,
                'maxiter': maxiter,
                'eps': kwargs.get('eps', 1E-8), # as default of LBFGSB
                'maxfun': kwargs.get('maxfun', 1E10),
                'verbose' : kwargs.get('verbose', 0),
                'disp':  bool(kwargs.get('verbose', 0)),
                'iprint': kwargs.get('verbose', 0),
                'maxls' : 100
                }

        if method is 'MLE':
            # If the model has non-linear constraints, must use Sequential Linear Square Programming SLSQP
            from autograd import jacobian # using automatic jacobian from autograd
            J = jacobian(lambda z : -self.model.loglikelihood(self.G,z))
            if hasattr(self.model, 'constraints'):
                #H=hessian(lambda z : -self.model.loglikelihood(self.G,z))
                # remove some options to avoid warnings
                opts.pop('gtol'), opts.pop('verbose'), opts.pop('xtol'),opts.pop('maxfun')
                self.sol = minimize(fun=lambda z: -self.model.loglikelihood(self.G, z),
                                    x0=np.squeeze(self.x0),
                                    method='SLSQP',
                                    constraints={'fun':self.model.constraints, 'type':'ineq'},
                                    bounds=self.model.bounds,
                                    jac=J,
                                    tol=opts['ftol'],
                                    options=opts)
            else: # the model has only bound-constraints, hence use L-BFGS-B
                # Optimize using L-BFGS-B which typically returns good results
                opts.pop('xtol'), opts.pop('verbose')
                self.sol = minimize(fun=lambda z: -self.model.loglikelihood(self.G, z),
                                    x0=np.squeeze(self.x0),
                                    method='L-BFGS-B',
                                    jac=J,
                                    bounds=self.model.bounds,
                                    options=opts)
            if kwargs.get('verbose',0)>0:
                print(self.sol['message'])
                if self.sol['status'] != 0:
                    RuntimeWarning(self.sol['message'])
                #raise Exception('Method did not converge to maximum likelihood: ')
        elif method is 'saddle_point':
            if hasattr(self.model, 'constraints'):
                raise RuntimeError('Cannot solve saddle_point_equations with non-linear constraints')
            # Use the Dogbox method to optimize likelihood
            #ls_bounds = [np.min(np.ravel(self.model.bounds)), np.max(np.ravel(self.model.bounds))]
            ls_bounds = np.ravel(self.model.bounds).astype(float)
            ls_bounds[np.isnan(ls_bounds)] = np.inf
            ls_bounds = np.reshape(ls_bounds,[len(self.x0),2]).T.tolist() # it is required in a different format
            def basin_opt(*basin_opt_args, **basin_opt_kwargs):
                opt_result = least_squares(fun=basin_opt_args[0],
                                           x0=np.squeeze(basin_opt_args[1]),
                                           bounds=ls_bounds,
                                           method='trf',
                                           xtol=opts['xtol'],
                                           gtol=opts['gtol'],
                                           tr_solver='lsmr',
                                           max_nfev=opts['maxiter'],
                                           verbose=opts['verbose'])
                # use this form with linear loss as in the basinhopping
                # func argument to be consistent
                opt_result['fun'] = 0.5 * np.sum(opt_result['fun'] ** 2)
                return opt_result

            # If the model is convex, we simply run least_squares
            if not kwargs.get('basinhopping', False):
                self.sol = basin_opt(lambda z: self.model.saddle_point(self.G, z), self.x0)
            else:
                # otherwise combine local and global optimization with basinhopping
                nlineq = lambda z: self.model.saddle_point(self.G, z)
                from .basinhoppingmod import basinhopping, BHBounds, BHRandStepBounded
                bounds = BHBounds(xmin=ls_bounds[0], xmax=ls_bounds[1])
                bounded_step = BHRandStepBounded(ls_bounds[0], ls_bounds[1], stepsize=0.5)
                self.sol = basinhopping(func=lambda z: (nlineq(z)**2).sum(),
                                        x0=np.squeeze(self.x0),
                                        T=kwargs.get('T', 1),
                                        minimize_routine=basin_opt,
                                        minimizer_kwargs={'saddle_point_equations': nlineq },
                                        accept_test=bounds,
                                        take_step=bounded_step,
                                        niter=kwargs.get('basin_hopping_niter', 5),
                                        disp=bool(kwargs.get('verbose')))
        
        else:
            raise RuntimeError('Only MLE and saddle_point methods are supported')
        
        # Compute the corrected Akaike information and Bayes information criteria
        # http://downloads.hindawi.com/journals/complexity/2019/5120581.pdf
        K = len(self.sol['x'])
        N = len(self.G)
        n = N*(N-1)/2 # n is the sample size
        # Both AIC and BIC are minimum for the best explanatory model
        L = self.model.loglikelihood(self.G,self.sol['x'])
        self.sol['AIC'] = -2*L + 2*K + (2*K*(K+1)) / (n-K+1)
        self.sol['BIC'] = -2*L + K*np.log(n)
        return self.sol


[docs]class ContinuousOptimizer:
    """
    Continuos optimization method of spectral entropy in the continuous approximation S(rho, sigma(E[L])))
    """

[docs]    def __init__(self, A, x0, beta_range, model, **kwargs):
        """
        Initialization method, must provide the observed network in form of adjacency matrix,
        the initial optimization parameters and the range over which to span $\beta$.

        args:
            A (numpy.array): The observed adjacency matrix
            x0 (numpy.array): The initial value of the optimization parameters (also called θ_0)
            beta_range (numpy.array, list): The values for which to run optimization
        """
        self.A = A
        self.x0 = x0
        self.beta_range = beta_range
        self.L = graph_laplacian(A)
        self.model = model
        self.step_callback = None
        super().__init__(**kwargs)

    def gradient(self, x, rho, beta):
        """
        This method computes the gradient as 
        
        :math:`\\frac{s(\\boldsymbol \\rho \| \\boldsymbol \\sigma)}{\\partial \\boldsymbol \\theta_k} = \\beta \textrm{Tr}\left \lbrack \left( \rho - \sigma(\theta)\right) \frac{\mathbb{E}\mathbf{L}(\theta)}{\partial \theta_k} \right \rbrack`
        
        args:
            x (numpy.array): the current parameters
            rho (numpy.array): the observed density matrix
            beta (float): the beta, a positive real.

        Returns:
            the gradient as a three index numpy array. The last index is the one pertaining to the k-th component of x
        """
        sigma = compute_vonneuman_density(graph_laplacian(self.model.expected_adjacency(x)), beta)
        # Here instead of tracing over the matrix product, we just sum over the entrywise product of the two matrices
        # (rho-sigma) and the ∂E_θ[L]/.
        return np.array([np.sum(np.multiply(rho - sigma, self.model.expected_laplaplacian_grad(x)[:, i, :].T)) for i in
                         range(0, len(self.x0))])

    def run(self, **kwargs):
        """
        Starts the optimization.

        Args:
            method (string): 'BFGS'
            if the optimization problem is bounded, instead use one of the scipy constrained optimizer,
            which are 'L-BFGS-B', 'TNC', 'SLSQP' or 'least_squares'
        
        Kwargs:

            gtol: (float)
                gradient tolerance to be used in optimization (default 1E-12)

            maxfun (int):
                maximum number of function evaluations

            maxiter (int):
                maximum number of iterations of gradient descent

            xtol (float):
                tolerance in the solution change

        Output:
            sol: (scipy.optimize.OptimizeResult) parameters at optimal likelihood
        """

        self.method = kwargs.get('method', 'BFGS')
        if self.model.bounds is not None and self.method not in ['L-BFGS-B', 'TNC', 'SLSQP', 'least_squares']:
            raise RuntimeWarning('This model has bounds. BFGS cannot handle constraints nor bounds, switch to either '
                                 'L-BFGS-B, TNC or SLSQP')

        # Populate the solution list as function of beta
        # the list sol contains all optimization points
        sol = []
        # Iterate over all beta provided by the user
        for beta in self.beta_range:
            # define the relative entropy function, dependent on current beta
            self.rel_entropy_fun = lambda x: SpectralDivergence(Lobs=self.L,
                                                                Lmodel=graph_laplacian(self.model.expected_adjacency(x)),
                                                                beta=beta).rel_entropy

            # Define initially fgrad as None, relying on numerical computation of it
            fgrad = None

            # Define observed density rho as none initially, if necessary it is computed
            rho = None

            # If user provides gradients of the model, use them, redefyining fgrad to pass to solver
            if self.model.expected_laplaplacian_grad is not None:
                # compute rho for each beta, just once
                rho = compute_vonneuman_density(L=self.L, beta=beta)
                # define the gradient of the Dkl, given rho at current beta
                fgrad = lambda x: self.gradient(x, rho, beta)

            # Append the solutions
            # Here we adopt two different strategies, either minimizing the residual of gradients of Dkl
            # using least_squares or minimizing the Dkl itself.
            # The least_squares approach requires the gradients of the model, if they are not implemented 
            # in the model, the algorithmic differentiation is used, which is slower

            sol.append(minimize(fun=self.rel_entropy_fun,
                                x0=self.x0,
                                jac=fgrad,
                                method=self.method,
                                options={'disp': kwargs.get('disp', False),
                                         'gtol': kwargs.get('gtol', 1E-12),
                                         'maxiter': kwargs.get('maxiter', 5E4),
                                         'maxfun': kwargs.get('maxfun', 5E4)},
                                bounds=self.model.bounds))

            # important to reinitialize from the last solution, solution is restarted at every step otherwise
            if kwargs.get('reinitialize', True):
                self.x0 = sol[-1].x
            # Call the step_callback function to print or display current solution
            if self.step_callback is not None:
                self.step_callback(beta, sol[-1].x)

            # Here creates the output data structure as a dictionary of the optimization parameters and variables
            spect_div = SpectralDivergence(Lobs=self.L, Lmodel=graph_laplacian(self.model.expected_adjacency(sol[-1].x)),
                                           beta=beta)
            sol[-1]['DeltaL'] = (np.trace(self.L) - np.trace(graph_laplacian(self.model.expected_adjacency(sol[-1].x)))) / 2
            if kwargs.get('compute_sigma', False):
                Lmodel = graph_laplacian(self.model.expected_adjacency(sol[-1].x))
                rho = compute_vonneuman_density(L=self.L, beta=beta)
                sigma = compute_vonneuman_density(L=Lmodel, beta=beta)
                sol[-1]['<DeltaL>'] = np.sum((rho-sigma)*Lmodel)
            sol[-1]['T'] = 1 / beta
            sol[-1]['beta'] = beta
            sol[-1]['loglike'] = spect_div.loglike
            sol[-1]['rel_entropy'] = spect_div.rel_entropy
            sol[-1]['entropy'] = spect_div.entropy
            sol[-1]['AIC'] = 2 * len(self.model.expected_adjacency.args_mapping) - 2 * sol[-1]['loglike']
            for i in range(0, len(self.model.expected_adjacency.args_mapping)):
                sol[-1][self.model.expected_adjacency.args_mapping[i]] = sol[-1].x[i]
        self.sol = sol
        return sol

    def summary(self, to_dataframe=False):
        """
        A convenience function to summarize all the optimization process, with results of optimization.

        args:
            to_dataframe (bool): if True, returns a pandas dataframe, otherwise returns a list of dictionaries
        """
        if to_dataframe:
            import pandas as pd
            return pd.DataFrame(self.sol).set_index('T')  # it's 1/beta
        else:
            s = "{:>20} " * (len(self.model.expected_adjacency.args_mapping) + 1)
            print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
            print('Summary:')
            print('Model:\t' + str(self.model.expected_adjacency.formula))
            print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
            print('Optimization method: ' + self.method)
            print('Variables bounds: ')
            for i, b in enumerate(self.bounds):
                left = '-∞' if self.bounds[i][0] is None else str(
                    self.bounds[i][0])
                right = '+∞' if self.bounds[i][1] is None else str(
                    self.bounds[i][1])
                print("{: >1} {:} {: >10} {:} {: >1}".format(
                    left, '<=', self.model.expected_adjacency.args_mapping[i], '<=', right))
            print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
            print('Results:')
            print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
            print('Von Neumann Log Likelihood:\t' +
                  str(self.sol[-1]['loglike']))
            print('Von Neumann Entropy:\t\t' + str(self.sol[-1]['entropy']))
            print('Von Neumann Relative entropy:\t' +
                  str(self.sol[-1]['rel_entropy']))
            print('AIC:\t\t\t\t' + str(self.sol[-1]['AIC']))
            print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
            print('Estimate:')
            print('=' * 20 * (len(self.model.expected_adjacency.args_mapping) + 1))
            print(s.format('beta', *self.model.expected_adjacency.args_mapping))
            for i in range(0, len(self.sol)):
                row = [str(x) for x in self.sol[i].x]
                print(s.format(self.sol[i]['beta'], *row))


[docs]class StochasticOptimizer:
    """
    This class is at the base of implementation of methods based on stochastic gradient descent.
    The idea behind this class is to help the user in designing a nice stochastic gradient descent method,
    such as ADAM, AdaGrad or older methods, like the Munro-Robbins stochastic gradients optimizer.
    Working out the expression for the gradients of the relative entropy, one remains with the following:

    :math: `\nabla_{\theta}S(\rho \| \sigma) = \beta \textrm\biggl \lbrack \rho \nabla_{\theta}\mathbb{E}_{\theta}[L]} \biggr \rbrack`
        
    :math: `\frac{\partial S(\rho \| \sigma)}{\partial \theta_k} = \beta \textrm{Tr}\lbrack \rho \frac{\partial}{\partial \theta_k} \rbrack + \frac{\partial}{\partial \theta_k}\mathbb{E}_{\theta}\log \textrm{Tr} e^{-\beta L(\theta)}\lbrack \rbrack`
    
    This class requires either Tensorflow or Pytorch to support backpropagation in the eigenvalues routines.
    Alternatively you can use github.com/HIPS/autograd method for full CPU support.
    """

[docs]    def __init__(self, G: np.array, x0 : np.array, model : GraphModel, **kwargs):
        """
        Initialize the stochastic optimizer with the observed graph, an initial guess and the 
        model to optimize.

        Args:
            G (numpy.array) :is the empirical network to study. A N x N adjacency matrix as numpy.array.
            x0 (numpy.array): is the k-element array of initial parameters estimates. Typically set as random.
            model (nq.GraphModel): the graph model to optimize the likelihood of.
        """
        self.G = G
        self.L = graph_laplacian(self.G) # compute graph laplacian
        self.x0 = x0
        self.model = model
        self.sol = OptimizeResult(x=self.x0,
                                  success=False,
                                  status=None,
                                  message='',
                                  fun=None,
                                  jac=None,
                                  hess=None,
                                  hess_inv=None,
                                  nfev=0, # number of function evaluations
                                  njev=0, # number of jacobian evaluations
                                  nhev=0, # number of hessian evaluations
                                  nit=0,  # number of iterations
                                  maxcv=-1) # maximum constraint violation
        
        self.model.ls_bounds = np.ravel(self.model.bounds).astype(float)
        self.model.ls_bounds[np.isnan(self.model.ls_bounds)] = np.inf

        self.model.min_bounds = self.model.ls_bounds.reshape([len(self.x0),2])[:,0]
        self.model.max_bounds = self.model.ls_bounds.reshape([len(self.x0),2])[:,1]


    def gradient(self, x, rho, beta, batch_size=1):
        # Compute the relative entropy between rho and sigma, using tensors with autograd support
        # Entropy of rho is constant over the iterations
        # if beta is kept constant, otherwise a specific rho can be passed
        #lambd_rho = eigh(rho)[0]
        #entropy = -np.sum(lambd_rho * np.log(lambd_rho))

        def log_likelihood(z):
            N = len(self.G)
            # advanced broadcasting here!
            # Sample 'batch_size' adjacency matrices shape=[batch_size,N,N]
            Amodel = self.model.sample_adjacency(theta=z, batch_size=batch_size, with_grads=True)
            #print('Amodel nan?:', np.any(np.isnan(Amodel.ravel())))
            # Here exploit broadcasting to create batch_size diagonal matrices with the degrees
            Dmodel = np.eye(N) * np.transpose(np.zeros([1, 1, N]) + np.einsum('ijk->ik', Amodel, optimize=True), [1, 0, 2])
            Lmodel = Dmodel - Amodel  # returns a batch_size x N x N tensor
            # do average over batches of the sum of product of matrix elements (done with * operator)
            Emodel = np.mean(np.sum(np.sum(Lmodel * rho, axis=2), axis=1))
            lambd_model = eigh(Lmodel)[0]  # eigh is a batched operation, take the eigenvalues only
            Fmodel = - np.mean(logsumexp(-beta * lambd_model, axis=1) / beta)
            loglike = -beta * Emodel + Fmodel # - Tr[rho log(sigma)]
            #loglike  #- entropy # Tr[rho log(rho)] - Tr[rho log(sigma)]
            
            # Add a penalty term accouting for boundaries violation
            # Follows these ideas https://www.cs.jhu.edu/~ijwang/pub/01271742.pdf
            def quadratic_penalty(theta):
                z = np.zeros(theta.shape)
                # compute the penalty with respect to low bounds violation 
                penalty_low = np.sum(np.max(np.hstack([z, -theta + self.model.min_bounds]))**2)
                # compute the penalty with respect to high bounds violation 
                penalty_high = np.sum(np.max(np.hstack([z, theta - self.model.max_bounds]))**2)
                return 0.5 * (penalty_low + penalty_high)
            def absolute_value_penalty(theta):
                z = np.zeros(theta.shape)
                # compute the penalty with respect to low bounds violation 
                penalty_low = np.max(np.hstack([z, -theta + self.model.min_bounds]))
                # compute the penalty with respect to high bounds violation 
                penalty_high = np.max(np.hstack([z, theta - self.model.max_bounds]))
                return np.max(np.hstack(penalty_low,penalty_high))
            penalty = quadratic_penalty
            cost = -loglike + beta*penalty(z)
            return cost

        # value and gradient of relative entropy as a function
        dkl_and_dkldx = autograd.value_and_grad(log_likelihood)
        return dkl_and_dkldx(x)


[docs]class Adam(StochasticOptimizer):
    """
    Implements the ADAM stochastic gradient descent.
    Adam: A Method for Stochastic Optimization
    Diederik P. Kingma, Jimmy Ba

    https://arxiv.org/abs/1412.6980
    However here we use quasi-hyperbolic adam by default, with parameters nu1,nu2
    https://arxiv.org/pdf/1810.06801.pdf
    """
[docs]    def __init__(self, G: np.array, x0 : np.array, model : GraphModel, **kwargs):
        super().__init__(G, x0, model, **kwargs)
        self.logfile = open('adam.log','w')
        self.mt, self.vt = np.zeros(self.sol.x.shape), np.zeros(self.sol.x.shape)

    def run(self,
            beta,
            rho=None,
            maxiter=1E4,
            learning_rate=1E-3,
            beta1=0.9,
            beta2=0.999,
            nu1=0.7,
            nu2=1.0,
            epsilon=1E-8,
            batch_size=64,
            ftol=1E-10,
            gtol=1E-5,
            xtol=1E-8,
            last_iters = 100,
            quasi_hyperbolic = True,
            callback=None,
            **kwargs):
        
        opts = {'ftol': ftol,
                'gtol': gtol, # as default of LBFGSB
                'xtol': xtol,
                'maxiter': maxiter,
                'eps': kwargs.get('eps', 1E-8), # as default of LBFGSB
                'maxfun': kwargs.get('maxfun', 1E10),
                'verbose' : kwargs.get('verbose', 0),
                'disp':  bool(kwargs.get('verbose', 0)),
                'iprint': kwargs.get('verbose', 0)
                }

        # if rho is provided, user rho is used, otherwise is computed at every beta
        if rho is None:
            rho = compute_vonneuman_density(L=self.L, beta=beta)
        
        #logging.basicConfig(stream=sys.stderr, level=logging.INFO)
        #import logging
        #adam_logger = logging.getLogger('ADAM')
        #adam_logger.setLevel(logging.INFO)
        #all_dkl = []
        #from scipy.stats import linregress

        for t in range(1,int(maxiter)+1):
            self.sol.nit += 1
            # get the relative entropy value and its gradient w.r.t. variables
            dkl, grad_t = self.gradient(self.sol.x, rho, beta, batch_size=batch_size)
            print('\rIteration %d beta=%.3f' % (t,beta),end='')
            # Convergence status
            if np.linalg.norm(grad_t) < gtol:
                self.sol.success = True
                self.sol.message = 'Exceeded minimum gradient |grad| < %g' % gtol
                break
            #all_dkl.append(dkl)
            # check the convergence based on the slope of dkl of the last 10 iterations
            #if t > last_iters:
            #    dkl_iters = np.arange(last_iters)
            #    slope, intercept, r_value, p_value, std_err = linregress(x=dkl_iters, y=all_dkl[-last_iters:])
            #    if np.abs(slope - 1E-3) < 0:
            #        converged = True
            #        print('Optimization terminated for flatness')

            self.mt = beta1 * self.mt + (1.0 - beta1) * grad_t
            self.vt = beta2 * self.vt + (1.0 - beta2) * (grad_t * grad_t)
            mttilde = self.mt / (1.0 - (beta1 ** t))  # compute bias corrected first moment estimate
            vttilde = self.vt / (1.0 - (beta2 ** t))  # compute bias-corrected second raw moment estimate
            if quasi_hyperbolic: # quasi hyperbolic adam
                deltax = ((1-nu1) * grad_t + nu1 * mttilde)/(np.sqrt((1-nu2) * (grad_t**2) + nu2 * vttilde ) + epsilon)
            else: # vanilla Adam
                deltax = mttilde / np.sqrt(vttilde + epsilon)
            self.sol.x -= learning_rate * deltax
            self.logfile.write( u"%g\t%g\t%g\n" % (self.sol.x,beta,dkl) )
            if t % 50:
                self.logfile.flush()
        return self.sol

    # def run(self, **kwargs):
    #     x = self.x0
    #     batch_size = kwargs.get('batch_size', 1)
    #     max_iters = kwargs.get('max_iters', 1000)
    #     # ADAM parameters
    #     eta = kwargs.get('eta', 1E-3)
    #     gtol = kwargs.get('gtol', 1E-4)
    #     xtol = kwargs.get('xtol', 1E-3)
    #     beta1 = 0.9
    #     beta2 = 0.999
    #     epsilon = 1E-8 # avoid nan with large learning rates
    #     # Use quasi-hyperbolic adam by default
    #     # https://arxiv.org/pdf/1810.06801.pdf
    #     quasi_hyperbolic = kwargs.get('quasi_hyperbolic', True)
    #     nu1 = 0.7 # for quasi-hyperbolic adam
    #     nu2 = 1.0 # for quasi-hyperbolic adam
    #     # visualization options
    #     refresh_frames = kwargs.get('refresh_frames', 100)
    #     from drawnow import drawnow, figure
    #     figure(figsize=(8,8))
    #     import matplotlib.pyplot as plt
    #     #plt.figure(figsize=(8, 8))

    #     # Populate the solution list as function of beta
    #     # the list sol contains all optimization points
    #     sol = []
    #     # Iterate over all beta provided by the user
    #     mt, vt = np.zeros(self.x0.shape), np.zeros(self.x0.shape)
    #     all_dkl = []
        
    #     # TODO implement model boundaries in Adam
    #     frames = 0
    #     filename = 'adam.log'
    #     logfile = open(filename,'w')
    #     from scipy.stats import linregress
    #     import logging
    #     import sys
    #     logging.basicConfig(stream=sys.stderr, level=logging.INFO)
    #     adam_logger = logging.getLogger('ADAM')
    #     adam_logger.setLevel(logging.INFO)

    #     for beta in self.beta_range:
    #         adam_logger.info('Changed beta to %g' % beta)
    #         # if rho is provided, user rho is used, otherwise is computed at every beta
    #         rho = kwargs.get('rho', compute_vonneuman_density(L=self.L, beta=beta))
    #         # initialize at x0
    #         x = self.x0
    #         converged = False
    #         t = 0  # t is the iteration number
    #         while not converged:
    #             t += 1
    #             # get the relative entropy value and its gradient w.r.t. variables
    #             dkl, grad_t = self.gradient(x, rho, beta, batch_size=batch_size)
    #             # Convergence status
    #             if np.linalg.norm(grad_t) < gtol:
    #                 converged = True
    #                 adam_logger.info('Exceeded minimum gradient |grad|<%g' % gtol)
                
    #             if t > max_iters:
    #                 adam_logger.info('Exceeded maximum iterations')
    #                 converged = True

    #             # check the convergence based on the slope of dkl of the last 10 iterations
    #             last_iters = 100
    #             if len(all_dkl) > last_iters:
    #                 dkl_iters = np.arange(last_iters)
    #                 slope, intercept, r_value, p_value, std_err = linregress(x=dkl_iters, y=all_dkl[-last_iters:])
    #                 if np.abs(slope - 1E-3) < 0:
    #                         converged = True
    #                         adam_logger.info('Optimization terminated for flatness')

    #             # TODO implement check boundaries in Adam
    #             # if np.any(np.ravel(self.model.bounds)):
    #             #    raise RuntimeError('variable bounds exceeded')
    #             mt = beta1 * mt + (1.0 - beta1) * grad_t
    #             vt = beta2 * vt + (1.0 - beta2) * (grad_t * grad_t)
    #             mttilde = mt / (1.0 - (beta1 ** t))  # compute bias corrected first moment estimate
    #             vttilde = vt / (1.0 - (beta2 ** t))  # compute bias-corrected second raw moment estimate
    #             if quasi_hyperbolic: # quasi hyperbolic adam
    #                 deltax = ((1-nu1) * grad_t + nu1 * mttilde)/(np.sqrt((1-nu2) * (grad_t**2) + nu2 * vttilde ) + epsilon)
    #             else: # vanilla Adam
    #                 deltax = mttilde / np.sqrt(vttilde + epsilon)
    #             x -= eta * deltax
    #             #all_dkl.append(dkl)
    #             #print(np.hstack([t, x,beta,dkl]))
    #             #logfile.write( u"%g\t%g\t%g\n" % (x,beta,dkl) )
    #             # if t % refresh_frames == 0:
    #             #     frames += 1
    #             #     def draw_fig():
    #             #         plot_beta_range = np.logspace(-3,3,100)
    #             #         sol.append({'x': x.copy()})
    #             #         #plt.figure(figsize=(8, 8))
    #             #         A0 = np.mean(self.model.sample_adjacency(theta=x, batch_size=batch_size), axis=0)
    #             #         plt.subplot(2, 2, 1)
    #             #         im = plt.imshow(self.A)
    #             #         plt.colorbar(im)
    #             #         plt.title('Data')
    #             #         plt.subplot(2, 2, 2)
    #             #         im = plt.imshow(A0)
    #             #         plt.colorbar(im)
    #             #         plt.title('<Model>')
    #             #         plt.subplot(2, 2, 3)
    #             #         plt.plot(all_dkl)
    #             #         plt.xlabel('iteration')
    #             #         plt.ylabel('$S(\\boldsymbol \\rho,\\boldsymbol \\sigma)$')
    #             #         plt.subplot(2, 2, 4)
    #             #         plt.semilogx(plot_beta_range, batch_compute_vonneumann_entropy(self.L, plot_beta_range), '.-', label='data')
    #             #         plt.semilogx(plot_beta_range, batch_compute_vonneumann_entropy(graph_laplacian(A0), plot_beta_range), '.-', label='model')
    #             #         plt.plot(beta, batch_compute_vonneumann_entropy(graph_laplacian(A0), [beta]), 'ko', label='model')
    #             #         plt.xlabel('$\\beta$')
    #             #         plt.ylabel('$S$')
    #             #         plt.title('Entropy')
    #             #         plt.legend(loc='best')
    #             #         plt.suptitle('$\\beta=$' + '{0:0>3}'.format(beta))
    #             #         #plt.tight_layout()
    #             #     drawnow(draw_fig)
    #     self.sol = sol
    #     adam_logger.info('Optimization finished\n' + str(self.sol))
    #     return sol