In this post, I’ll be using Adagrad for solving linear regression. As usual, the purpose of this post is educational. This link gives a good overview of Adagrad along with other variants of Gradient Descent. To summarise from the link:
It adapts the learning rate to the parameters, performing larger updates for infrequent and smaller updates for frequent parameters. For this reason, it is well-suited for dealing with sparse data.
As I’d done previously, I’ll be using Autograd to compute the gradients. Please note: that is Autograd, not Adagrad!
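For anyone new to Autograd, here is a minimal sketch of the one call this post relies on: `grad` takes a Python function and returns another function that computes its gradient. (The function `f` below is only an illustrative example, not part of the regression problem.)
from autograd import grad
import autograd.numpy as np

def f(x):
    return np.sin(x) ** 2   # any differentiable function built from autograd.numpy ops

df = grad(f)                # df(x) computes the derivative of f at x
print(df(0.5))              # close to 2*np.sin(0.5)*np.cos(0.5)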
Formulation (borrowed from [here](http://ruder.io/optimizing-gradient-descent/))
In regular gradient descent, we would update the \(i^{th}\) parameter in the \((t+1)^{th}\) iteration as follows, given the learning rate \(\eta\), where \(g_{t, i}\) denotes the gradient of the cost w.r.t. the \(i^{th}\) parameter at time step \(t\).
\[ \theta_{t+1, i} = \theta_{t, i} - \eta \cdot g_{t, i} \tag{Eq 1} \]
In Adagrad, we update as follows:
\[\theta_{t+1, i} = \theta_{t, i} - \dfrac{\eta}{\sqrt{G_{t, ii} + \epsilon}} \cdot g_{t, i} \tag{Eq 2}\]
Here,
\(G_{t} \in \mathbb{R}^{d \times d}\) is a diagonal matrix where each diagonal element \(i, i\) is the sum of the squares of the gradients w.r.t. \(\theta_i\) up to time step \(t\) , while \(\epsilon\) is a smoothing term that avoids division by zero (usually on the order of 1e−8).
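Before getting to the full implementation below, here is a minimal, self-contained sketch of a single Adagrad step for a parameter vector, following Eq 2. The parameter and gradient values here are made up purely for illustration.
import numpy as np

eta, eps = 0.01, 1e-8
theta = np.array([0.5, -1.0])        # hypothetical parameters
G_diag = np.zeros_like(theta)        # running sum of squared gradients (the diagonal of G_t)

g = np.array([0.2, -0.05])           # hypothetical gradient g_t at this step
G_diag += g ** 2                     # accumulate squared gradients per parameter
theta -= eta / np.sqrt(G_diag + eps) * g   # per-parameter adapted update (Eq 2)
print(theta)                         # on the first step each parameter moves by roughly eta in magnitude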
Customary imports
import autograd.numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
True model
\[Y = 10 X + 6\]
Generating data
np.random.seed(0)
n_samples = 50
X = np.linspace(1, 50, n_samples)
Y = 10*X + 6 + 2*np.random.randn(n_samples)
plt.plot(X, Y, 'k.')
plt.xlabel("X"); plt.ylabel("Y")
Model to be learnt
We want to learn `W` and `b` such that:
\[Y = W X + b\]
Defining the cost function
We will now write a general cost function that accepts a list of parameters.
def cost(param_list):
    w, b = param_list
    pred = w*X + b
    return np.sqrt(((pred - Y) ** 2).mean(axis=None))/(2*len(Y))
Dry run of the cost and gradient functions
# Cost of w=0, b=0
w, b = 0., 0.
print("Cost at w={}, b={} is: {}".format(w, b, cost([w, b])))
# Cost of w=10, b=4. Should be lower than w=0, b=0
w, b = 10., 4.
print("Cost at w={}, b={} is: {}".format(w, b, cost([w, b])))
# Computing the gradient at w=0, b=0
from autograd import grad
grad_cost = grad(cost)
w, b = 0., 0.
print("Gradient at w={}, b={} is: {}".format(w, b, grad_cost([w, b])))
# Computing the gradient at w=10, b=4. We would expect it to be smaller than at 0, 0
w, b = 10., 4.
print("Gradient at w={}, b={} is: {}".format(w, b, grad_cost([w, b])))
Cost at w=0.0, b=0.0 is: 2.98090446495
Cost at w=10.0, b=4.0 is: 0.0320479471939
Gradient at w=0.0, b=0.0 is: [array(-0.29297046699711365), array(-0.008765162440358071)]
Gradient at w=10.0, b=4.0 is: [array(-0.14406455246023858), array(-0.007117830452061141)]
Adagrad algorithm (applied on the whole data batch)
def adagrad_gd(param_init, cost, niter=5, lr=1e-2, eps=1e-8, random_seed=0):
    """
    param_init: List of initial values of parameters
    cost: cost function
    niter: Number of iterations to run
    lr: Learning rate
    eps: Fudge factor, to avoid division by zero
    """
    from copy import deepcopy
    # Fixing the random_seed
    np.random.seed(random_seed)
    # Function to compute the gradient of the cost function
    grad_cost = grad(cost)
    params = deepcopy(param_init)
    param_array, grad_array, lr_array, cost_array = [params], [], [[lr for _ in params]], [cost(params)]
    # Initialising sum of squares of gradients for each param as 0
    sum_squares_gradients = [np.zeros_like(param) for param in params]
    for i in range(niter):
        gradients = grad_cost(params)
        # At each iteration, we add the square of the gradients to `sum_squares_gradients`
        sum_squares_gradients = [eps + sum_prev + np.square(g) for sum_prev, g in zip(sum_squares_gradients, gradients)]
        # Adapted learning rate for each parameter
        lrs = [np.divide(lr, np.sqrt(sg)) for sg in sum_squares_gradients]
        # Parameter update
        params = [param - (adapted_lr * grad_param) for param, adapted_lr, grad_param in zip(params, lrs, gradients)]
        param_array.append(params)
        lr_array.append(lrs)
        grad_array.append(gradients)
        cost_array.append(cost(params))
    return params, param_array, grad_array, lr_array, cost_array
Experiment time!
Evolution of learning rates for `W` and `b`
Let us see how the learning rates for `W` and `b` evolve over time. I will fix the initial learning rate to 0.01, as most of the Adagrad literature out there seems to suggest.
# Fixing the random seed for reproducible init params for `W` and `b`
np.random.seed(0)
param_init = [np.random.randn(), np.random.randn()]
lr = 0.01
eps = 1e-8
niter = 1000
ada_params, ada_param_array, ada_grad_array, ada_lr_array, ada_cost_array = adagrad_gd(param_init, cost, niter=niter, lr=lr, eps=eps)
Let us first see the evolution of the cost w.r.t. time.
pd.Series(ada_cost_array, name='Cost').plot(title='Adagrad: Cost v/s # Iterations')
plt.ylabel("Cost")
plt.xlabel("# Iterations")
Ok. While there seems to be a drop in the cost, the convergence is very slow. Remember that we had earlier found:
Cost at w=10.0, b=4.0 is: 0.0320479471939
I’m fairly sure this means that our parameter estimates are still close to the initial parameters and far from the true parameters. Let’s just confirm.
print("After {} iterations, learnt `W` = {} and learnt `b` = {}".format(niter, *ada_params))
After 1000 iterations, learnt `W` = 2.38206194526 and learnt `b` = 1.01811878873
I suspect that the learning rate, courtesy of its adaptive nature, is falling very rapidly! How would vanilla gradient descent have done, starting with the same learning rate and initial values? My hunch is that it would do better. Let’s confirm!
GD vs Adagrad!
def gd(param_init, cost, niter=5, lr=0.01, random_seed=0):
    from copy import deepcopy
    np.random.seed(random_seed)
    grad_cost = grad(cost)
    params = deepcopy(param_init)
    param_array, grad_array, cost_array = [params], [], [cost(params)]
    for i in range(niter):
        gradients = grad_cost(params)
        params = [param - lr * grad_param for param, grad_param in zip(params, gradients)]
        param_array.append(params)
        grad_array.append(gradients)
        cost_array.append(cost(params))
    return params, param_array, grad_array, cost_array
# Fixing the random seed for reproducible init params for `W` and `b`
np.random.seed(0)
param_init = [np.random.randn(), np.random.randn()]
lr = 0.01
niter = 1000
gd_params, gd_param_array, gd_grad_array, gd_cost = gd(param_init, cost, niter=niter, lr=lr)
pd.Series(ada_cost_array, name='Cost').plot(label='Adagrad')
pd.Series(gd_cost, name='Cost').plot(label='GD')
plt.ylabel("Cost")
plt.xlabel("# Iterations")
plt.legend()
Ok. So, indeed, with a learning rate of 0.01, gradient descent fares better. Let’s now confirm that for Adagrad the learning rates diminish rapidly, leading to little reduction in cost!
pd.DataFrame(np.array(ada_lr_array), columns=['LR for W', 'LR for b'])[::50].plot(subplots=True, marker='o')
plt.xlabel("# Iterations")
There are a couple of interesting observations:
- The learning rate for `b` actually increases from its initial value of 0.01. Even after 1000 iterations, it remains above its initial value. This can be explained by the fact that the sum of squared gradients w.r.t. `b` is less than 1, so the denominator term by which the learning rate gets divided is less than 1, which increases the effective learning rate for `b`. This can, however, be fixed by choosing \(\epsilon=1.0\) (see the numerical sketch after this list).
- The learning rate for `W` falls very rapidly; learning for `W` would be negligible after the initial few iterations. This can be fixed by choosing a larger initial learning rate \(\eta\).
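To make the first observation concrete, here is a quick numerical sketch of the adapted learning rate \(\eta / \sqrt{G_{t, ii} + \epsilon}\), using made-up values for the accumulated sum of squared gradients (not taken from the run above): when the accumulated sum is below 1, the adapted rate exceeds the base rate of 0.01, and a larger \(\epsilon\) damps that effect.
lr = 0.01
# Hypothetical accumulated sums of squared gradients, purely illustrative
for sum_sq_grad in [0.05, 0.5, 5.0, 500.0]:
    for eps_ in [1e-8, 1.0]:
        adapted_lr = lr / np.sqrt(sum_sq_grad + eps_)
        print("sum sq grad = {:7.2f}, eps = {:g} -> adapted lr = {:.4f}".format(sum_sq_grad, eps_, adapted_lr))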
Evolution of `W` and `b` w.r.t. \(\eta\) and \(\epsilon\)
# Fixing the random seed for reproducible init params for `W` and `b`
out = {}
for lr in [0.01, 0.1, 1, 10]:
    out[lr] = {}
    for eps in [1e-8, 1e-1, 1]:
        print(lr, eps)
        np.random.seed(0)
        param_init = [np.random.randn(), np.random.randn()]
        niter = 10000
        ada_params, ada_param_array, ada_grad_array, ada_lr_array, ada_cost_array = adagrad_gd(param_init,
                                                                                               cost,
                                                                                               niter=niter,
                                                                                               lr=lr,
                                                                                               eps=eps)
        out[lr][eps] = {'Final-params': ada_params,
                        'Param-array': ada_param_array,
                        'Cost-array': ada_cost_array}
(0.01, 1e-08)
(0.01, 0.1)
(0.01, 1)
(0.1, 1e-08)
(0.1, 0.1)
(0.1, 1)
(1, 1e-08)
(1, 0.1)
(1, 1)
(10, 1e-08)
(10, 0.1)
(10, 1)
Plotting cost v/s # Iterations
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=True, figsize=(8, 6), sharey=True)
for row, eps in enumerate([1e-8, 1e-1, 1]):
    for column, lr in enumerate([0.01, 0.1, 1, 10]):
        pd.Series(out[lr][eps]['Cost-array']).plot(ax=ax[row, column])
        ax[0, column].set_title("Eta={}".format(lr))
        ax[row, 0].set_ylabel("Eps={}".format(eps))
fig.text(0.5, 0.0, '# Iterations')
plt.suptitle("Cost v/s # Iterations");
It seems that with \(\eta=1\) or above, the cost usually converges quickly. This is different from most of the literature, which recommends \(\eta=0.01\). Aside: I confirmed that even using Tensorflow with the Adagrad optimizer on the same dataset, the optimal learning rates are similar to the ones we found here!
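A cross-check along those lines could look roughly like the sketch below, written against the TensorFlow 1.x API (`tf.train.AdagradOptimizer`). This is only an illustrative reconstruction of such a comparison under that assumption, not the exact code behind the aside.
import numpy as np
import tensorflow as tf   # assumes TensorFlow 1.x

np.random.seed(0)
X_tf = np.linspace(1, 50, 50).astype(np.float32)
Y_tf = (10 * X_tf + 6 + 2 * np.random.randn(50)).astype(np.float32)

W = tf.Variable(0.0)
b = tf.Variable(0.0)
loss = tf.reduce_mean(tf.square(W * X_tf + b - Y_tf))
train_op = tf.train.AdagradOptimizer(learning_rate=1.0).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(1000):
        sess.run(train_op)
    print(sess.run([W, b]))   # should head towards the true W=10, b=6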
`W` v/s # Iterations
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=True, figsize=(8, 6), sharey=True)
for row, eps in enumerate([1e-8, 1e-1, 1]):
    for column, lr in enumerate([0.01, 0.1, 1, 10]):
        pd.DataFrame(out[lr][eps]['Param-array'])[0].plot(ax=ax[row, column])
        ax[0, column].set_title("Eta={}".format(lr))
        ax[row, 0].set_ylabel("Eps={}".format(eps))
fig.text(0.5, 0.0, '# Iterations')
plt.suptitle("W v/s # Iterations");
`b` v/s # Iterations
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=True, figsize=(8, 6), sharey=True)
for row, eps in enumerate([1e-8, 1e-1, 1]):
    for column, lr in enumerate([0.01, 0.1, 1, 10]):
        pd.DataFrame(out[lr][eps]['Param-array'])[1].plot(ax=ax[row, column])
        ax[0, column].set_title("Eta={}".format(lr))
        ax[row, 0].set_ylabel("Eps={}".format(eps))
fig.text(0.5, 0.0, '# Iterations')
plt.suptitle("b v/s # Iterations");
Across the above two plots, we can see that at high \(\eta\), there are oscillations! In general, \(\eta=1\) and \(\epsilon=10^{-8}\) seem to give the best set of results.
Visualising the model learning
from matplotlib.animation import FuncAnimation
fig, ax = plt.subplots(nrows=3, ncols=4, sharex=True, figsize=(8, 6), sharey=True)

def update(i):
    for row, eps in enumerate([1e-8, 1e-1, 1]):
        for column, lr in enumerate([0.01, 0.1, 1, 10]):
            params_i = out[lr][eps]['Param-array'][i]
            ax[row, column].cla()
            w_i, b_i = params_i
            ax[row, column].plot(X, Y, 'k.', ms=1)
            ax[row, column].plot(X, w_i*X + b_i, 'r')
            # https://stackoverflow.com/questions/12998430/remove-xticks-in-a-matplotlib-plot
            ax[row, column].tick_params(
                axis='both',
                which='both',
                bottom='off',
                left='off',
                top='off',
                labelbottom='off',
                labelleft='off')
            ax[0, column].set_title("Eta={}".format(lr))
            ax[row, 0].set_ylabel("Eps={}".format(eps))
    fig.suptitle("Iteration number: {}".format(i))

anim = FuncAnimation(fig, update, frames=np.arange(0, 5000, 200), interval=500)
anim.save('adagrad.gif', dpi=80, writer='imagemagick')
plt.close()
So, there you go. Implementing Adagrad and running this experiment was a lot of fun and a great learning exercise. Feel free to comment!