import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings
'ignore')
warnings.filterwarnings(0)
np.random.seed(%matplotlib inline
We all get it - AI is the new electricity. Deep neural nets are everywhere around us. But you know what, getting labelled training data can still be a big issue in many domains. This is where active learning comes in - given that we only have a small amount of labelled data, do we randomly get labels for other samples, or can we create a smarter strategy for the same? Active learning deals with the latter.
Various strategies for active learning have been proposed in the past. In this post, I’ll work out a trivial example of what is called query by committee. The key idea is that we create a committee of learners and choose to acquire labels for the unlabelled points for which there is maximum disagreement amongst the committee.
I’d recommend that new readers go through this survey.
In this particular post, I’d be looking at active learning via query by committee, where the committee members are trained on different subsets of the train data. In a future post, I’ll write about active learning via query by committee, where the committee members are trained on the same data, but with different parameters.
Standard imports
Creating dataset
# Inputs: the integers 1..1000.
X = np.arange(1, 1001, 1)
# Targets: the line Y = 10*X + 4 plus Gaussian noise with standard deviation 400.
Y = 10*X + 4 + 400* np.random.randn(1000, )
# Scatter plot of the raw dataset; small markers keep the 1000 points legible.
plt.scatter(X, Y, s=0.1)
plt.xlabel("X")
plt.ylabel("Y")
Text(0, 0.5, 'Y')
Learning a linear regression model on the entire data
from sklearn.linear_model import LinearRegression
# Reference model: ordinary least squares fitted on the entire dataset.
clf = LinearRegression()
# scikit-learn expects a 2-D feature matrix, hence the reshape to (n_samples, 1).
clf.fit(X.reshape(-1,1), Y)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
clf.intercept_
-10.370897712972692
clf.coef_
array([9.99254389])
Visualising the fit
# Data points with the full-data regression line drawn on top.
plt.scatter(X, Y, s=0.1)
plt.xlabel("X")
plt.ylabel("Y")
plt.plot(X, clf.coef_[0]*X + clf.intercept_, color='k', label='Best fit on all data')
plt.legend()
# Annotate the fitted equation at X = 500, shifted 4000 units above the line.
plt.text(500, clf.coef_[0]*500 + clf.intercept_ +4000, "Y = {0:0.2f} X + {1:0.2f}".format(clf.coef_[0], clf.intercept_) )
Text(500, 8985.90104506115, 'Y = 9.99 X + -10.37')
Creating the initial train set, the test set and the pool
from sklearn.model_selection import train_test_split
# First split: half of the data becomes the held-out test set.
train_pool_X, test_X, train_pool_Y, test_Y = train_test_split(X, Y, test_size = 0.5)
# Second split: an integer test_size is an absolute sample count, so 495 points
# go to the unlabelled pool and the remainder forms the initial train set.
train_X, pool_X, train_Y, pool_Y = train_test_split(train_pool_X, train_pool_Y, test_size=495)
# Show the initial labelled train set.
plt.scatter(train_X, train_Y)
Creating a committee, with each member learnt on a different subset of the data
committee_size = 5

# Per-iteration bookkeeping, keyed as [iteration][committee member].
train_X_com = {0:{}}
train_Y_com = {0:{}}
models_com = {0:{}}

iteration = 0

for cur_committee in range(committee_size):
    # Each member gets half of the initial train set; using the member index as
    # random_state gives every member its own reproducible subset.
    train_X_com[iteration][cur_committee], _, train_Y_com[iteration][cur_committee], _ = train_test_split(
        train_X, train_Y, train_size=0.5,
        random_state=cur_committee)
    models_com[iteration][cur_committee] = LinearRegression()
    models_com[iteration][cur_committee].fit(train_X_com[iteration][cur_committee].reshape(-1,1), train_Y_com[iteration][cur_committee])
Plotting the fit of the committee on the entire dataset
# Draw the initial (iteration 0) fit of every committee member over the full dataset.
plt.scatter(X, Y, s=0.2)
for cur_committee in range(committee_size):
    plt.plot(X, models_com[0][cur_committee].coef_[0]*X + models_com[0][cur_committee].intercept_,
             label='Model {0}\nY = {1:0.2f} X + {2:0.2f}'.format(cur_committee,
                                                                 models_com[0][cur_committee].coef_[0],
                                                                 models_com[0][cur_committee].intercept_))
plt.legend()
Evaluate the performance on the test set
# Test-set predictions per iteration, keyed as [iteration][committee member].
estimations_com = {0:{}}
for cur_committee in range(committee_size):
    estimations_com[0][cur_committee] = models_com[0][cur_committee].predict(test_X.reshape(-1, 1))

# Committee prediction = mean over members; error = mean absolute error against test_Y.
test_mae_error = {0:(pd.DataFrame(estimations_com[0]).mean(axis=1) - test_Y).abs().mean()}
The MAE on the test set is:
test_mae_error[0]
565.8837967341798
Active learning procedure
num_iterations = 20
# Queried points in the order they were added (index 0 = point added in iteration 1).
points_added_x=[]
points_added_y=[]

print("Iteration, Cost\n")
print("-"*40)

for iteration in range(1, num_iterations):
    # For each committee: making predictions on the pool set based on model learnt in the respective train set
    estimations_pool = {cur_committee: models_com[iteration-1][cur_committee].predict(pool_X.reshape(-1, 1)) for cur_committee in range(committee_size)}
    # Finding points from the pool with highest disagreement among the committee - highest standard deviation
    in_var = pd.DataFrame(estimations_pool).std(axis=1).argmax()

    to_add_x = pool_X[in_var]
    to_add_y = pool_Y[in_var]
    points_added_x.append(to_add_x)
    points_added_y.append(to_add_y)

    # For each committee - Adding the point where the committee most disagrees
    for com in range(committee_size):
        # Create this iteration's containers the first time they are needed.
        if iteration not in train_X_com:
            train_X_com[iteration] = {}
            train_Y_com[iteration] = {}
            models_com[iteration] = {}
        train_X_com[iteration][com] = np.append(train_X_com[iteration-1][com], to_add_x)
        train_Y_com[iteration][com] = np.append(train_Y_com[iteration-1][com], to_add_y)

    # Deleting the point from the pool
    pool_X = np.delete(pool_X, in_var)
    pool_Y = np.delete(pool_Y, in_var)

    # Training on the new set for each committee
    for cur_committee in range(committee_size):
        models_com[iteration][cur_committee] = LinearRegression()
        models_com[iteration][cur_committee].fit(train_X_com[iteration][cur_committee].reshape(-1,1), train_Y_com[iteration][cur_committee])

    # Re-evaluate the committee (mean prediction) on the test set and record the MAE.
    estimations_com[iteration] = {}
    for cur_committee in range(committee_size):
        estimations_com[iteration][cur_committee] = models_com[iteration][cur_committee].predict(test_X.reshape(-1, 1))
    test_mae_error[iteration]=(pd.DataFrame(estimations_com[iteration]).mean(axis=1) - test_Y).abs().mean()
    print(iteration, (test_mae_error[iteration]))
Iteration, Cost
----------------------------------------
1 406.17664898054875
2 402.9897752715986
3 348.45182739054235
4 348.49519515039907
5 349.04197938475716
6 348.68188577804807
7 352.40882668573266
8 373.60417208279864
9 377.25044571705723
10 372.5302143045216
11 335.30243056115603
12 336.6073606660666
13 343.2867837998923
14 347.0491266373306
15 349.7464195274436
16 351.5990833631039
17 349.21957548034976
18 338.8765223206476
19 337.0132510959355
# Test-set MAE per iteration (index 0 is the committee before any query).
pd.Series(test_mae_error).plot(style='ko-')
plt.xlim((-0.5, num_iterations+0.5))
plt.ylabel("MAE on test set")
plt.xlabel("# Points Queried")
Text(0.5, 0, '# Points Queried')
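As expected, the error broadly goes down as we increase the number of points queried, although not monotonically (for example, it rises between iterations 7 and 9 before dropping again at iteration 11).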
fig, ax = plt.subplots()

import os
from matplotlib.animation import FuncAnimation
# NOTE: machine-specific ffmpeg location; adjust to the local ffmpeg binary.
plt.rcParams['animation.ffmpeg_path'] = os.path.expanduser('/Users/nipun/ffmpeg')


def update(iteration):
    """Redraw the axes for one animation frame.

    Shows the full dataset, the fit of every committee member at
    ``iteration`` and, in red, the entry of ``points_added_x`` /
    ``points_added_y`` at index ``iteration``.
    """
    ax.cla()
    ax.scatter(X, Y, s=0.2)
    ax.set_title("Iteration: {} \n MAE = {:0.2f}".format(iteration, test_mae_error[iteration]))
    for cur_committee in range(committee_size):
        ax.plot(X, models_com[iteration][cur_committee].coef_[0]*X + models_com[iteration][cur_committee].intercept_,
                label='Model {0}\nY = {1:0.2f} X + {2:0.2f}'.format(cur_committee,
                                                                    models_com[iteration][cur_committee].coef_[0],
                                                                    models_com[iteration][cur_committee].intercept_))

    ax.scatter(points_added_x[iteration], points_added_y[iteration],s=100, color='red')
    ax.legend()

    fig.tight_layout()


# One frame per index 0 .. num_iterations-2, shown for 1000 ms each.
anim = FuncAnimation(fig, update, frames=np.arange(0, num_iterations-1, 1), interval=1000)
# Close the static figure so only the rendered video is displayed.
plt.close()

from IPython.display import HTML
HTML(anim.to_html5_video())
From the animation, we can see how adding a new point to the train set (shown in red) reduces the variation in prediction amongst the different committee members.