import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
Dummy Variables and Multi-collinearity
ML
= np.array([1, 2, 3])
x1 = 2*x1
x2
= np.array([4, 6, 8]) y
= np.ones(x1.shape[0])
all_ones = np.array([all_ones, x1, x2]).T X
X.shape
(3, 3)
X
array([[1., 1., 2.],
[1., 2., 4.],
[1., 3., 6.]])
def solve_normal_equation(X, y):
try:
= np.linalg.inv(X.T @ X) @ X.T @ y
theta return theta
except np.linalg.LinAlgError:
print('The matrix is singular')
print("X.T @ X = \n", X.T @ X)
return None
### Assignment question: Use np.linalg.solve instead of inv. Why is this better?
solve_normal_equation(X, y)
The matrix is singular
X.T @ X =
[[ 3. 6. 12.]
[ 6. 14. 28.]
[12. 28. 56.]]
@ X) np.linalg.matrix_rank(X), np.linalg.matrix_rank(X.T
(2, 2)
from sklearn.linear_model import LinearRegression
= LinearRegression()
lr
= np.array([x1, x2]).T
data
lr.fit(data, y)
lr.coef_, lr.intercept_
# Assignment question: figure why sklearn is able to solve the problem
(array([0.4, 0.8]), 2.0)
# Regularization
= 1e-5
eps = np.array([all_ones, x1, x2]).T
X = np.eye(3)*eps + X
X X
array([[1.00001, 1. , 2. ],
[1. , 2.00001, 4. ],
[1. , 3. , 6.00001]])
np.linalg.matrix_rank(X)
3
solve_normal_equation(X, y)
array([2.00023248, 1.19987743, 0.40001887])
# Drop variables
= np.array([all_ones, x1]).T
X print(X)
[[1. 1.]
[1. 2.]
[1. 3.]]
solve_normal_equation(X, y)
array([2., 2.])
# Dummy variables
## dataset
= 12
num_records = np.random.randint(0, 10, num_records)
windspeed = np.random.randint(100, 500, num_records)
vehicles = np.random.choice(['N', 'S', 'E', 'W'], num_records)
direction = np.random.randint(0, 100, num_records)
pollution
= pd.DataFrame({'windspeed': windspeed, 'vehicles': vehicles, 'direction': direction, 'pollution': pollution})
df df
windspeed | vehicles | direction | pollution | |
---|---|---|---|---|
0 | 0 | 355 | W | 30 |
1 | 2 | 367 | S | 30 |
2 | 2 | 447 | S | 78 |
3 | 1 | 223 | E | 32 |
4 | 1 | 272 | S | 16 |
5 | 9 | 394 | S | 36 |
6 | 0 | 333 | N | 45 |
7 | 3 | 308 | W | 52 |
8 | 7 | 480 | N | 24 |
9 | 9 | 360 | N | 74 |
10 | 0 | 125 | S | 36 |
11 | 9 | 401 | S | 62 |
def fit_data(df, X, y):
try:
= LinearRegression()
lr
lr.fit(X, y)= f"y = {lr.intercept_:0.2f}"
rep for i, coef in enumerate(lr.coef_):
+= f" + {coef:0.2f}*{df.columns[i]}"
rep return rep
except Exception as e:
print(e)
return None
-1]], df['pollution']) fit_data(df, df[df.columns[:
could not convert string to float: 'W'
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
= OrdinalEncoder() enc
= df.copy()
df2 'direction'] = enc.fit_transform(df[['direction']]).flatten()
df2[ df2
windspeed | vehicles | direction | pollution | |
---|---|---|---|---|
0 | 0 | 355 | 3.0 | 30 |
1 | 2 | 367 | 2.0 | 30 |
2 | 2 | 447 | 2.0 | 78 |
3 | 1 | 223 | 0.0 | 32 |
4 | 1 | 272 | 2.0 | 16 |
5 | 9 | 394 | 2.0 | 36 |
6 | 0 | 333 | 1.0 | 45 |
7 | 3 | 308 | 3.0 | 52 |
8 | 7 | 480 | 1.0 | 24 |
9 | 9 | 360 | 1.0 | 74 |
10 | 0 | 125 | 2.0 | 36 |
11 | 9 | 401 | 2.0 | 62 |
-1]], df2['pollution']) fit_data(df2, df2[df2.columns[:
'y = 26.49 + 1.49*windspeed + 0.03*vehicles + 1.02*direction'
for i, x in enumerate(enc.categories_[0])}) pd.Series({x: i
E 0
N 1
S 2
W 3
dtype: int64
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
= OneHotEncoder(sparse_output=False) ohe
= ohe.fit_transform(df[['direction']])
direction_ohe direction_ohe
array([[0., 0., 0., 1.],
[0., 0., 1., 0.],
[0., 0., 1., 0.],
[1., 0., 0., 0.],
[0., 0., 1., 0.],
[0., 0., 1., 0.],
[0., 1., 0., 0.],
[0., 0., 0., 1.],
[0., 1., 0., 0.],
[0., 1., 0., 0.],
[0., 0., 1., 0.],
[0., 0., 1., 0.]])
= [f"Is it {x}?" for x in enc.categories_[0]] col_names_ohe
= pd.DataFrame(direction_ohe, columns=col_names_ohe)
direction_ohe_df direction_ohe_df
Is it E? | Is it N? | Is it S? | Is it W? | |
---|---|---|---|---|
0 | 0.0 | 0.0 | 0.0 | 1.0 |
1 | 0.0 | 0.0 | 1.0 | 0.0 |
2 | 0.0 | 0.0 | 1.0 | 0.0 |
3 | 1.0 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 0.0 | 1.0 | 0.0 |
5 | 0.0 | 0.0 | 1.0 | 0.0 |
6 | 0.0 | 1.0 | 0.0 | 0.0 |
7 | 0.0 | 0.0 | 0.0 | 1.0 |
8 | 0.0 | 1.0 | 0.0 | 0.0 |
9 | 0.0 | 1.0 | 0.0 | 0.0 |
10 | 0.0 | 0.0 | 1.0 | 0.0 |
11 | 0.0 | 0.0 | 1.0 | 0.0 |
# Confirm that we can write Is it W? as a linear combination of the other columns
1-direction_ohe_df[["Is it N?", "Is it S?", "Is it E?"]].sum(axis=1) - direction_ohe_df["Is it W?"]
0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
9 0.0
10 0.0
11 0.0
dtype: float64
= np.hstack([df[['windspeed', 'vehicles']].values, direction_ohe]) X
X
array([[ 0., 355., 0., 0., 0., 1.],
[ 2., 367., 0., 0., 1., 0.],
[ 2., 447., 0., 0., 1., 0.],
[ 1., 223., 1., 0., 0., 0.],
[ 1., 272., 0., 0., 1., 0.],
[ 9., 394., 0., 0., 1., 0.],
[ 0., 333., 0., 1., 0., 0.],
[ 3., 308., 0., 0., 0., 1.],
[ 7., 480., 0., 1., 0., 0.],
[ 9., 360., 0., 1., 0., 0.],
[ 0., 125., 0., 0., 1., 0.],
[ 9., 401., 0., 0., 1., 0.]])
= np.hstack([np.ones((X.shape[0], 1)), X]) X_aug
X_aug
array([[ 1., 0., 355., 0., 0., 0., 1.],
[ 1., 2., 367., 0., 0., 1., 0.],
[ 1., 2., 447., 0., 0., 1., 0.],
[ 1., 1., 223., 1., 0., 0., 0.],
[ 1., 1., 272., 0., 0., 1., 0.],
[ 1., 9., 394., 0., 0., 1., 0.],
[ 1., 0., 333., 0., 1., 0., 0.],
[ 1., 3., 308., 0., 0., 0., 1.],
[ 1., 7., 480., 0., 1., 0., 0.],
[ 1., 9., 360., 0., 1., 0., 0.],
[ 1., 0., 125., 0., 0., 1., 0.],
[ 1., 9., 401., 0., 0., 1., 0.]])
X_aug.shape
(12, 7)
@ X_aug), (X_aug.T @ X_aug).shape np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T
(6, 6, (7, 7))
@ X_aug) pd.DataFrame(X_aug.T
0 | 1 | 2 | 3 | 4 | 5 | 6 | |
---|---|---|---|---|---|---|---|
0 | 12.0 | 43.0 | 4065.0 | 1.0 | 3.0 | 6.0 | 2.0 |
1 | 43.0 | 311.0 | 16802.0 | 1.0 | 16.0 | 23.0 | 3.0 |
2 | 4065.0 | 16802.0 | 1481651.0 | 223.0 | 1173.0 | 2006.0 | 663.0 |
3 | 1.0 | 1.0 | 223.0 | 1.0 | 0.0 | 0.0 | 0.0 |
4 | 3.0 | 16.0 | 1173.0 | 0.0 | 3.0 | 0.0 | 0.0 |
5 | 6.0 | 23.0 | 2006.0 | 0.0 | 0.0 | 6.0 | 0.0 |
6 | 2.0 | 3.0 | 663.0 | 0.0 | 0.0 | 0.0 | 2.0 |
= OneHotEncoder(sparse_output=False, drop='first')
ohe 'direction']]) ohe.fit_transform(df[[
array([[0., 0., 1.],
[0., 1., 0.],
[0., 1., 0.],
[0., 0., 0.],
[0., 1., 0.],
[0., 1., 0.],
[1., 0., 0.],
[0., 0., 1.],
[1., 0., 0.],
[1., 0., 0.],
[0., 1., 0.],
[0., 1., 0.]])
= ohe.fit_transform(df[['direction']])
direction_ohe_n_1 = [f"Is it {x}?" for x in enc.categories_[0][1:]]
col_names_ohe_n_1 = pd.DataFrame(direction_ohe_n_1, columns=col_names_ohe_n_1)
df_ohe_n_1 df_ohe_n_1
Is it N? | Is it S? | Is it W? | |
---|---|---|---|
0 | 0.0 | 0.0 | 1.0 |
1 | 0.0 | 1.0 | 0.0 |
2 | 0.0 | 1.0 | 0.0 |
3 | 0.0 | 0.0 | 0.0 |
4 | 0.0 | 1.0 | 0.0 |
5 | 0.0 | 1.0 | 0.0 |
6 | 1.0 | 0.0 | 0.0 |
7 | 0.0 | 0.0 | 1.0 |
8 | 1.0 | 0.0 | 0.0 |
9 | 1.0 | 0.0 | 0.0 |
10 | 0.0 | 1.0 | 0.0 |
11 | 0.0 | 1.0 | 0.0 |
= np.hstack([df[['windspeed', 'vehicles']].values, df_ohe_n_1.values])
X = np.hstack([np.ones((X.shape[0], 1)), X])
X_aug
X_aug
array([[ 1., 0., 355., 0., 0., 1.],
[ 1., 2., 367., 0., 1., 0.],
[ 1., 2., 447., 0., 1., 0.],
[ 1., 1., 223., 0., 0., 0.],
[ 1., 1., 272., 0., 1., 0.],
[ 1., 9., 394., 0., 1., 0.],
[ 1., 0., 333., 1., 0., 0.],
[ 1., 3., 308., 0., 0., 1.],
[ 1., 7., 480., 1., 0., 0.],
[ 1., 9., 360., 1., 0., 0.],
[ 1., 0., 125., 0., 1., 0.],
[ 1., 9., 401., 0., 1., 0.]])
@ X_aug), (X_aug.T @ X_aug).shape np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T
(6, 6, (6, 6))
# Interepeting dummy variables
## dataset
= np.array(['F', 'F', 'F', 'M', 'M'])
X = np.array([5, 5.2, 5.4, 5.8, 6]) y
from sklearn.preprocessing import LabelBinarizer
= LabelBinarizer()
l l.fit_transform(X)
array([[0],
[0],
[0],
[1],
[1]])
= 1 - l.fit_transform(X) X_binary
X_binary
array([[1],
[1],
[1],
[0],
[0]])
= LinearRegression()
lr lr.fit(X_binary, y)
LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
lr.coef_, lr.intercept_
(array([-0.7]), 5.8999999999999995)
==0).flatten()].mean() y[(X_binary
5.9
==1).flatten()].mean() y[(X_binary
5.2