Dummy Variables and Multi-collinearity

ML

Author: Nipun Batra
Published: January 27, 2024

import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
x1 = np.array([1, 2, 3])
x2 = 2*x1

y = np.array([4, 6, 8])
all_ones = np.ones(x1.shape[0])
X = np.array([all_ones, x1, x2]).T
X.shape
(3, 3)
X
array([[1., 1., 2.],
       [1., 2., 4.],
       [1., 3., 6.]])
def solve_normal_equation(X, y):
    # Closed-form least squares via the normal equations:
    # theta = (X^T X)^{-1} X^T y.
    # If X^T X is singular (e.g. because of multi-collinearity in X),
    # the inversion raises LinAlgError.
    try:
        theta = np.linalg.inv(X.T @ X) @ X.T @ y
        return theta
    except np.linalg.LinAlgError:
        print('The matrix is singular')
        print("X.T @ X = \n", X.T @ X)
        return None
    
### Assignment question: Use np.linalg.solve instead of inv. Why is this better?
solve_normal_equation(X, y)
The matrix is singular
X.T @ X = 
 [[ 3.  6. 12.]
 [ 6. 14. 28.]
 [12. 28. 56.]]
np.linalg.matrix_rank(X), np.linalg.matrix_rank(X.T @ X)
(2, 2)
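The rank is 2 rather than 3 because x2 was defined as 2*x1, so the columns of X are exactly linearly dependent; a quick check:

# The third column of X is exactly twice the second column, so X (and hence
# X.T @ X) cannot have full column rank.
print(np.allclose(X[:, 2], 2 * X[:, 1]))  # expected: True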
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

data = np.array([x1, x2]).T

lr.fit(data, y)
lr.coef_, lr.intercept_


# Assignment question: figure why sklearn is able to solve the problem
(array([0.4, 0.8]), 2.0)
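Whichever coefficients sklearn returns, they still reproduce y exactly; a quick check using the lr and data objects fitted above:

# 2.0 + 0.4*x1 + 0.8*x2 equals [4, 6, 8] on every row, so the fit is exact
# even though the columns of the design matrix are collinear.
print(np.allclose(lr.predict(data), y))  # expected: True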
# Regularization

# Perturb the diagonal of X slightly so its columns are no longer exactly linearly dependent.
eps = 1e-5
X = np.array([all_ones, x1, x2]).T
X = np.eye(3)*eps + X
X
array([[1.00001, 1.     , 2.     ],
       [1.     , 2.00001, 4.     ],
       [1.     , 3.     , 6.00001]])
np.linalg.matrix_rank(X)
3
solve_normal_equation(X, y)
array([2.00023248, 1.19987743, 0.40001887])
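The same idea is more commonly applied to X.T @ X directly: ridge regression adds a small multiple of the identity to X.T @ X before inverting. A minimal sketch on the original, unperturbed design matrix (X_orig and lam are names introduced just for this sketch):

# Ridge-style fix: theta = (X^T X + lam*I)^{-1} X^T y.
# Adding lam*I to the (positive semi-definite) matrix X^T X makes it invertible
# even when X itself is rank-deficient.
lam = 1e-5
X_orig = np.array([all_ones, x1, x2]).T
theta_ridge = np.linalg.inv(X_orig.T @ X_orig + lam * np.eye(X_orig.shape[1])) @ X_orig.T @ y
print(theta_ridge)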
# Drop variables
X = np.array([all_ones, x1]).T
print(X)
[[1. 1.]
 [1. 2.]
 [1. 3.]]
solve_normal_equation(X, y)
array([2., 2.])
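After dropping the redundant column x2, the solution theta = [2, 2] fits all three points exactly; a quick check:

# Predictions with theta = [2, 2]: 2 + 2*x1 = [4, 6, 8], which equals y.
print(np.allclose(X @ np.array([2., 2.]), y))  # expected: True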
# Dummy variables

## dataset
# No random seed is fixed, so the exact values below will differ across runs.
num_records = 12
windspeed = np.random.randint(0, 10, num_records)
vehicles = np.random.randint(100, 500, num_records)
direction = np.random.choice(['N', 'S', 'E', 'W'], num_records)
pollution = np.random.randint(0, 100, num_records)

df = pd.DataFrame({'windspeed': windspeed, 'vehicles': vehicles, 'direction': direction, 'pollution': pollution})
df
    windspeed  vehicles direction  pollution
0           0       355         W         30
1           2       367         S         30
2           2       447         S         78
3           1       223         E         32
4           1       272         S         16
5           9       394         S         36
6           0       333         N         45
7           3       308         W         52
8           7       480         N         24
9           9       360         N         74
10          0       125         S         36
11          9       401         S         62
def fit_data(df, X, y):
    # Fit a linear regression of y on X and return the fitted equation as a string.
    # Assumes the columns of X appear in the same order as the leading columns of df.
    try:
        lr = LinearRegression()
        lr.fit(X, y)
        rep = f"y = {lr.intercept_:0.2f}"
        for i, coef in enumerate(lr.coef_):
            rep += f" + {coef:0.2f}*{df.columns[i]}"
        return rep
    except Exception as e:
        print(e)
        return None
        
fit_data(df, df[df.columns[:-1]], df['pollution'])
could not convert string to float: 'W'
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
df2 = df.copy()
df2['direction'] = enc.fit_transform(df[['direction']]).flatten()
df2
    windspeed  vehicles  direction  pollution
0           0       355        3.0         30
1           2       367        2.0         30
2           2       447        2.0         78
3           1       223        0.0         32
4           1       272        2.0         16
5           9       394        2.0         36
6           0       333        1.0         45
7           3       308        3.0         52
8           7       480        1.0         24
9           9       360        1.0         74
10          0       125        2.0         36
11          9       401        2.0         62
fit_data(df2, df2[df2.columns[:-1]], df2['pollution'])
'y = 26.49 + 1.49*windspeed + 0.03*vehicles + 1.02*direction'
pd.Series({x: i for i, x in enumerate(enc.categories_[0])})
E    0
N    1
S    2
W    3
dtype: int64
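Ordinal encoding imposes an arbitrary order (E=0 < N=1 < S=2 < W=3) on an unordered category, so the single direction coefficient forces equally spaced effects per step. A minimal sketch of the consequence (lr_ord is a name introduced just for this sketch):

# Refit on the ordinally encoded features to inspect the direction coefficient.
# The model is forced to predict the same per-step change from E -> N -> S -> W,
# and a W-vs-E gap of exactly three such steps, dictated by the codes rather than the data.
lr_ord = LinearRegression().fit(df2[df2.columns[:-1]], df2['pollution'])
print(lr_ord.coef_[2], 3 * lr_ord.coef_[2])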
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)
direction_ohe = ohe.fit_transform(df[['direction']])
direction_ohe
array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])
col_names_ohe = [f"Is it {x}?" for x in ohe.categories_[0]]  # category names from the fitted one-hot encoder
direction_ohe_df = pd.DataFrame(direction_ohe, columns=col_names_ohe)
direction_ohe_df
    Is it E?  Is it N?  Is it S?  Is it W?
0        0.0       0.0       0.0       1.0
1        0.0       0.0       1.0       0.0
2        0.0       0.0       1.0       0.0
3        1.0       0.0       0.0       0.0
4        0.0       0.0       1.0       0.0
5        0.0       0.0       1.0       0.0
6        0.0       1.0       0.0       0.0
7        0.0       0.0       0.0       1.0
8        0.0       1.0       0.0       0.0
9        0.0       1.0       0.0       0.0
10       0.0       0.0       1.0       0.0
11       0.0       0.0       1.0       0.0
# Confirm that we can write Is it W? as a linear combination of the other columns
1-direction_ohe_df[["Is it N?", "Is it S?", "Is it E?"]].sum(axis=1) - direction_ohe_df["Is it W?"]
0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
dtype: float64
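Equivalently, the four dummy columns always sum to the all-ones column used for the intercept, which is exactly the linear dependence that breaks the normal equations below; a quick check:

# Every row has exactly one 1 across the four dummy columns, so their row-wise
# sum is 1 everywhere, i.e. the intercept column.
print(np.allclose(direction_ohe.sum(axis=1), 1))  # expected: True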
X = np.hstack([df[['windspeed', 'vehicles']].values, direction_ohe])
X
array([[  0., 355.,   0.,   0.,   0.,   1.],
       [  2., 367.,   0.,   0.,   1.,   0.],
       [  2., 447.,   0.,   0.,   1.,   0.],
       [  1., 223.,   1.,   0.,   0.,   0.],
       [  1., 272.,   0.,   0.,   1.,   0.],
       [  9., 394.,   0.,   0.,   1.,   0.],
       [  0., 333.,   0.,   1.,   0.,   0.],
       [  3., 308.,   0.,   0.,   0.,   1.],
       [  7., 480.,   0.,   1.,   0.,   0.],
       [  9., 360.,   0.,   1.,   0.,   0.],
       [  0., 125.,   0.,   0.,   1.,   0.],
       [  9., 401.,   0.,   0.,   1.,   0.]])
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])
X_aug
array([[  1.,   0., 355.,   0.,   0.,   0.,   1.],
       [  1.,   2., 367.,   0.,   0.,   1.,   0.],
       [  1.,   2., 447.,   0.,   0.,   1.,   0.],
       [  1.,   1., 223.,   1.,   0.,   0.,   0.],
       [  1.,   1., 272.,   0.,   0.,   1.,   0.],
       [  1.,   9., 394.,   0.,   0.,   1.,   0.],
       [  1.,   0., 333.,   0.,   1.,   0.,   0.],
       [  1.,   3., 308.,   0.,   0.,   0.,   1.],
       [  1.,   7., 480.,   0.,   1.,   0.,   0.],
       [  1.,   9., 360.,   0.,   1.,   0.,   0.],
       [  1.,   0., 125.,   0.,   0.,   1.,   0.],
       [  1.,   9., 401.,   0.,   0.,   1.,   0.]])
X_aug.shape
(12, 7)
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape
(6, 6, (7, 7))
pd.DataFrame(X_aug.T @ X_aug)
        0        1          2      3       4       5      6
0    12.0     43.0     4065.0    1.0     3.0     6.0    2.0
1    43.0    311.0    16802.0    1.0    16.0    23.0    3.0
2  4065.0  16802.0  1481651.0  223.0  1173.0  2006.0  663.0
3     1.0      1.0      223.0    1.0     0.0     0.0    0.0
4     3.0     16.0     1173.0    0.0     3.0     0.0    0.0
5     6.0     23.0     2006.0    0.0     0.0     6.0    0.0
6     2.0      3.0      663.0    0.0     0.0     0.0    2.0
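This 7x7 Gram matrix has rank 6, so it is singular and cannot be inverted. One way to see this numerically (a minimal sketch):

# The smallest singular value of X_aug.T @ X_aug is zero up to round-off,
# so the matrix is numerically singular and the normal equations fail.
gram = X_aug.T @ X_aug
svals = np.linalg.svd(gram, compute_uv=False)
print(svals.min() / svals.max())  # effectively 0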
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe.fit_transform(df[['direction']])
array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])
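With drop='first', the first category ('E') becomes the baseline and is encoded as all zeros (row 3 above). Assuming a scikit-learn version that provides get_feature_names_out, the retained columns can be listed directly:

# The encoder still remembers all four categories; only the first is dropped
# from the transformed output, leaving one column each for N, S and W.
print(ohe.categories_[0])
print(ohe.get_feature_names_out())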
direction_ohe_n_1 = ohe.fit_transform(df[['direction']])
col_names_ohe_n_1 = [f"Is it {x}?" for x in ohe.categories_[0][1:]]  # the first category ('E') is the dropped baseline
df_ohe_n_1 = pd.DataFrame(direction_ohe_n_1, columns=col_names_ohe_n_1)
df_ohe_n_1
    Is it N?  Is it S?  Is it W?
0        0.0       0.0       1.0
1        0.0       1.0       0.0
2        0.0       1.0       0.0
3        0.0       0.0       0.0
4        0.0       1.0       0.0
5        0.0       1.0       0.0
6        1.0       0.0       0.0
7        0.0       0.0       1.0
8        1.0       0.0       0.0
9        1.0       0.0       0.0
10       0.0       1.0       0.0
11       0.0       1.0       0.0
X = np.hstack([df[['windspeed', 'vehicles']].values, df_ohe_n_1.values])
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])

X_aug
array([[  1.,   0., 355.,   0.,   0.,   1.],
       [  1.,   2., 367.,   0.,   1.,   0.],
       [  1.,   2., 447.,   0.,   1.,   0.],
       [  1.,   1., 223.,   0.,   0.,   0.],
       [  1.,   1., 272.,   0.,   1.,   0.],
       [  1.,   9., 394.,   0.,   1.,   0.],
       [  1.,   0., 333.,   1.,   0.,   0.],
       [  1.,   3., 308.,   0.,   0.,   1.],
       [  1.,   7., 480.,   1.,   0.,   0.],
       [  1.,   9., 360.,   1.,   0.,   0.],
       [  1.,   0., 125.,   0.,   1.,   0.],
       [  1.,   9., 401.,   0.,   1.,   0.]])
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape
(6, 6, (6, 6))
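With only three dummy columns plus the intercept, X_aug has full column rank, so the normal equations from earlier now have a unique solution. A minimal sketch (the exact numbers depend on the randomly generated data):

# X_aug.T @ X_aug is now an invertible 6x6 matrix, so the closed-form
# least-squares solution exists and is unique.
theta = solve_normal_equation(X_aug, df['pollution'].values)
print(theta)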
# Interpreting dummy variables

## dataset

X = np.array(['F', 'F', 'F', 'M', 'M'])
y = np.array([5, 5.2, 5.4, 5.8, 6])
from sklearn.preprocessing import LabelBinarizer
l = LabelBinarizer()
l.fit_transform(X)
array([[0],
       [0],
       [0],
       [1],
       [1]])
# Flip the encoding so that 'F' maps to 1 and 'M' maps to 0
X_binary = 1 - l.fit_transform(X)
X_binary
array([[1],
       [1],
       [1],
       [0],
       [0]])
lr = LinearRegression()
lr.fit(X_binary, y)
LinearRegression()
lr.coef_, lr.intercept_
(array([-0.7]), 5.8999999999999995)
y[(X_binary==0).flatten()].mean()
5.9
y[(X_binary==1).flatten()].mean()
5.2
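So the intercept equals the mean of y for the group encoded as 0 ('M'), and the coefficient equals the difference between the two group means; a quick check (up to floating-point error):

# coefficient ~= mean(y for 'F') - mean(y for 'M') = 5.2 - 5.9 = -0.7
print(y[(X_binary == 1).flatten()].mean() - y[(X_binary == 0).flatten()].mean())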