Dummy Variables and Multi-collinearity

ML

Author: Nipun Batra
Published: January 27, 2024

import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import numpy as np
x1 = np.array([1, 2, 3])
x2 = 2*x1

y = np.array([4, 6, 8])
all_ones = np.ones(x1.shape[0])
X = np.array([all_ones, x1, x2]).T
X.shape
(3, 3)
X
array([[1., 1., 2.],
       [1., 2., 4.],
       [1., 3., 6.]])
def solve_normal_equation(X, y):
    # Closed-form least squares via the normal equations:
    # theta = (X^T X)^{-1} X^T y.
    # If X^T X is singular (e.g. because of multi-collinearity in X),
    # the inversion raises LinAlgError.
    try:
        theta = np.linalg.inv(X.T @ X) @ X.T @ y
        return theta
    except np.linalg.LinAlgError:
        print('The matrix is singular')
        print("X.T @ X = \n", X.T @ X)
        return None
    
### Assignment question: Use np.linalg.solve instead of inv. Why is this better?
solve_normal_equation(X, y)
The matrix is singular
X.T @ X = 
 [[ 3.  6. 12.]
 [ 6. 14. 28.]
 [12. 28. 56.]]
np.linalg.matrix_rank(X), np.linalg.matrix_rank(X.T @ X)
(2, 2)
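The rank is 2 rather than 3 because x2 was defined as 2*x1, so the columns of X are exactly linearly dependent; a quick check:

# The third column of X is exactly twice the second column, so X (and hence
# X.T @ X) cannot have full column rank.
print(np.allclose(X[:, 2], 2 * X[:, 1]))  # expected: True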
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

data = np.array([x1, x2]).T

lr.fit(data, y)
lr.coef_, lr.intercept_


# Assignment question: figure why sklearn is able to solve the problem
(array([0.4, 0.8]), 2.0)
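Whichever coefficients sklearn returns, they still reproduce y exactly; a quick check using the lr and data objects fitted above:

# 2.0 + 0.4*x1 + 0.8*x2 equals [4, 6, 8] on every row, so the fit is exact
# even though the columns of the design matrix are collinear.
print(np.allclose(lr.predict(data), y))  # expected: True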
# Regularization

# Perturb the diagonal of X slightly so its columns are no longer exactly linearly dependent.
eps = 1e-5
X = np.array([all_ones, x1, x2]).T
X = np.eye(3)*eps + X
X
array([[1.00001, 1.     , 2.     ],
       [1.     , 2.00001, 4.     ],
       [1.     , 3.     , 6.00001]])
np.linalg.matrix_rank(X)
3
solve_normal_equation(X, y)
array([2.00023248, 1.19987743, 0.40001887])
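The same idea is more commonly applied to X.T @ X directly: ridge regression adds a small multiple of the identity to X.T @ X before inverting. A minimal sketch on the original, unperturbed design matrix (X_orig and lam are names introduced just for this sketch):

# Ridge-style fix: theta = (X^T X + lam*I)^{-1} X^T y.
# Adding lam*I to the (positive semi-definite) matrix X^T X makes it invertible
# even when X itself is rank-deficient.
lam = 1e-5
X_orig = np.array([all_ones, x1, x2]).T
theta_ridge = np.linalg.inv(X_orig.T @ X_orig + lam * np.eye(X_orig.shape[1])) @ X_orig.T @ y
print(theta_ridge)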
# Drop variables
X = np.array([all_ones, x1]).T
print(X)
[[1. 1.]
 [1. 2.]
 [1. 3.]]
solve_normal_equation(X, y)
array([2., 2.])
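After dropping the redundant column x2, the solution theta = [2, 2] fits all three points exactly; a quick check:

# Predictions with theta = [2, 2]: 2 + 2*x1 = [4, 6, 8], which equals y.
print(np.allclose(X @ np.array([2., 2.]), y))  # expected: True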
# Dummy variables

## dataset
# No random seed is fixed, so the exact values below will differ across runs.
num_records = 12
windspeed = np.random.randint(0, 10, num_records)
vehicles = np.random.randint(100, 500, num_records)
direction = np.random.choice(['N', 'S', 'E', 'W'], num_records)
pollution = np.random.randint(0, 100, num_records)

df = pd.DataFrame({'windspeed': windspeed, 'vehicles': vehicles, 'direction': direction, 'pollution': pollution})
df
    windspeed  vehicles direction  pollution
0           0       355         W         30
1           2       367         S         30
2           2       447         S         78
3           1       223         E         32
4           1       272         S         16
5           9       394         S         36
6           0       333         N         45
7           3       308         W         52
8           7       480         N         24
9           9       360         N         74
10          0       125         S         36
11          9       401         S         62
def fit_data(df, X, y):
    # Fit a linear regression of y on X and return the fitted equation as a string.
    # Assumes the columns of X appear in the same order as the leading columns of df.
    try:
        lr = LinearRegression()
        lr.fit(X, y)
        rep = f"y = {lr.intercept_:0.2f}"
        for i, coef in enumerate(lr.coef_):
            rep += f" + {coef:0.2f}*{df.columns[i]}"
        return rep
    except Exception as e:
        print(e)
        return None
        
fit_data(df, df[df.columns[:-1]], df['pollution'])
could not convert string to float: 'W'
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
df2 = df.copy()
df2['direction'] = enc.fit_transform(df[['direction']]).flatten()
df2
    windspeed  vehicles  direction  pollution
0           0       355        3.0         30
1           2       367        2.0         30
2           2       447        2.0         78
3           1       223        0.0         32
4           1       272        2.0         16
5           9       394        2.0         36
6           0       333        1.0         45
7           3       308        3.0         52
8           7       480        1.0         24
9           9       360        1.0         74
10          0       125        2.0         36
11          9       401        2.0         62
fit_data(df2, df2[df2.columns[:-1]], df2['pollution'])
'y = 26.49 + 1.49*windspeed + 0.03*vehicles + 1.02*direction'
pd.Series({x: i for i, x in enumerate(enc.categories_[0])})
E    0
N    1
S    2
W    3
dtype: int64
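Ordinal encoding imposes an arbitrary order (E=0 < N=1 < S=2 < W=3) on an unordered category, so the single direction coefficient forces equally spaced effects per step. A minimal sketch of the consequence (lr_ord is a name introduced just for this sketch):

# Refit on the ordinally encoded features to inspect the direction coefficient.
# The model is forced to predict the same per-step change from E -> N -> S -> W,
# and a W-vs-E gap of exactly three such steps, dictated by the codes rather than the data.
lr_ord = LinearRegression().fit(df2[df2.columns[:-1]], df2['pollution'])
print(lr_ord.coef_[2], 3 * lr_ord.coef_[2])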
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output=False)
direction_ohe = ohe.fit_transform(df[['direction']])
direction_ohe
array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 1.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])
col_names_ohe = [f"Is it {x}?" for x in ohe.categories_[0]]  # category names from the fitted one-hot encoder
direction_ohe_df = pd.DataFrame(direction_ohe, columns=col_names_ohe)
direction_ohe_df
    Is it E?  Is it N?  Is it S?  Is it W?
0        0.0       0.0       0.0       1.0
1        0.0       0.0       1.0       0.0
2        0.0       0.0       1.0       0.0
3        1.0       0.0       0.0       0.0
4        0.0       0.0       1.0       0.0
5        0.0       0.0       1.0       0.0
6        0.0       1.0       0.0       0.0
7        0.0       0.0       0.0       1.0
8        0.0       1.0       0.0       0.0
9        0.0       1.0       0.0       0.0
10       0.0       0.0       1.0       0.0
11       0.0       0.0       1.0       0.0
# Confirm that we can write Is it W? as a linear combination of the other columns
1-direction_ohe_df[["Is it N?", "Is it S?", "Is it E?"]].sum(axis=1) - direction_ohe_df["Is it W?"]
0     0.0
1     0.0
2     0.0
3     0.0
4     0.0
5     0.0
6     0.0
7     0.0
8     0.0
9     0.0
10    0.0
11    0.0
dtype: float64
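Equivalently, the four dummy columns always sum to the all-ones column used for the intercept, which is exactly the linear dependence that breaks the normal equations below; a quick check:

# Every row has exactly one 1 across the four dummy columns, so their row-wise
# sum is 1 everywhere, i.e. the intercept column.
print(np.allclose(direction_ohe.sum(axis=1), 1))  # expected: True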
X = np.hstack([df[['windspeed', 'vehicles']].values, direction_ohe])
X
array([[  0., 355.,   0.,   0.,   0.,   1.],
       [  2., 367.,   0.,   0.,   1.,   0.],
       [  2., 447.,   0.,   0.,   1.,   0.],
       [  1., 223.,   1.,   0.,   0.,   0.],
       [  1., 272.,   0.,   0.,   1.,   0.],
       [  9., 394.,   0.,   0.,   1.,   0.],
       [  0., 333.,   0.,   1.,   0.,   0.],
       [  3., 308.,   0.,   0.,   0.,   1.],
       [  7., 480.,   0.,   1.,   0.,   0.],
       [  9., 360.,   0.,   1.,   0.,   0.],
       [  0., 125.,   0.,   0.,   1.,   0.],
       [  9., 401.,   0.,   0.,   1.,   0.]])
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])
X_aug
array([[  1.,   0., 355.,   0.,   0.,   0.,   1.],
       [  1.,   2., 367.,   0.,   0.,   1.,   0.],
       [  1.,   2., 447.,   0.,   0.,   1.,   0.],
       [  1.,   1., 223.,   1.,   0.,   0.,   0.],
       [  1.,   1., 272.,   0.,   0.,   1.,   0.],
       [  1.,   9., 394.,   0.,   0.,   1.,   0.],
       [  1.,   0., 333.,   0.,   1.,   0.,   0.],
       [  1.,   3., 308.,   0.,   0.,   0.,   1.],
       [  1.,   7., 480.,   0.,   1.,   0.,   0.],
       [  1.,   9., 360.,   0.,   1.,   0.,   0.],
       [  1.,   0., 125.,   0.,   0.,   1.,   0.],
       [  1.,   9., 401.,   0.,   0.,   1.,   0.]])
X_aug.shape
(12, 7)
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape
(6, 6, (7, 7))
pd.DataFrame(X_aug.T @ X_aug)
        0        1          2      3       4       5      6
0    12.0     43.0     4065.0    1.0     3.0     6.0    2.0
1    43.0    311.0    16802.0    1.0    16.0    23.0    3.0
2  4065.0  16802.0  1481651.0  223.0  1173.0  2006.0  663.0
3     1.0      1.0      223.0    1.0     0.0     0.0    0.0
4     3.0     16.0     1173.0    0.0     3.0     0.0    0.0
5     6.0     23.0     2006.0    0.0     0.0     6.0    0.0
6     2.0      3.0      663.0    0.0     0.0     0.0    2.0
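This 7x7 Gram matrix has rank 6, so it is singular and cannot be inverted. One way to see this numerically (a minimal sketch):

# The smallest singular value of X_aug.T @ X_aug is zero up to round-off,
# so the matrix is numerically singular and the normal equations fail.
gram = X_aug.T @ X_aug
svals = np.linalg.svd(gram, compute_uv=False)
print(svals.min() / svals.max())  # effectively 0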
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe.fit_transform(df[['direction']])
array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])
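With drop='first', the first category ('E') becomes the baseline and is encoded as all zeros (row 3 above). Assuming a scikit-learn version that provides get_feature_names_out, the retained columns can be listed directly:

# The encoder still remembers all four categories; only the first is dropped
# from the transformed output, leaving one column each for N, S and W.
print(ohe.categories_[0])
print(ohe.get_feature_names_out())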
direction_ohe_n_1 = ohe.fit_transform(df[['direction']])
col_names_ohe_n_1 = [f"Is it {x}?" for x in ohe.categories_[0][1:]]  # the first category ('E') is the dropped baseline
df_ohe_n_1 = pd.DataFrame(direction_ohe_n_1, columns=col_names_ohe_n_1)
df_ohe_n_1
    Is it N?  Is it S?  Is it W?
0        0.0       0.0       1.0
1        0.0       1.0       0.0
2        0.0       1.0       0.0
3        0.0       0.0       0.0
4        0.0       1.0       0.0
5        0.0       1.0       0.0
6        1.0       0.0       0.0
7        0.0       0.0       1.0
8        1.0       0.0       0.0
9        1.0       0.0       0.0
10       0.0       1.0       0.0
11       0.0       1.0       0.0
X = np.hstack([df[['windspeed', 'vehicles']].values, df_ohe_n_1.values])
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])

X_aug
array([[  1.,   0., 355.,   0.,   0.,   1.],
       [  1.,   2., 367.,   0.,   1.,   0.],
       [  1.,   2., 447.,   0.,   1.,   0.],
       [  1.,   1., 223.,   0.,   0.,   0.],
       [  1.,   1., 272.,   0.,   1.,   0.],
       [  1.,   9., 394.,   0.,   1.,   0.],
       [  1.,   0., 333.,   1.,   0.,   0.],
       [  1.,   3., 308.,   0.,   0.,   1.],
       [  1.,   7., 480.,   1.,   0.,   0.],
       [  1.,   9., 360.,   1.,   0.,   0.],
       [  1.,   0., 125.,   0.,   1.,   0.],
       [  1.,   9., 401.,   0.,   1.,   0.]])
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape
(6, 6, (6, 6))
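With only three dummy columns plus the intercept, X_aug has full column rank, so the normal equations from earlier now have a unique solution. A minimal sketch (the exact numbers depend on the randomly generated data):

# X_aug.T @ X_aug is now an invertible 6x6 matrix, so the closed-form
# least-squares solution exists and is unique.
theta = solve_normal_equation(X_aug, df['pollution'].values)
print(theta)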
# Interpreting dummy variables

## dataset

X = np.array(['F', 'F', 'F', 'M', 'M'])
y = np.array([5, 5.2, 5.4, 5.8, 6])
from sklearn.preprocessing import LabelBinarizer
l = LabelBinarizer()
l.fit_transform(X)
array([[0],
       [0],
       [0],
       [1],
       [1]])
# Flip the encoding so that 'F' maps to 1 and 'M' maps to 0
X_binary = 1 - l.fit_transform(X)
X_binary
array([[1],
       [1],
       [1],
       [0],
       [0]])
lr = LinearRegression()
lr.fit(X_binary, y)
LinearRegression()
lr.coef_, lr.intercept_
(array([-0.7]), 5.8999999999999995)
y[(X_binary==0).flatten()].mean()
5.9
y[(X_binary==1).flatten()].mean()
5.2
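So the intercept equals the mean of y for the group encoded as 0 ('M'), and the coefficient equals the difference between the two group means; a quick check (up to floating-point error):

# coefficient ~= mean(y for 'F') - mean(y for 'M') = 5.2 - 5.9 = -0.7
print(y[(X_binary == 1).flatten()].mean() - y[(X_binary == 0).flatten()].mean())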