import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as npDummy Variables and Multi-collinearity
ML
x1 = np.array([1, 2, 3])
x2 = 2*x1
y = np.array([4, 6, 8])all_ones = np.ones(x1.shape[0])
X = np.array([all_ones, x1, x2]).TX.shape(3, 3)
Xarray([[1., 1., 2.],
[1., 2., 4.],
[1., 3., 6.]])
def solve_normal_equation(X, y):
try:
theta = np.linalg.inv(X.T @ X) @ X.T @ y
return theta
except np.linalg.LinAlgError:
print('The matrix is singular')
print("X.T @ X = \n", X.T @ X)
return None
### Assignment question: Use np.linalg.solve instead of inv. Why is this better?solve_normal_equation(X, y)The matrix is singular
X.T @ X =
[[ 3. 6. 12.]
[ 6. 14. 28.]
[12. 28. 56.]]
np.linalg.matrix_rank(X), np.linalg.matrix_rank(X.T @ X)(2, 2)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
data = np.array([x1, x2]).T
lr.fit(data, y)
lr.coef_, lr.intercept_
# Assignment question: figure why sklearn is able to solve the problem(array([0.4, 0.8]), 2.0)
# Regularization
eps = 1e-5
X = np.array([all_ones, x1, x2]).T
X = np.eye(3)*eps + X
Xarray([[1.00001, 1. , 2. ],
[1. , 2.00001, 4. ],
[1. , 3. , 6.00001]])
np.linalg.matrix_rank(X)3
solve_normal_equation(X, y)array([2.00023248, 1.19987743, 0.40001887])
# Drop variables
X = np.array([all_ones, x1]).T
print(X)[[1. 1.]
[1. 2.]
[1. 3.]]
solve_normal_equation(X, y)array([2., 2.])
# Dummy variables
## dataset
num_records = 12
windspeed = np.random.randint(0, 10, num_records)
vehicles = np.random.randint(100, 500, num_records)
direction = np.random.choice(['N', 'S', 'E', 'W'], num_records)
pollution = np.random.randint(0, 100, num_records)
df = pd.DataFrame({'windspeed': windspeed, 'vehicles': vehicles, 'direction': direction, 'pollution': pollution})
df| windspeed | vehicles | direction | pollution | |
|---|---|---|---|---|
| 0 | 0 | 355 | W | 30 |
| 1 | 2 | 367 | S | 30 |
| 2 | 2 | 447 | S | 78 |
| 3 | 1 | 223 | E | 32 |
| 4 | 1 | 272 | S | 16 |
| 5 | 9 | 394 | S | 36 |
| 6 | 0 | 333 | N | 45 |
| 7 | 3 | 308 | W | 52 |
| 8 | 7 | 480 | N | 24 |
| 9 | 9 | 360 | N | 74 |
| 10 | 0 | 125 | S | 36 |
| 11 | 9 | 401 | S | 62 |
def fit_data(df, X, y):
try:
lr = LinearRegression()
lr.fit(X, y)
rep = f"y = {lr.intercept_:0.2f}"
for i, coef in enumerate(lr.coef_):
rep += f" + {coef:0.2f}*{df.columns[i]}"
return rep
except Exception as e:
print(e)
return None
fit_data(df, df[df.columns[:-1]], df['pollution'])could not convert string to float: 'W'
# Ordinal encoding
from sklearn.preprocessing import OrdinalEncoderenc = OrdinalEncoder()df2 = df.copy()
df2['direction'] = enc.fit_transform(df[['direction']]).flatten()
df2| windspeed | vehicles | direction | pollution | |
|---|---|---|---|---|
| 0 | 0 | 355 | 3.0 | 30 |
| 1 | 2 | 367 | 2.0 | 30 |
| 2 | 2 | 447 | 2.0 | 78 |
| 3 | 1 | 223 | 0.0 | 32 |
| 4 | 1 | 272 | 2.0 | 16 |
| 5 | 9 | 394 | 2.0 | 36 |
| 6 | 0 | 333 | 1.0 | 45 |
| 7 | 3 | 308 | 3.0 | 52 |
| 8 | 7 | 480 | 1.0 | 24 |
| 9 | 9 | 360 | 1.0 | 74 |
| 10 | 0 | 125 | 2.0 | 36 |
| 11 | 9 | 401 | 2.0 | 62 |
fit_data(df2, df2[df2.columns[:-1]], df2['pollution'])'y = 26.49 + 1.49*windspeed + 0.03*vehicles + 1.02*direction'
pd.Series({x: i for i, x in enumerate(enc.categories_[0])})E 0
N 1
S 2
W 3
dtype: int64
# One-hot encoding
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse_output=False)direction_ohe = ohe.fit_transform(df[['direction']])
direction_ohearray([[0., 0., 0., 1.],
[0., 0., 1., 0.],
[0., 0., 1., 0.],
[1., 0., 0., 0.],
[0., 0., 1., 0.],
[0., 0., 1., 0.],
[0., 1., 0., 0.],
[0., 0., 0., 1.],
[0., 1., 0., 0.],
[0., 1., 0., 0.],
[0., 0., 1., 0.],
[0., 0., 1., 0.]])
col_names_ohe = [f"Is it {x}?" for x in enc.categories_[0]]direction_ohe_df = pd.DataFrame(direction_ohe, columns=col_names_ohe)
direction_ohe_df| Is it E? | Is it N? | Is it S? | Is it W? | |
|---|---|---|---|---|
| 0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | 1.0 | 0.0 | 0.0 | 0.0 |
| 4 | 0.0 | 0.0 | 1.0 | 0.0 |
| 5 | 0.0 | 0.0 | 1.0 | 0.0 |
| 6 | 0.0 | 1.0 | 0.0 | 0.0 |
| 7 | 0.0 | 0.0 | 0.0 | 1.0 |
| 8 | 0.0 | 1.0 | 0.0 | 0.0 |
| 9 | 0.0 | 1.0 | 0.0 | 0.0 |
| 10 | 0.0 | 0.0 | 1.0 | 0.0 |
| 11 | 0.0 | 0.0 | 1.0 | 0.0 |
# Confirm that we can write Is it W? as a linear combination of the other columns
1-direction_ohe_df[["Is it N?", "Is it S?", "Is it E?"]].sum(axis=1) - direction_ohe_df["Is it W?"]0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
5 0.0
6 0.0
7 0.0
8 0.0
9 0.0
10 0.0
11 0.0
dtype: float64
X = np.hstack([df[['windspeed', 'vehicles']].values, direction_ohe])Xarray([[ 0., 355., 0., 0., 0., 1.],
[ 2., 367., 0., 0., 1., 0.],
[ 2., 447., 0., 0., 1., 0.],
[ 1., 223., 1., 0., 0., 0.],
[ 1., 272., 0., 0., 1., 0.],
[ 9., 394., 0., 0., 1., 0.],
[ 0., 333., 0., 1., 0., 0.],
[ 3., 308., 0., 0., 0., 1.],
[ 7., 480., 0., 1., 0., 0.],
[ 9., 360., 0., 1., 0., 0.],
[ 0., 125., 0., 0., 1., 0.],
[ 9., 401., 0., 0., 1., 0.]])
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])X_augarray([[ 1., 0., 355., 0., 0., 0., 1.],
[ 1., 2., 367., 0., 0., 1., 0.],
[ 1., 2., 447., 0., 0., 1., 0.],
[ 1., 1., 223., 1., 0., 0., 0.],
[ 1., 1., 272., 0., 0., 1., 0.],
[ 1., 9., 394., 0., 0., 1., 0.],
[ 1., 0., 333., 0., 1., 0., 0.],
[ 1., 3., 308., 0., 0., 0., 1.],
[ 1., 7., 480., 0., 1., 0., 0.],
[ 1., 9., 360., 0., 1., 0., 0.],
[ 1., 0., 125., 0., 0., 1., 0.],
[ 1., 9., 401., 0., 0., 1., 0.]])
X_aug.shape(12, 7)
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape(6, 6, (7, 7))
pd.DataFrame(X_aug.T @ X_aug)| 0 | 1 | 2 | 3 | 4 | 5 | 6 | |
|---|---|---|---|---|---|---|---|
| 0 | 12.0 | 43.0 | 4065.0 | 1.0 | 3.0 | 6.0 | 2.0 |
| 1 | 43.0 | 311.0 | 16802.0 | 1.0 | 16.0 | 23.0 | 3.0 |
| 2 | 4065.0 | 16802.0 | 1481651.0 | 223.0 | 1173.0 | 2006.0 | 663.0 |
| 3 | 1.0 | 1.0 | 223.0 | 1.0 | 0.0 | 0.0 | 0.0 |
| 4 | 3.0 | 16.0 | 1173.0 | 0.0 | 3.0 | 0.0 | 0.0 |
| 5 | 6.0 | 23.0 | 2006.0 | 0.0 | 0.0 | 6.0 | 0.0 |
| 6 | 2.0 | 3.0 | 663.0 | 0.0 | 0.0 | 0.0 | 2.0 |
ohe = OneHotEncoder(sparse_output=False, drop='first')
ohe.fit_transform(df[['direction']])array([[0., 0., 1.],
[0., 1., 0.],
[0., 1., 0.],
[0., 0., 0.],
[0., 1., 0.],
[0., 1., 0.],
[1., 0., 0.],
[0., 0., 1.],
[1., 0., 0.],
[1., 0., 0.],
[0., 1., 0.],
[0., 1., 0.]])
direction_ohe_n_1 = ohe.fit_transform(df[['direction']])
col_names_ohe_n_1 = [f"Is it {x}?" for x in enc.categories_[0][1:]]
df_ohe_n_1 = pd.DataFrame(direction_ohe_n_1, columns=col_names_ohe_n_1)
df_ohe_n_1| Is it N? | Is it S? | Is it W? | |
|---|---|---|---|
| 0 | 0.0 | 0.0 | 1.0 |
| 1 | 0.0 | 1.0 | 0.0 |
| 2 | 0.0 | 1.0 | 0.0 |
| 3 | 0.0 | 0.0 | 0.0 |
| 4 | 0.0 | 1.0 | 0.0 |
| 5 | 0.0 | 1.0 | 0.0 |
| 6 | 1.0 | 0.0 | 0.0 |
| 7 | 0.0 | 0.0 | 1.0 |
| 8 | 1.0 | 0.0 | 0.0 |
| 9 | 1.0 | 0.0 | 0.0 |
| 10 | 0.0 | 1.0 | 0.0 |
| 11 | 0.0 | 1.0 | 0.0 |
X = np.hstack([df[['windspeed', 'vehicles']].values, df_ohe_n_1.values])
X_aug = np.hstack([np.ones((X.shape[0], 1)), X])
X_augarray([[ 1., 0., 355., 0., 0., 1.],
[ 1., 2., 367., 0., 1., 0.],
[ 1., 2., 447., 0., 1., 0.],
[ 1., 1., 223., 0., 0., 0.],
[ 1., 1., 272., 0., 1., 0.],
[ 1., 9., 394., 0., 1., 0.],
[ 1., 0., 333., 1., 0., 0.],
[ 1., 3., 308., 0., 0., 1.],
[ 1., 7., 480., 1., 0., 0.],
[ 1., 9., 360., 1., 0., 0.],
[ 1., 0., 125., 0., 1., 0.],
[ 1., 9., 401., 0., 1., 0.]])
np.linalg.matrix_rank(X_aug), np.linalg.matrix_rank(X_aug.T @ X_aug), (X_aug.T @ X_aug).shape(6, 6, (6, 6))
# Interepeting dummy variables
## dataset
X = np.array(['F', 'F', 'F', 'M', 'M'])
y = np.array([5, 5.2, 5.4, 5.8, 6])from sklearn.preprocessing import LabelBinarizer
l = LabelBinarizer()
l.fit_transform(X)array([[0],
[0],
[0],
[1],
[1]])
X_binary = 1 - l.fit_transform(X)X_binary array([[1],
[1],
[1],
[0],
[0]])
lr = LinearRegression()
lr.fit(X_binary, y)LinearRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
lr.coef_, lr.intercept_(array([-0.7]), 5.8999999999999995)
y[(X_binary==0).flatten()].mean()5.9
y[(X_binary==1).flatten()].mean()5.2