# Numpy Pandas Basics

R Yeeshu Dhurandhar, Nipun Batra  
2023-12-31

<figure>
<a
href="https://colab.research.google.com/github/nipunbatra/ml-teaching/blob/master/notebooks/numpy-pandas-basics.ipynb"><img
src="https://colab.research.google.com/assets/colab-badge.svg" /></a>
<figcaption>Open In Colab</figcaption>
</figure>

In [2]:
# importing necessary libraries
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Lists

In [3]:
# Creating lists
list_a = [1, 2, 3, 4, 5]
list_b = [6, 7, 8, 9, 10]

In [4]:
# Element-wise operations on plain Python lists
# Addition: zip pairs up the items, the comprehension adds each pair
list_sum = [x + y for x, y in zip(list_a, list_b)]
print("List Sum:", list_sum)

# Element-wise product of the two lists (printed as "Vector Product")
vector_product = [x * y for x, y in zip(list_a, list_b)]
print("Vector Product:", vector_product)

List Sum: [7, 9, 11, 13, 15]
Vector Product: [6, 14, 24, 36, 50]

# Numpy Array

In [5]:
# Creating numpy arrays
numpy_array_a = np.array(list_a)
numpy_array_b = np.array(list_b)

In [6]:
# Element-wise operations on numpy arrays: no explicit loop is needed
# Addition with the + operator
numpy_sum = numpy_array_a + numpy_array_b
print("Numpy Sum:", numpy_sum)

# Element-wise product with the * operator (printed as "Numpy Vector Product")
numpy_vector_product = numpy_array_a * numpy_array_b
print("Numpy Vector Product:", numpy_vector_product)

Numpy Sum: [ 7  9 11 13 15]
Numpy Vector Product: [ 6 14 24 36 50]

In [7]:
np.allclose(list_sum, numpy_sum), np.allclose(vector_product, numpy_vector_product)

(True, True)

# Time comparison between list and numpy array

In [8]:
# Build two large random integer arrays (10,000 values drawn from [0, 100))
# to use as inputs for the list-vs-numpy timing comparison.
numpy_array_a = np.random.randint(0, 100, size=10000)
numpy_array_b = np.random.randint(0, 100, size=10000)

# .tolist() converts every element to a native Python int. list(array) would
# instead keep numpy scalar objects, so the "list" benchmark would time
# numpy-scalar arithmetic rather than ordinary Python list arithmetic.
list_a = numpy_array_a.tolist()
list_b = numpy_array_b.tolist()

In [9]:
# Time 1000 repetitions of element-wise addition on Python lists.
# time.perf_counter() is a monotonic, high-resolution clock meant for
# measuring elapsed intervals; time.time() is wall-clock time and can be
# adjusted while the benchmark is running.
start_time = time.perf_counter()
for _ in range(1000):
    list_sum = [a + b for a, b in zip(list_a, list_b)]
end_time = time.perf_counter()
print("Time taken for lists addition:", end_time - start_time)

# Time the same 1000 repetitions using numpy's vectorised addition
start_time = time.perf_counter()
for _ in range(1000):
    numpy_sum = numpy_array_a + numpy_array_b
end_time = time.perf_counter()
print("Time taken for numpy addition:", end_time - start_time)

Time taken for lists addition: 0.5500102043151855
Time taken for numpy addition: 0.0038487911224365234

In [10]:
# Time 10000 repetitions of the element-wise product on Python lists.
# time.perf_counter() is used because it is a monotonic, high-resolution
# clock meant for measuring elapsed intervals, unlike wall-clock time.time().
start_time = time.perf_counter()
for _ in range(10000):
    list_product = [a * b for a, b in zip(list_a, list_b)]

end_time = time.perf_counter()
print("Time taken for list vector product:", end_time - start_time)

# Time the same 10000 repetitions using numpy's vectorised multiply
start_time = time.perf_counter()
for _ in range(10000):
    numpy_product = np.multiply(numpy_array_a, numpy_array_b)

end_time = time.perf_counter()
print("Time taken for numpy vector product:", end_time - start_time)

Time taken for list vector product: 5.371699571609497
Time taken for numpy vector product: 0.047417640686035156

In [11]:
# Verify the results computed on the large inputs: list_product and
# numpy_product come from the timing cell just above, whereas vector_product
# and numpy_vector_product still hold the 5-element results from the start
# of the notebook and would make this check meaningless.
np.allclose(list_sum, numpy_sum), np.allclose(list_product, numpy_product)

(True, True)

In [12]:
timeit_add_list = %timeit -o [a + b for a, b in zip(list_a, list_b)]

542 µs ± 593 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)

In [13]:
timeit_add_numpy = %timeit -o numpy_array_a + numpy_array_b

3.5 µs ± 6.1 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)

# Code clarity

In [14]:
# The numpy form states the intent directly, without an explicit zip/loop
# Example: element-wise product of the same data, as lists and as arrays
list_product = [x * y for x, y in zip(list_a, list_b)]
numpy_product = numpy_array_a * numpy_array_b

In [15]:
numpy_product

array([5950, 1995,  264, ..., 2436,  928,  665])

In [16]:
# Inner (dot) product of the two 1-D arrays: sum of the element-wise products
np.dot(numpy_array_a, numpy_array_b)

24470992

# Reading CSV file using Numpy

In [17]:
# Shell escape: show the first lines of the raw CSV file with `head`
!head ../datasets/tennis-discrete-output.csv

Day,Outlook,Temp,Humidity,Windy,Play
D1,Sunny,Hot,High,Weak,No
D2,Sunny,Hot,High,Strong,No
D3,Overcast,Hot,High,Weak,Yes
D4,Rain,Mild,High,Weak,Yes
D5,Rain,Cool,Normal,Weak,Yes
D6,Rain,Cool,Normal,Strong,No
D7,Overcast,Cool,Normal,Strong,Yes
D8,Sunny,Mild,High,Weak,No
D9,Sunny,Cool,Normal,Weak,Yes

In [19]:
# IPython help syntax: a trailing ? displays the docstring of np.genfromtxt
np.genfromtxt?

In [18]:
# Read the CSV with genfromtxt without passing a dtype. Every field in this
# file is text, and the array displayed below contains only nan.
data = np.genfromtxt('../datasets/tennis-discrete-output.csv', delimiter=',')
data

array([[nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan],
       [nan, nan, nan, nan, nan, nan]])

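Wait! What happened? By default `np.genfromtxt` tries to parse every field as a floating-point number. The fields in this file are text (for example `Sunny`), which cannot be converted, so every entry becomes `nan`.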

In [23]:
# Same call, now with dtype=str so that each field is kept as text.
# Note that the header line is read as an ordinary row (row 0 of the result).
data = np.genfromtxt('../datasets/tennis-discrete-output.csv', delimiter=',', dtype=str)
data

array([['Day', 'Outlook', 'Temp', 'Humidity', 'Windy', 'Play'],
       ['D1', 'Sunny', 'Hot', 'High', 'Weak', 'No'],
       ['D2', 'Sunny', 'Hot', 'High', 'Strong', 'No'],
       ['D3', 'Overcast', 'Hot', 'High', 'Weak', 'Yes'],
       ['D4', 'Rain', 'Mild', 'High', 'Weak', 'Yes'],
       ['D5', 'Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['D6', 'Rain', 'Cool', 'Normal', 'Strong', 'No'],
       ['D7', 'Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
       ['D8', 'Sunny', 'Mild', 'High', 'Weak', 'No'],
       ['D9', 'Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
       ['D10', 'Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
       ['D11', 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
       ['D12', 'Overcast', 'Mild', 'High', 'Strong', 'Yes'],
       ['D13', 'Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
       ['D14', 'Rain', 'Mild', 'High', 'Strong', 'No']], dtype='<U8')

In [24]:
data.shape

(15, 6)

Question: Find the outlook on D11

In [25]:
# Position of the first row whose first column (the Day label) equals 'D11'
idx = np.flatnonzero(data[:, 0] == 'D11')[0]
idx

11

In [18]:
data[idx]

array(['D11', 'Sunny', 'Mild', 'Normal', 'Strong', 'Yes'], dtype='<U8')

In [19]:
# Column 1 is the Outlook field; pick it from row idx with a single index
data[idx, 1]

'Sunny'

# Reading CSV file using Pandas

In [32]:
# Load the same CSV file into a pandas DataFrame
df = pd.read_csv('../datasets/tennis-discrete-output.csv')

In [33]:
df

In [34]:
df['Day'] == 'D11'

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10     True
11    False
12    False
13    False
Name: Day, dtype: bool

In [35]:
df[df['Day'] == 'D11']

In [65]:
# Select the matching rows and the Outlook column in one .loc call
df.loc[df['Day'] == 'D11', 'Outlook']

10    Sunny
Name: Outlook, dtype: object

In [66]:
df.query('Day == "D11"')['Outlook']

10    Sunny
Name: Outlook, dtype: object

In [36]:
df.shape

(14, 6)

Question: How many times do we play vs. not play tennis?

In [44]:
ser = df['Play']
ser

0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
13     No
Name: Play, dtype: object

In [40]:
unique_play_options = df['Play'].unique()
unique_play_options

array(['No', 'Yes'], dtype=object)

In [41]:
# For each distinct Play value, count the rows that carry it:
# .eq() gives a boolean Series and .sum() counts its True entries
for option in unique_play_options:
    print(option, df['Play'].eq(option).sum())


No 5
Yes 9

In [42]:
df['Play'].value_counts()

Play
Yes    9
No     5
Name: count, dtype: int64

In [46]:
df.groupby('Play').size()

Play
No     5
Yes    9
dtype: int64

In [48]:
gby = df.groupby('Play')

In [51]:
# gby.groups maps each Play value to the row labels in that group;
# the length of each label collection is the group's size
{play_value: len(row_labels) for play_value, row_labels in gby.groups.items()}

{'No': 5, 'Yes': 9}

In [52]:
pd.crosstab(index=df['Play'], columns='count')

What is the distribution of any given attribute?

In [53]:
def distribution(df, attribute, normalize=False):
    """Count how often each distinct value of a column occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains the column.
    attribute : str
        Label of the column to summarise.
    normalize : bool, default False
        If True, return relative frequencies (which sum to 1) instead of
        raw counts.

    Returns
    -------
    pandas.Series
        Indexed by the distinct values of the column, ordered from the most
        to the least frequent.

    Raises
    ------
    KeyError
        If ``attribute`` is not a column of ``df``.
    """
    return df[attribute].value_counts(normalize=normalize)

In [55]:
ser = distribution(df, 'Outlook')

In [56]:
ser

Outlook
Sunny       5
Rain        5
Overcast    4
Name: count, dtype: int64

In [57]:
type(ser)

pandas.core.series.Series

In [58]:
ser.values

array([5, 5, 4])

In [59]:
ser.index

Index(['Sunny', 'Rain', 'Overcast'], dtype='object', name='Outlook')

In [29]:
distribution(df, 'Temp')

Temp
Mild    6
Hot     4
Cool    4
Name: count, dtype: int64

Finding entropy for target variable

In [60]:
target_attribute = 'Play'
dist_target = distribution(df, target_attribute)

In [61]:
dist_target

Play
Yes    9
No     5
Name: count, dtype: int64

Normalize distribution

In [32]:
dist_target/dist_target.sum()

Play
Yes    0.642857
No     0.357143
Name: count, dtype: float64

In [33]:
df['Play'].value_counts(normalize=True)

Play
Yes    0.642857
No     0.357143
Name: proportion, dtype: float64

In [63]:
normalized_dist_target = dist_target/dist_target.sum()

Calculating entropy with a for loop, using $H = -\sum_i p_i \log_2 p_i$

In [64]:
# Accumulate -p * log2(p) over the class probabilities, one term at a time.
# The 1e-6 inside the logarithm guards against log2(0) for a zero probability.
e = 0.0
for _label, p in normalized_dist_target.items():
    e -= p * np.log2(p + 1e-6)
print(e)

0.9402830732836911

In [65]:
normalized_dist_target.apply(lambda x: -x * np.log2(x + 1e-6))

Play
Yes    0.409775
No     0.530508
Name: count, dtype: float64

In [36]:
normalized_dist_target.apply(lambda x: -x * np.log2(x + 1e-6)).sum()

0.9402830732836911

More on crosstab

In [37]:
pd.crosstab(index=df['Outlook'], columns=df['Play'])

In [38]:
pd.crosstab(index=df['Outlook'], columns=df['Play']).T

In [70]:
# Cross-tabulate Play (rows) against Outlook (columns).
# normalize='columns' divides each column by its own total, so every Outlook
# column holds the fractions of 'No' and 'Yes' for that outlook and sums to 1.
df_attr = pd.crosstab(index=df['Play'], columns=df['Outlook'], normalize='columns')
df_attr

Using groupby

In [71]:
df.groupby(['Play', 'Outlook']).size()

Play  Outlook 
No    Rain        2
      Sunny       3
Yes   Overcast    4
      Rain        3
      Sunny       2
dtype: int64

In [72]:
df.groupby(['Play', 'Outlook']).size().index

MultiIndex([( 'No',     'Rain'),
            ( 'No',    'Sunny'),
            ('Yes', 'Overcast'),
            ('Yes',     'Rain'),
            ('Yes',    'Sunny')],
           names=['Play', 'Outlook'])

In [73]:
df.groupby(['Play', 'Outlook']).size().unstack('Outlook')

In [74]:
# Build the Play x Outlook count table from the groupby result.
# The grouped sizes above have no ('No', 'Overcast') entry, so unstack leaves
# that cell missing; fillna(0) replaces it with a zero count.
df_attr_groupby = df.groupby(['Play', 'Outlook']).size().unstack('Outlook').fillna(0)
df_attr_groupby

Apply

In [75]:
# Compute -p * log2(p) for every entry of df_attr; with axis=0 the lambda
# receives one Outlook column at a time.
# The 1e-6 inside the logarithm avoids log2(0) for zero probabilities.
# NOTE(review): the epsilon slightly biases each term - the Overcast total
# shown further down comes out as -0.000001 rather than 0.
neg_plogp = df_attr.apply(lambda x: -x * np.log2(x + 1e-6), axis=0)
neg_plogp

In [76]:
neg_plogp.sum(axis=0).sort_index()

Outlook
Overcast   -0.000001
Rain        0.970948
Sunny       0.970948
dtype: float64

In [77]:
df_attr_dist = distribution(df, 'Outlook')
norm_attr_dist = df_attr_dist/df_attr_dist.sum()
norm_attr_dist

Outlook
Sunny       0.357143
Rain        0.357143
Overcast    0.285714
Name: count, dtype: float64

In [78]:
# Weighted average of the per-Outlook entropies: each outlook's entropy
# (column sum of neg_plogp) is multiplied by the fraction of rows with that
# outlook. The two Series are matched on their Outlook index labels by the
# multiplication, so their different row order does not matter.
(norm_attr_dist*neg_plogp.sum(axis=0).sort_index()).sum()

0.6935336657070463