Sklearn on GPU

ML
Tutorial
Author

Kalash Kankaria

Published

February 12, 2024

References:

https://scikit-learn.org/stable/modules/array_api.html

https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn.decomposition.PCA

https://github.com/data-apis/array-api-compat

https://labs.quansight.org/blog/array-api-support-scikit-learn

Install scikit-learn 1.4 (the latest version at the time of writing)

! pip install scikit-learn==1.4
Collecting scikit-learn==1.4
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.1/12.1 MB 57.9 MB/s eta 0:00:00
Requirement already satisfied: numpy<2.0,>=1.19.5 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.4) (1.23.5)
Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.4) (1.11.4)
Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.4) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.4) (3.2.0)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.4.0
Unable to display output for mime type(s): application/vnd.colab-display-data+json

Install the array-api-compat library, which scikit-learn needs in order to work with Array API inputs such as PyTorch tensors

!python -m pip install array-api-compat
Collecting array-api-compat
  Downloading array_api_compat-1.4.1-py3-none-any.whl (30 kB)
Installing collected packages: array-api-compat
Successfully installed array-api-compat-1.4.1

Imports

import sklearn
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.datasets import make_classification
import torch

# Sanity check: confirm the interpreter picked up the upgraded scikit-learn.
print(sklearn.__version__)
1.4.0

Create Classification Data

# Synthetic classification problem: 500,000 samples x 300 features, generated
# with a fixed seed so every run sees the same data.
X_np, y_np = make_classification(
    n_samples=500_000,
    n_features=300,
    random_state=0,
)
# Bare expression so the notebook displays the (features, labels) pair.
X_np, y_np
(array([[-1.81619458, -1.85550544,  1.0407837 , ...,  0.05972718,
         -0.78920817, -1.17862744],
        [-0.81112283,  1.22246367,  0.5230838 , ...,  0.34797711,
         -0.96876649, -1.09807831],
        [ 0.2102088 ,  1.78862513,  0.19437414, ...,  1.05634817,
         -1.71915482,  0.09357639],
        ...,
        [ 0.41402928,  0.89958434,  0.98362959, ..., -0.80880198,
          0.27123176, -0.52871488],
        [-0.87256303,  1.32778686, -0.97017018, ...,  0.22443048,
         -0.80234779,  0.78283347],
        [ 0.50504494, -0.39312103,  1.56072382, ...,  2.01801143,
         -0.34582501, -0.71471391]]),
 array([0, 1, 1, ..., 0, 1, 0]))

Enable Array API dispatch globally

# Turn on scikit-learn's (experimental) Array API dispatch for the whole
# session so the estimators below can be fitted on torch tensors directly.
sklearn.set_config(array_api_dispatch=True)

LinearDiscriminantAnalysis

Sklearn Performance on CPU

import time

# Time the whole CPU pipeline: tensor conversion, fit and predict.
t1 = time.perf_counter()

# No dtype/device is given, so the tensors keep NumPy's dtypes and stay on the CPU.
# NOTE(review): the GPU run further down casts to float32 and converts outside
# its timed region, so the two timings are not strictly like-for-like.
X_torch_cpu, y_torch_cpu = torch.asarray(X_np), torch.asarray(y_np)

lda_torch_cpu = LinearDiscriminantAnalysis()
lda_torch_cpu.fit(X_torch_cpu, y_torch_cpu)

predictions = lda_torch_cpu.predict(X_torch_cpu)

# Show where the predictions live and which array type they are.
print(predictions.device)
print(type(predictions))

t2 = time.perf_counter()
print('time taken to run:',t2-t1)
cpu
<class 'torch.Tensor'>
time taken to run: 30.194514389999767

Sklearn Performance on GPU

# Copy the NumPy data to the CUDA device, casting to float32 on the way.
# NOTE(review): the labels are cast to float32 as well, presumably mirroring the
# scikit-learn Array API docs example; confirm an integer dtype is not preferable.
X_torch_cuda = torch.asarray(X_np, device="cuda", dtype=torch.float32)
y_torch_cuda = torch.asarray(y_np, device="cuda", dtype=torch.float32)
# Both are torch tensors; note that the type alone does not show the device.
print(type(X_torch_cuda))
print(type(y_torch_cuda))
<class 'torch.Tensor'>
<class 'torch.Tensor'>
import time

# CUDA kernels are launched asynchronously. Wait for any GPU work that is still
# pending (e.g. the host-to-device copy/cast) so it is not billed to this run.
torch.cuda.synchronize()
t1 = time.perf_counter()

lda_torch_cuda = LinearDiscriminantAnalysis()
lda_torch_cuda.fit(X_torch_cuda, y_torch_cuda)
predictions = lda_torch_cuda.predict(X_torch_cuda)

# Show where the predictions live and which array type they are.
print(predictions.device)
print(type(predictions))

# Block until fit/predict have actually finished on the device; without this
# the timer may stop while kernels are still queued and under-report GPU time.
torch.cuda.synchronize()
t2 = time.perf_counter()
print('time taken to run:',t2-t1)
cuda:0
<class 'torch.Tensor'>
time taken to run: 0.5633445190001112
# `predictions` is still a torch.Tensor; nothing has been converted to NumPy yet.
print(type(predictions))
<class 'torch.Tensor'>

Train the model on the GPU and transfer it to the CPU for deployment

from sklearn.utils._array_api import _estimator_with_converted_arrays


def tensor_to_ndarray(array):
    """Move a torch tensor to the CPU and expose it as a NumPy array."""
    return array.cpu().numpy()


# NOTE: `_estimator_with_converted_arrays` is a private (underscore-prefixed)
# scikit-learn helper, so it may change between releases. It is used here to
# build an estimator whose fitted array attributes went through the converter.
lda_np = _estimator_with_converted_arrays(lda_torch_cuda, tensor_to_ndarray)
# The converted estimator is applied to plain NumPy input, no GPU involved.
X_trans = lda_np.transform(X_np)

print(type(X_np))
print(type(X_trans))
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>

PCA Decomposition

from sklearn.decomposition import PCA
import numpy as np

# 1,000,000 x 100 uniform random values in [0, 1); as float64 that is ~800 MB.
# NOTE(review): unseeded, unlike the make_classification data (random_state=0),
# so the printed PCA values will differ slightly from run to run.
X_np = np.random.rand(1000000, 100)

Sklearn Performance on CPU

# No dtype given, so the CPU tensor keeps NumPy's float64 (the GPU run casts to
# float32, so the two timings are not strictly like-for-like).
X_torch_cpu = torch.asarray(X_np)

# Only fit and the two prints are timed; the conversion above is excluded.
t1 = time.perf_counter()

# NOTE(review): svd_solver is pinned to 'full', presumably because Array API
# support in PCA is limited to specific solvers; confirm against the docs.
pca_cpu = PCA(n_components=2, svd_solver='full')
pca_cpu.fit(X_torch_cpu)
# Fitted attributes come back as torch tensors.
print(pca_cpu.explained_variance_ratio_)
print(pca_cpu.singular_values_)

t2 = time.perf_counter()
print('time taken to run:',t2-t1)
tensor([0.0102, 0.0102], dtype=torch.float64)
tensor([291.4060, 291.3705], dtype=torch.float64)
time taken to run: 10.91760363100002

Sklearn Performance on GPU

# Copy the data to the CUDA device, casting to float32 on the way.
X_torch_cuda = torch.asarray(X_np, device="cuda", dtype=torch.float32)

# CUDA work is asynchronous: make sure the transfer/cast above has finished
# before starting the clock so it is not counted as part of the PCA fit.
torch.cuda.synchronize()
t1 = time.perf_counter()

pca_cuda = PCA(n_components=2, svd_solver='full')
pca_cuda.fit(X_torch_cuda)
# Fitted attributes come back as torch tensors on the CUDA device.
print(pca_cuda.explained_variance_ratio_)
print(pca_cuda.singular_values_)

# Explicitly wait for all queued GPU work before reading the timer, so the
# measurement does not depend on the prints above happening to force a sync.
torch.cuda.synchronize()
t2 = time.perf_counter()
print('time taken to run:',t2-t1)
tensor([0.0102, 0.0102], device='cuda:0')
tensor([291.4084, 291.3728], device='cuda:0')
time taken to run: 0.278887709999708