Set

ML
Author

Nipun Batra

Published

December 7, 2024

import matplotlib.pyplot as plt
import numpy as np
import torch 
import pandas as pd
# Retina mode
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
A = np.linspace(0, 1, 100)
A = np.arange(0, 1, 0.01)
# Samples from [0, 1), an uncountable set: even 100 million draws are all distinct
x = np.random.rand(100000000)
#print(x)
np.unique(x).size
100000000
# Set of lines in 2d
def line_fx(x, m, c):
    """Evaluate the straight line y = m*x + c at x.

    x may be a scalar or a NumPy array; m is the slope and c the intercept.
    """
    scaled = m * x
    return scaled + c

line_fx(1, 2, 3)
5
x_lin = np.linspace(-10, 10, 100)
y_lin_2_3 = line_fx(x_lin, 2, 3)

plt.plot(x_lin, y_lin_2_3)

from ipywidgets import interact
import ipywidgets as widgets



def plot_line(m, c):
    """Draw the line y = m*x + c for x in [-10, 10] on a new 8x6 figure.

    m is the slope and c the intercept; both are passed on to line_fx.
    """
    xs = np.linspace(-10, 10, 100)
    ys = line_fx(xs, m, c)

    _, ax = plt.subplots(figsize=(8, 6))
    ax.plot(xs, ys, label=f'Line: y = {m}x + {c}')

    # Dashed reference lines through the origin.
    ax.axhline(0, color='black', linewidth=0.8, linestyle='--')
    ax.axvline(0, color='black', linewidth=0.8, linestyle='--')

    ax.grid(alpha=0.5)
    ax.legend()
    ax.set_title("Interactive Line Plot")
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    # Clamp the visible y-range to [-10, 10].
    ax.set_ylim(-10, 10)
    plt.show()

# Interactive widget
interact(plot_line, 
         m=widgets.FloatSlider(value=1, min=-10, max=10, step=0.1, description='Slope (m)'),
         c=widgets.FloatSlider(value=0, min=-10, max=10, step=0.1, description='Intercept (c)'));
samples_uniform = np.random.rand(50000)
plt.hist(samples_uniform)
(array([5044., 4923., 5075., 4798., 4977., 4982., 4989., 5158., 4992.,
        5062.]),
 array([3.93041625e-05, 1.00032757e-01, 2.00026211e-01, 3.00019664e-01,
        4.00013117e-01, 5.00006570e-01, 6.00000024e-01, 6.99993477e-01,
        7.99986930e-01, 8.99980383e-01, 9.99973836e-01]),
 <BarContainer object of 10 artists>)

samples_uniform
array([0.18674884, 0.81968297, 0.80087693, ..., 0.46548473, 0.47943922,
       0.73968035])
samples_normal = np.random.randn(50000)
plt.hist(samples_normal)
(array([   23.,   285.,  1985.,  7467., 14803., 14914.,  7988.,  2201.,
          309.,    25.]),
 array([-4.18722189, -3.35305381, -2.51888572, -1.68471764, -0.85054956,
        -0.01638148,  0.8177866 ,  1.65195468,  2.48612276,  3.32029084,
         4.15445892]),
 <BarContainer object of 10 artists>)

samples_normal[:50]
array([-1.02610077,  1.30693882, -0.60887913, -2.21509403, -0.49288884,
       -0.24255606, -0.06006607, -0.43524481,  0.09849432,  0.65723845,
        0.26736775, -0.23655818, -2.18103935, -0.49017392,  1.62213243,
        0.38596106,  0.93529816,  1.08752614, -0.4461042 , -0.95299851,
        1.38512913,  0.09622675, -0.72466762, -0.12871054, -0.50039256,
        0.11997974, -1.54530777,  0.27708632, -1.59812337,  0.91816234,
       -0.07142259, -1.00183667,  0.77816444,  0.24435284,  0.91035827,
        0.60326872,  0.57121044,  1.26167048, -1.15016846, -0.69882365,
       -1.07502868, -0.11305347,  0.82249031,  0.49697962, -1.21883061,
       -1.96468898, -0.01928378, -0.56361649,  0.48693249, -0.27086149])
# Plot 20 lines with slope m and intercept c each drawn uniformly from [-10, 10)
m = np.random.rand(20)*20 - 10
c = np.random.rand(20)*20 - 10

for i in range(20):
    y_lin = line_fx(x_lin, m[i], c[i])
    plt.plot(x_lin, y_lin, label=f'Line: y = {m[i]:.2f}x + {c[i]:.2f}')

np.random.randn?
Signature: np.random.randn(*args)

Docstring:

randn(d0, d1, ..., dn)



Return a sample (or samples) from the "standard normal" distribution.



.. note::

    This is a convenience function for users porting code from Matlab,

    and wraps `standard_normal`. That function takes a

    tuple to specify the size of the output, which is consistent with

    other NumPy functions like `numpy.zeros` and `numpy.ones`.



.. note::

    New code should use the

    `~numpy.random.Generator.standard_normal`

    method of a `~numpy.random.Generator` instance instead;

    please see the :ref:`random-quick-start`.



If positive int_like arguments are provided, `randn` generates an array

of shape ``(d0, d1, ..., dn)``, filled

with random floats sampled from a univariate "normal" (Gaussian)

distribution of mean 0 and variance 1. A single float randomly sampled

from the distribution is returned if no argument is provided.



Parameters

----------

d0, d1, ..., dn : int, optional

    The dimensions of the returned array, must be non-negative.

    If no argument is given a single Python float is returned.



Returns

-------

Z : ndarray or float

    A ``(d0, d1, ..., dn)``-shaped array of floating-point samples from

    the standard normal distribution, or a single such float if

    no parameters were supplied.



See Also

--------

standard_normal : Similar, but takes a tuple as its argument.

normal : Also accepts mu and sigma arguments.

random.Generator.standard_normal: which should be used for new code.



Notes

-----

For random samples from the normal distribution with mean ``mu`` and

standard deviation ``sigma``, use::



    sigma * np.random.randn(...) + mu



Examples

--------

>>> np.random.randn()

2.1923875335537315  # random



Two-by-four array of samples from the normal distribution with

mean 3 and standard deviation 2.5:



>>> 3 + 2.5 * np.random.randn(2, 4)

array([[-4.49401501,  4.00950034, -1.81814867,  7.29718677],   # random

       [ 0.39924804,  4.68456316,  4.99394529,  4.84057254]])  # random

Type:      method
# Plot N lines whose slope m and intercept c are drawn from a normal
# distribution with mean 0 and standard deviation 0.5
N = 5000
m = np.random.randn(N)*0.5
c = np.random.randn(N)*0.5

# Each line is nearly transparent (alpha=0.01), so overlapping lines accumulate.
for slope, intercept in zip(m, c):
    y_lin = line_fx(x_lin, slope, intercept)
    plt.plot(x_lin, y_lin, label=f'Line: y = {slope:.2f}x + {intercept:.2f}',
             color='k', alpha=0.01)

# Set of cosines with varying phase (fixed amplitude and frequency)
def cosine_fx(x, A=1, f=1, phi=0):
    """Evaluate A * cos(2*pi*f*x + phi).

    A is the amplitude, f the frequency and phi the phase offset in radians.
    x may be a scalar or a NumPy array.
    """
    angle = 2*np.pi*f*x + phi
    return A*np.cos(angle)
x_lin = np.linspace(-10, 10, 1000)
y_cos_1_1_0 = cosine_fx(x_lin, 1, 1, 0)

plt.plot(x_lin, y_cos_1_1_0)

def plot_cosine(A, f, phi):
    """Plot A*cos(2*pi*f*x + phi) for x in [0, 2] on a new 8x6 figure.

    A, f and phi are forwarded to cosine_fx as amplitude, frequency and phase.
    """
    x = np.linspace(0, 2, 500)  # x range for visualization
    y = cosine_fx(x, A, f, phi)

    plt.figure(figsize=(8, 6))
    plt.plot(x, y, label=f'Cosine: y = {A}cos(2π{f}x + {phi})')
    plt.axhline(0, color='black', linewidth=0.8, linestyle='--')
    plt.grid(alpha=0.5)
    plt.legend()
    plt.title("Interactive Cosine Plot")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.ylim(-2, 2)
    # Render explicitly, as plot_line does, instead of relying on the
    # backend to flush the figure after the callback returns.
    plt.show()

# Interactive widget
interact(plot_cosine, 
         A=widgets.FloatSlider(value=1, min=0.1, max=2, step=0.1, description='Amplitude (A)'),
         f=widgets.FloatSlider(value=1, min=0.1, max=5, step=0.1, description='Frequency (f)'),
         phi=widgets.FloatSlider(value=0, min=0, max=2*np.pi, step=0.1, description='Phase (φ)'));

# Set datastructure in Python

A = {1, 2, 3, 4, 5}
print(A, type(A))
{1, 2, 3, 4, 5} <class 'set'>
A = set([1, 2, 3, 4, 5])
print(A, type(A))
{1, 2, 3, 4, 5} <class 'set'>
# unique elements
A = {1, 2, 3, 4, 5, 1, 2, 3, 4, 5}
print(A, len(A))
{1, 2, 3, 4, 5} 5
# Can a set contain a set?
A = {1, 2, 3}
for a in A:
    print(a)
1
2
3
# Why doesn't the code below work? (homework)
# Building the literal raises TypeError; catch only that so any other,
# unexpected error still surfaces.
try:
    A = {1, 2, 3, {4, 5}}
except TypeError as e:
    print(e)
unhashable type: 'set'
# subset
A = {1, 2, 3, 4, 5}
B = {1, 2, 3}

print(B.issubset(A))
True
print(A.issubset(B))
False
print(A.issuperset(B))
True
A.issubset(A)
True
# Other methods in set
A
{1, 2, 3, 4, 5}
# check proper subset
def is_proper_subset(A, B):
    """Return True when A is a subset of B and the two are not equal."""
    if not A.issubset(B):
        return False
    return A != B
is_proper_subset({1, 2, 3}, {1, 2, 3, 4})
True
is_proper_subset({1, 2, 3, 4}, {1, 2, 3, 4})
False
is_proper_subset({1, 2, 6}, {1, 2, 3, 4})
False
# empty set subset of every set
empty_set = set()
A = {1, 2, 3}

empty_set.issubset(A)
True
is_proper_subset(empty_set, A)
True
# Be careful when defining an empty set: {} creates an empty dictionary,
# not an empty set. Use set() instead.
empty_set= {}
print(type(empty_set))
<class 'dict'>
# Sets in NumPy
A = np.array([1, 2, 3, 4, 5])
B = np.array([1, 2, 3])

# Is B a subset of A? Every element of B must be found in A.
# np.isin is used because np.in1d is deprecated since NumPy 2.0.
print(np.isin(B, A).all())
True
np.in1d?
Signature:       np.in1d(ar1, ar2, assume_unique=False, invert=False, *, kind=None)

Call signature:  np.in1d(*args, **kwargs)

Type:            _ArrayFunctionDispatcher

String form:     <function in1d at 0x107c70fe0>

File:            ~/mambaforge/lib/python3.12/site-packages/numpy/lib/_arraysetops_impl.py

Docstring:      

Test whether each element of a 1-D array is also present in a second array.



.. deprecated:: 2.0

    Use :func:`isin` instead of `in1d` for new code.



Returns a boolean array the same length as `ar1` that is True

where an element of `ar1` is in `ar2` and False otherwise.



Parameters

----------

ar1 : (M,) array_like

    Input array.

ar2 : array_like

    The values against which to test each value of `ar1`.

assume_unique : bool, optional

    If True, the input arrays are both assumed to be unique, which

    can speed up the calculation.  Default is False.

invert : bool, optional

    If True, the values in the returned array are inverted (that is,

    False where an element of `ar1` is in `ar2` and True otherwise).

    Default is False. ``np.in1d(a, b, invert=True)`` is equivalent

    to (but is faster than) ``np.invert(in1d(a, b))``.

kind : {None, 'sort', 'table'}, optional

    The algorithm to use. This will not affect the final result,

    but will affect the speed and memory use. The default, None,

    will select automatically based on memory considerations.



    * If 'sort', will use a mergesort-based approach. This will have

      a memory usage of roughly 6 times the sum of the sizes of

      `ar1` and `ar2`, not accounting for size of dtypes.

    * If 'table', will use a lookup table approach similar

      to a counting sort. This is only available for boolean and

      integer arrays. This will have a memory usage of the

      size of `ar1` plus the max-min value of `ar2`. `assume_unique`

      has no effect when the 'table' option is used.

    * If None, will automatically choose 'table' if

      the required memory allocation is less than or equal to

      6 times the sum of the sizes of `ar1` and `ar2`,

      otherwise will use 'sort'. This is done to not use

      a large amount of memory by default, even though

      'table' may be faster in most cases. If 'table' is chosen,

      `assume_unique` will have no effect.



    .. versionadded:: 1.8.0



Returns

-------

in1d : (M,) ndarray, bool

    The values `ar1[in1d]` are in `ar2`.



See Also

--------

isin                  : Version of this function that preserves the

                        shape of ar1.



Notes

-----

`in1d` can be considered as an element-wise function version of the

python keyword `in`, for 1-D sequences. ``in1d(a, b)`` is roughly

equivalent to ``np.array([item in b for item in a])``.

However, this idea fails if `ar2` is a set, or similar (non-sequence)

container:  As ``ar2`` is converted to an array, in those cases

``asarray(ar2)`` is an object array rather than the expected array of

contained values.



Using ``kind='table'`` tends to be faster than `kind='sort'` if the

following relationship is true:

``log10(len(ar2)) > (log10(max(ar2)-min(ar2)) - 2.27) / 0.927``,

but may use greater memory. The default value for `kind` will

be automatically selected based only on memory usage, so one may

manually set ``kind='table'`` if memory constraints can be relaxed.



.. versionadded:: 1.4.0



Examples

--------

>>> import numpy as np

>>> test = np.array([0, 1, 2, 5, 0])

>>> states = [0, 2]

>>> mask = np.in1d(test, states)

>>> mask

array([ True, False,  True, False,  True])

>>> test[mask]

array([0, 2, 0])

>>> mask = np.in1d(test, states, invert=True)

>>> mask

array([False,  True, False,  True, False])

>>> test[mask]

array([1, 5])

Class docstring:

Class to wrap functions with checks for __array_function__ overrides.



All arguments are required, and can only be passed by position.



Parameters

----------

dispatcher : function or None

    The dispatcher function that returns a single sequence-like object

    of all arguments relevant.  It must have the same signature (except

    the default values) as the actual implementation.

    If ``None``, this is a ``like=`` dispatcher and the

    ``_ArrayFunctionDispatcher`` must be called with ``like`` as the

    first (additional and positional) argument.

implementation : function

    Function that implements the operation on NumPy arrays without

    overrides.  Arguments passed calling the ``_ArrayFunctionDispatcher``

    will be forwarded to this (and the ``dispatcher``) as if using

    ``*args, **kwargs``.



Attributes

----------

_implementation : function

    The original implementation passed in.
np.isin?
Signature:      

np.isin(

    element,

    test_elements,

    assume_unique=False,

    invert=False,

    *,

    kind=None,

)

Call signature:  np.isin(*args, **kwargs)

Type:            _ArrayFunctionDispatcher

String form:     <function isin at 0x107c711c0>

File:            ~/mambaforge/lib/python3.12/site-packages/numpy/lib/_arraysetops_impl.py

Docstring:      

Calculates ``element in test_elements``, broadcasting over `element` only.

Returns a boolean array of the same shape as `element` that is True

where an element of `element` is in `test_elements` and False otherwise.



Parameters

----------

element : array_like

    Input array.

test_elements : array_like

    The values against which to test each value of `element`.

    This argument is flattened if it is an array or array_like.

    See notes for behavior with non-array-like parameters.

assume_unique : bool, optional

    If True, the input arrays are both assumed to be unique, which

    can speed up the calculation.  Default is False.

invert : bool, optional

    If True, the values in the returned array are inverted, as if

    calculating `element not in test_elements`. Default is False.

    ``np.isin(a, b, invert=True)`` is equivalent to (but faster

    than) ``np.invert(np.isin(a, b))``.

kind : {None, 'sort', 'table'}, optional

    The algorithm to use. This will not affect the final result,

    but will affect the speed and memory use. The default, None,

    will select automatically based on memory considerations.



    * If 'sort', will use a mergesort-based approach. This will have

      a memory usage of roughly 6 times the sum of the sizes of

      `element` and `test_elements`, not accounting for size of dtypes.

    * If 'table', will use a lookup table approach similar

      to a counting sort. This is only available for boolean and

      integer arrays. This will have a memory usage of the

      size of `element` plus the max-min value of `test_elements`.

      `assume_unique` has no effect when the 'table' option is used.

    * If None, will automatically choose 'table' if

      the required memory allocation is less than or equal to

      6 times the sum of the sizes of `element` and `test_elements`,

      otherwise will use 'sort'. This is done to not use

      a large amount of memory by default, even though

      'table' may be faster in most cases. If 'table' is chosen,

      `assume_unique` will have no effect.





Returns

-------

isin : ndarray, bool

    Has the same shape as `element`. The values `element[isin]`

    are in `test_elements`.



Notes

-----



`isin` is an element-wise function version of the python keyword `in`.

``isin(a, b)`` is roughly equivalent to

``np.array([item in b for item in a])`` if `a` and `b` are 1-D sequences.



`element` and `test_elements` are converted to arrays if they are not

already. If `test_elements` is a set (or other non-sequence collection)

it will be converted to an object array with one element, rather than an

array of the values contained in `test_elements`. This is a consequence

of the `array` constructor's way of handling non-sequence collections.

Converting the set to a list usually gives the desired behavior.



Using ``kind='table'`` tends to be faster than `kind='sort'` if the

following relationship is true:

``log10(len(test_elements)) >

(log10(max(test_elements)-min(test_elements)) - 2.27) / 0.927``,

but may use greater memory. The default value for `kind` will

be automatically selected based only on memory usage, so one may

manually set ``kind='table'`` if memory constraints can be relaxed.



.. versionadded:: 1.13.0



Examples

--------

>>> import numpy as np

>>> element = 2*np.arange(4).reshape((2, 2))

>>> element

array([[0, 2],

       [4, 6]])

>>> test_elements = [1, 2, 4, 8]

>>> mask = np.isin(element, test_elements)

>>> mask

array([[False,  True],

       [ True, False]])

>>> element[mask]

array([2, 4])



The indices of the matched values can be obtained with `nonzero`:



>>> np.nonzero(mask)

(array([0, 1]), array([1, 0]))



The test can also be inverted:



>>> mask = np.isin(element, test_elements, invert=True)

>>> mask

array([[ True, False],

       [False,  True]])

>>> element[mask]

array([0, 6])



Because of how `array` handles sets, the following does not

work as expected:



>>> test_set = {1, 2, 4, 8}

>>> np.isin(element, test_set)

array([[False, False],

       [False, False]])



Casting the set to a list gives the expected result:



>>> np.isin(element, list(test_set))

array([[False,  True],

       [ True, False]])

Class docstring:

Class to wrap functions with checks for __array_function__ overrides.



All arguments are required, and can only be passed by position.



Parameters

----------

dispatcher : function or None

    The dispatcher function that returns a single sequence-like object

    of all arguments relevant.  It must have the same signature (except

    the default values) as the actual implementation.

    If ``None``, this is a ``like=`` dispatcher and the

    ``_ArrayFunctionDispatcher`` must be called with ``like`` as the

    first (additional and positional) argument.

implementation : function

    Function that implements the operation on NumPy arrays without

    overrides.  Arguments passed calling the ``_ArrayFunctionDispatcher``

    will be forwarded to this (and the ``dispatcher``) as if using

    ``*args, **kwargs``.



Attributes

----------

_implementation : function

    The original implementation passed in.
A, B
(array([1, 2, 3, 4, 5]), array([1, 2, 3]))
# Element-wise membership of B in A; np.isin supersedes the deprecated np.in1d.
response = np.isin(B, A)
# B is a subset of A exactly when every membership flag is True.
if response.all():
    print("B is a subset of A")
else:
    print("B is not a subset of A")
B is a subset of A
response.all()
np.True_
np.array([False, True]).astype(int).sum()
np.int64(1)
response.astype(int).sum() == len(B)
np.True_
np.isin(B, A).all()
np.True_
# Case where B is not a subset of A
B = np.array([1, 2, 6])
A = np.array([1, 2, 3, 4, 5])

np.isin(B, A).all()
np.False_
empty_set = np.array([])
A = np.array([1, 2, 3])

np.isin(empty_set, A).all()
np.True_
# Visualising sets using Venn diagrams
from matplotlib_venn import venn2

# Define the sets
set1 = {1, 2, 3}
set2 = {2, 3, 5}

# Create the Venn diagram
venn = venn2([set1, set2], ('Set 1', 'Set 2'))

# Define the sets
setA = {1, 2, 3}
setB = {2, 3, 5}

# Create the Venn diagram
venn = venn2([setA, setB], ('A', 'B'))

# Customize the labels to show the elements and sizes
# Region ids: '10' = only in A, '01' = only in B, '11' = intersection (A ∩ B)
region_contents = {
    '10': ("A", setA - setB),
    '01': ("B", setB - setA),
    '11': ("A ∩ B", setA & setB),
}
for region_id, (region_name, members) in region_contents.items():
    region_label = venn.get_label_by_id(region_id)
    # NOTE(review): get_label_by_id presumably returns None for a region that
    # was not drawn (e.g. an empty one) — confirm against matplotlib-venn.
    # Skipping only that case keeps the other labels and lets real errors surface.
    if region_label is None:
        continue
    region_label.set_text(
        f"{region_name}: {', '.join(map(str, members))}\n(Size: {len(members)})"
    )

# Display the plot
plt.title("Venn Diagram with Labels A, B, and A ∩ B")
Text(0.5, 1.0, 'Venn Diagram with Labels A, B, and A ∩ B')

Set_A = {1, 2, 3}
Set_B = {2, 3, 5}

# Union: every element that appears in Set_A, in Set_B, or in both
Union = Set_A | Set_B
print('Union:', Union)
Union: {1, 2, 3, 5}
# numpy
Set_A = np.array(list(Set_A))
Set_B = np.array(list(Set_B))

Union = np.union1d(Set_A, Set_B)
print('Union:', Union)
Union: [1 2 3 5]
# From scratch
a = np.array([1, 2, 3])
b = np.array([2, 3, 5])

union = a.copy()
for i in b:
    if i not in union:
        union = np.append(union, i)
print('Union:', union)
Union: [1 2 3 5]
union = []
for element in a:
    if element not in union:
        union.append(element)
for element in b:
    if element not in union:
        union.append(element)

union = np.array(union)
print('Union:', union)
Union: [1 2 3 5]
np.unique(np.concatenate([a, b]))
array([1, 2, 3, 5])
# Intersection
Set_A = set([1,2,3])
Set_B = set([2,3,5])
Intersection = Set_A.intersection(Set_B)
print('Intersection:', Intersection)
Intersection: {2, 3}
# Intersection using numpy
A = np.array([1, 2, 3])
B = np.array([2, 3, 5])

Intersection = np.intersect1d(A, B)
print('Intersection:', Intersection)
Intersection: [2 3]
# From scratch
intersection = []
for i in a:
    if i in b:
        intersection.append(i)
intersection = np.array(intersection)
print('Intersection:', intersection)
Intersection: [2 3]
# Difference
Difference = Set_A.difference(Set_B)
print('Difference:', Difference)
Difference: {1}
Set_B.difference(Set_A)
{5}
# Difference in numpy
Difference_A_B = np.setdiff1d(A, B)
print('Difference A/B:', Difference_A_B)
Difference_B_A = np.setdiff1d(B, A)
print('Difference B/A:', Difference_B_A)
Difference A/B: [1]
Difference B/A: [5]
# From scratch
difference_A_B = []
for i in a:
    if i not in b:
        difference_A_B.append(i)
difference_A_B = np.array(difference_A_B)
print('Difference A/B:', difference_A_B)
Difference A/B: [1]
def difference(A, B):
    """
    Collect the elements of A that do not occur in B, keeping A's order.

    A: numpy array 1d
    B: numpy array 1d

    Returns:
    numpy array 1d holding the elements that were kept
    """
    kept = [item for item in A if item not in B]
    return np.array(kept)
# Complement
universal_set = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

A = np.array([1, 2, 3])

complement_A = np.setdiff1d(universal_set, A)
print('Complement of A:', complement_A)
Complement of A: [ 4  5  6  7  8  9 10]
# Disjoint sets
Set_A = {1, 2, 3}
Set_B = {4, 5, 6}

Intersection = Set_A.intersection(Set_B)
print('Intersection:', Intersection)

# An empty set is falsy, so no explicit length check is needed.
# (Set_A.isdisjoint(Set_B) answers the same question without building the set.)
if not Intersection:
    print('Sets are disjoint')
else:
    print('Sets are not disjoint')
Intersection: set()
Sets are disjoint
from itertools import combinations

# Each row of the array is treated as one set.
collection_sets = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
collection_sets

# check if all sets are disjoint
# The collection is pairwise disjoint when no two rows share an element,
# i.e. every pair of rows has an empty intersection.
disjoint = all(
    np.intersect1d(first, second).size == 0
    for first, second in combinations(collection_sets, 2)
)
# Associative property of union:
# grouping does not matter, (A ∪ B) ∪ C = A ∪ (B ∪ C)

A, B, C = np.array([1, 2, 3]), np.array([2, 3, 4]), np.array([3, 4, 5])

# Left-hand side groups A with B first; right-hand side groups B with C first.
lhs = np.union1d(np.union1d(A, B), C)
rhs = np.union1d(A, np.union1d(B, C))

print('(A ∪ B) ∪ C:', lhs)
print('A ∪ (B ∪ C):', rhs)
(A ∪ B) ∪ C: [1 2 3 4 5]
A ∪ (B ∪ C): [1 2 3 4 5]
# Associative property of intersection 
# De Morgan's laws