Tabular data

Covariate shift detection for tabular data

This example of feature drift detection for tabular data is inspired by https://docs.seldon.io/projects/alibi-detect/en/latest/examples/cd_online_wine.html

In [1]:

                
                    Copied!
                    
import sys

# Make the project root importable from this notebook.
# The path is relative, so it is resolved against the kernel's working directory.
PROJECT_ROOT = "../../"

# Guard the append so re-running the cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:

                
                    Copied!
                    
                        
                        
                    
                    

            
import numpy as np
import pandas as pd

# Source of the wine-quality CSV files; `{color}` is "red" or "white".
WINE_DATA_URL = "https://storage.googleapis.com/seldon-datasets/wine_quality/winequality-{color}.csv"
# Number of rows used for each split below.
N_SAMPLES = 400


def load_wine_features(color: str) -> pd.DataFrame:
    """Download one wine-quality CSV and return its feature columns as float64.

    Args:
        color: Which file to fetch; substituted into ``WINE_DATA_URL``
            ("red" or "white").

    Returns:
        The semicolon-separated CSV as a DataFrame with the ``quality``
        column dropped and every remaining column cast to ``np.float64``.
    """
    return (
        pd.read_csv(WINE_DATA_URL.format(color=color), sep=";")
        .drop(columns=["quality"])
        .astype(np.float64)
    )


white_all = load_wine_features("white")

# First N_SAMPLES white rows: the reference data the detector is fitted on.
white = white_all.iloc[:N_SAMPLES]
# Next N_SAMPLES white rows: held-out data from the same source.
white_test = white_all.iloc[N_SAMPLES : 2 * N_SAMPLES]
# First N_SAMPLES red rows: a different wine type, used as the shifted data.
red = load_wine_features("red").iloc[:N_SAMPLES]

In [3]:

                
                    Copied!
                    
from drifting import DriftingClient, DriftType

# Client used by the following cells to fit, load and query the detector.
# NOTE(review): constructed with default arguments; presumably this targets a
# locally running drifting server — confirm against the project setup.
client = DriftingClient()

# Name under which the detector is fitted, loaded and queried below.
detector_name = "WineDriftDetector"

In [4]:

                
                    Copied!
                    
# Fit the tabular drift detector on the white-wine reference split.
# Issued once only: with n_bootstraps=7000 this is likely the most expensive
# call in the notebook.
# NOTE(review): ert / window_size / n_bootstraps presumably correspond to the
# online-detector settings of alibi-detect (expected run time, test-window
# size, number of bootstrap simulations) — confirm against the drifting client.
client.fit(
    white,
    drift_type=DriftType.TABULAR,
    detector_name=detector_name,
    ert=400,
    window_size=40,
    n_bootstraps=7000,
)

Out[4]:

<Response [200]>

In [5]:

                
                    Copied!
                    
# Load the detector that was fitted above, identified by name, so it can be
# queried with client.predict. The call's return value is the cell output.
client.load(detector_name=detector_name)

Out[5]:

<Response [200]>

In [6]:

                
                    Copied!
                    
                        
                        
                    
                    

            
# Build the evaluation stream: red wine, then held-out white wine, then red
# wine again. The held-out white rows (white_test) come from the same file as
# the reference data but were not used for fitting, so they check that the
# detector did not overfit to the rows it was fitted on.
red_white_red = pd.concat([red, white_test, red], axis=0)

# drifts: positions in the stream at which the detector reported drift.
# test_stats: the test statistic returned for every position.
drifts, test_stats = [], []

# Feed the stream one row at a time; the detector reports after each sample
# whether there is a drift in the data. The stream is sent once only.
# NOTE(review): presumably the detector keeps state between predict calls, so
# re-running this cell without re-loading it would continue the previous
# stream — confirm.
for i in range(len(red_white_red)):
    # iloc[[i], :] keeps the sample as a one-row DataFrame (not a Series).
    sample = red_white_red.iloc[[i], :]
    is_drift, test_stat = client.predict(
        sample, drift_type=DriftType.TABULAR, detector_name=detector_name
    )
    test_stats.append(test_stat)
    if is_drift:
        drifts.append(i)

In [7]:

                
                    Copied!
                    
                        
                        
                    
                    

            
import matplotlib.pyplot as plt

# Segment boundaries of the stream (red, held-out white, red), derived from
# the data instead of hard-coded so they follow the split sizes.
white_start = len(red)
white_end = white_start + len(white_test)

fig, ax = plt.subplots(figsize=(7, 3), tight_layout=True)

# 0/1 indicator per stream position: 1 where the detector reported drift.
detections = np.zeros(len(red_white_red))
detections[drifts] = 1

ax.plot(detections, label="drift detected")
ax.plot(test_stats, color="orange", label="test statistic")

# Vertical lines mark where each segment of the stream starts.
ax.axvline(0, color="red", label="out of domain")
# This segment is white_test: in-domain rows that were NOT used for fitting.
ax.axvline(white_start, color="green", label="in domain (held-out)")
# Second red segment: left unlabelled so the legend has no duplicate entry.
ax.axvline(white_end, color="red")

ax.set_xlabel("sample index in stream")
ax.set_title("Online drift detection: red / held-out white / red")
ax.legend(loc="upper right")

Out[7]:

<matplotlib.legend.Legend at 0x14ae091c0>

We can see that the red wine rows were detected as drifted, and most of the white wine rows are correctly detected as not drifted.

In [ ]: