scikit-learn #

Impute missing data #

Doc: 6.4. Imputation of missing values — scikit-learn 1.3.0 documentation

SimpleImputer #

from sklearn.impute import SimpleImputer

# count rows with at least one missing value
# ref: https://note.nkmk.me/en/python-pandas-nan-extract/
sum(df.isnull().any(axis=1))
# 230

# use imputer
imp_simple = SimpleImputer()   # defaults to strategy="mean"
imp_simple.fit(df)

imp_simple_df = pd.DataFrame(
    data = imp_simple.transform(df),
    columns = list(df.columns),
    index = list(df.index),
)

# count rows with at least one missing value
sum(imp_simple_df.isnull().any(axis=1))
# 0
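
SimpleImputer fills each column with its mean by default; the strategy argument switches this. A minimal sketch of the alternatives:

# median is more robust to outliers; most_frequent also works on categorical columns
imp_median = SimpleImputer(strategy="median")
imp_mode = SimpleImputer(strategy="most_frequent")

# constant fills every gap with a fixed value
imp_const = SimpleImputer(strategy="constant", fill_value=0)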

IterativeImputer #

# IterativeImputer is still experimental; this import must come first to enable it
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

# count rows with at least one missing value
sum(df.isnull().any(axis=1))
# 230

# use imputer
imp_iter = IterativeImputer(max_iter=10, random_state=0)
imp_iter.fit(df)

imp_iter_df = pd.DataFrame(
    data = imp_iter.transform(df),
    columns = list(df.columns),
    index = list(df.index),
)

# count rows with at least one missing value
sum(imp_iter_df.isnull().any(axis=1))
# 0
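
IterativeImputer models each feature that has missing values as a function of the other features, cycling through them for up to max_iter rounds. The default regressor is BayesianRidge; any sklearn regressor can be swapped in. A minimal sketch (not used above):

from sklearn.ensemble import ExtraTreesRegressor

imp_trees = IterativeImputer(
    estimator=ExtraTreesRegressor(n_estimators=10, random_state=0),
    max_iter=10,
    random_state=0,
)
imp_trees.fit(df)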

LASSO for variable selection #

Normal Lasso #

Step 0: All imports

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LassoCV
from sklearn.preprocessing import StandardScaler

Step 1: Split datasets

# Assuming dependent variable is df.columns[0]
features = df.columns[1:]
target = df.columns[0]

# Get X and y values
X = df[features].values
y = df[target].values

# Split training set and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print("The dimension of X_train is", X_train.shape)
print("The dimension of X_test is", X_train.shape)

Step 2: Normalise the independent variables

# scale features (fit the scaler on the training data only, to avoid leakage)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Step 3: Run LASSO

# Pick a penalty strength; larger alpha pushes more coefficients to zero
alpha = 0.001

# Fit Lasso with the chosen alpha
lasso = Lasso(alpha=alpha, random_state=0).fit(X_train, y_train)

# Get R2 values
lasso_r2_train = lasso.score(X_train,y_train)
lasso_r2_test  = lasso.score(X_test,y_test)
print("The train R2 for lasso model is", round(lasso_r2_train*100, 2))
print("The test R2 for lasso model is",  round(lasso_r2_test*100, 2))

Step 4: Get selection results

# Make spec dataframe
lasso_spec = pd.DataFrame()
lasso_spec["feature"] = features
lasso_spec["coef"] = lasso.coef_

# Inspect zero coefs
# Ref: https://stackoverflow.com/a/4588654
lasso_spec.iloc[np.where(lasso_spec["coef"]==0)[0]]
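
The kept features are simply the rows with nonzero coefficients; a minimal sketch:

# features with nonzero coefficients survived the selection
kept = list(lasso_spec.loc[lasso_spec["coef"] != 0, "feature"])
print("The number of selected independent variables is", len(kept))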

Lasso with Cross Validation #

Follow Steps 0-2 from the previous section, then:

Step 3: Run LASSO with CV

# Lasso cross validation
lasso_cv = LassoCV(cv=10, random_state=0).fit(X_train, y_train)

# Get R2 values
lasso_cv_r2_train = lasso_cv.score(X_train,y_train)
lasso_cv_r2_test  = lasso_cv.score(X_test,y_test)
print("The train R2 for lasso cv model is", round(lasso_cv_r2_train*100, 2))
print("The test R2 for lasso cv model is",  round(lasso_cv_r2_train*100, 2))

# Get optimal alpha
alpha_best = lasso_cv.alpha_
print("The optimal alpha is", alpha_best)

Step 4: Run LASSO again with best alpha

# Best Lasso model from cross validation
lasso_best = Lasso(alpha=alpha_best, random_state=0).fit(X_train, y_train)

# Get R2 values
lasso_best_r2_train = lasso_best.score(X_train,y_train)
lasso_best_r2_test  = lasso_best.score(X_test,y_test)
print("The train R2 for best lasso model is", round(lasso_best_r2_train*100, 2))
print("The test R2 for best lasso model is",  round(lasso_best_r2_test*100, 2))

Step 5: Get selection results

# Make spec dataframe
lasso_best_spec = pd.DataFrame()
lasso_best_spec["feature"] = features
lasso_best_spec["coef"] = lasso_best.coef_

# Inspect zero coefs
# Ref: https://stackoverflow.com/a/4588654
elim = list(lasso_best_spec.iloc[np.where(lasso_best_spec["coef"]==0)[0]]["feature"])

print(elim)
print("The number of eliminated independent variables is", len(elim))

Group Lasso with Cross Validation #

…to accommodate categorical variable dummies: Group Lasso keeps or drops all dummy columns of a categorical variable together, rather than selecting them individually.

Step 0: All imports

import numpy as np
import pandas as pd

from celer import GroupLasso, GroupLassoCV

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Step 0.5: Get dummies of categorical variable (strings)

df_dummy = pd.get_dummies(df["Rating"], prefix="Rating", drop_first=True)
n_dummy = len(df_dummy.columns)
  
df_group = df.drop(["Rating"], axis=1).join(    # Remove original rating data
    df_dummy,
    how="inner",
)
n_var = len(df_group.columns) - n_dummy - 1   # non-dummy features (excludes target and dummies)

Step 0.5.5: Get group list

# Ref: https://stackoverflow.com/a/14802726
group_list = [1] * n_var    # each non-dummy feature is its own group of size 1
group_list.append(n_dummy)  # ... and the dummy columns form one final group
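
celer reads a list of ints as sizes of contiguous feature blocks, so this relies on the dummy columns sitting last in df_group (which the join above guarantees). A quick sanity check:

# the block sizes should cover every feature column (all columns except the target)
assert sum(group_list) == len(df_group.columns) - 1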

Step 1: Split datasets

# Assuming dependent variable is df_group.columns[0]
features_group = df_group.columns[1:]
target_group = df_group.columns[0]

# Get X and y values
X_group = df_group[features_group].values
y_group = df_group[target_group].values

# Split training set and testing set
X_train_group, X_test_group, y_train_group, y_test_group = train_test_split(X_group, y_group, test_size=0.3, random_state=10)
print("The dimension of X_train_group is", X_train_group.shape)
print("The dimension of X_test_group is", X_train_group.shape)

Step 2: Normalise the independent variables

# scale features (fit the scaler on the training data only, to avoid leakage)
scaler = StandardScaler()
X_train_group = scaler.fit_transform(X_train_group)
X_test_group = scaler.transform(X_test_group)

Step 3: Run Group LASSO with CV

# Group Lasso cross validation
lasso_cv_group = GroupLassoCV(cv=10, groups=group_list).fit(X_train_group, y_train_group)

# Get R2 values
lasso_cv_group_r2_train = lasso_cv_group.score(X_train_group, y_train_group)
lasso_cv_group_r2_test  = lasso_cv_group.score(X_test_group, y_test_group)
print("The train R2 for group lasso cv model is", round(lasso_cv_group_r2_train*100, 2))
print("The test R2 for group lasso cv model is",  round(lasso_cv_group_r2_test*100, 2))

# Get optimal alpha
alpha_best_group = lasso_cv_group.alpha_
print("The optimal alpha is", alpha_best_group)

Step 4: Run Group LASSO again with best alpha

# Best Group Lasso model from cross validation
lasso_best_group = GroupLasso(alpha=alpha_best_group, groups=group_list).fit(X_train_group, y_train_group)

# Get R2 values
lasso_best_group_r2_train = lasso_best_group.score(X_train_group, y_train_group)
lasso_best_group_r2_test  = lasso_best_group.score(X_test_group, y_test_group)
print("The train R2 for best group lasso model is", round(lasso_best_group_r2_train*100, 2))
print("The test R2 for best group lasso model is",  round(lasso_best_group_r2_test*100, 2))

Step 5: Get selection results

# Make spec dataframe
lasso_best_group_spec = pd.DataFrame()
lasso_best_group_spec["feature"] = features_group
lasso_best_group_spec["coef"] = lasso_best_group.coef_

# Inspect zero coefs
# Ref: https://stackoverflow.com/a/4588654
elim = list(lasso_best_group_spec.iloc[np.where(lasso_best_group_spec["coef"]==0)[0]]["feature"])

print(elim)
print("The number of eliminated independent variables is", len(elim))

Principal Component Analysis (PCA) #

from sklearn.decomposition import PCA

# check shape of original data
imp_iter_df.shape
# (800, 50)

pca = PCA(n_components=2)   # keep the first two principal components

pca_df = pd.DataFrame(
    data = pca.fit_transform(imp_iter_df),
    columns = ["PC1", "PC2"],
    index = list(df.index),
)

# check shape of pca data
pca_df.shape
# (800, 2)
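
Whether two components are enough depends on how much variance they retain; explained_variance_ratio_ reports each component's share. (PCA is scale-sensitive, so standardising the data first, e.g. with StandardScaler, is common.)

# variance share captured by each principal component
print(pca.explained_variance_ratio_)
print("Total variance explained (%):", round(pca.explained_variance_ratio_.sum() * 100, 2))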