Yes, here
import pandas as pd
# Read the census data and immediately discard the duplicated column
# `"education-num"`, as stated in the first notebook.
adult_census = pd.read_csv("../datasets/adult-census.csv").drop(
    columns="education-num"
)

# Separate the column to predict from the explanatory columns.
target_name = "class"
data = adult_census.drop(columns=[target_name])
target = adult_census[target_name]

# Print a summary (column names, non-null counts, dtypes) of the features.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 48842 non-null int64
1 workclass 48842 non-null object
2 education 48842 non-null object
3 marital-status 48842 non-null object
4 occupation 48842 non-null object
5 relationship 48842 non-null object
6 race 48842 non-null object
7 sex 48842 non-null object
8 capital-gain 48842 non-null int64
9 capital-loss 48842 non-null int64
10 hours-per-week 48842 non-null int64
11 native-country 48842 non-null object
dtypes: int64(4), object(8)
memory usage: 4.5+ MB
from sklearn.compose import make_column_selector as selector
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
# Split the columns by dtype: `object` columns go to the categorical group,
# every other column goes to the numerical group.
pick_categorical = selector(dtype_include=object)
pick_numerical = selector(dtype_exclude=object)
categorical_columns = pick_categorical(data)
numerical_columns = pick_numerical(data)

# One-hot encode the categorical columns (categories unseen at fit time are
# ignored at transform time) and standardize the numerical ones.
one_hot_step = (OneHotEncoder(handle_unknown="ignore"), categorical_columns)
scaling_step = (StandardScaler(), numerical_columns)
preprocessor = make_column_transformer(one_hot_step, scaling_step)

# Chain the preprocessing with a logistic regression classifier.
classifier = LogisticRegression(max_iter=5000)
model = make_pipeline(preprocessor, classifier)

# 10-fold cross-validation; the fitted pipelines are kept so that their
# coefficients can be inspected afterwards.
cv_results = cross_validate(
    model,
    data,
    target,
    cv=10,
    return_estimator=True,
    n_jobs=2,
)
# Mean test score over the folds (displayed as the cell result).
cv_results["test_score"].mean()
0.8515212248630878
# Fit the preprocessor on the full dataset so that the one-hot encoder can
# report the names of the columns it generates.
preprocessor.fit(data)
encoder = preprocessor.named_transformers_["onehotencoder"]

# `OneHotEncoder.get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2; use `get_feature_names_out` when it exists and fall back to
# the legacy method only on older releases.
if hasattr(encoder, "get_feature_names_out"):
    encoded_names = encoder.get_feature_names_out(categorical_columns)
else:
    encoded_names = encoder.get_feature_names(categorical_columns)

# One-hot encoded names first, then the numerical column names: the same
# order in which the transformers were passed to `make_column_transformer`.
feature_names = encoded_names.tolist()
feature_names += numerical_columns
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import seaborn as sns
# Gather the logistic-regression coefficients of the pipeline fitted on each
# cross-validation fold (the classifier is the last step of the pipeline).
# `coef_` is 2-D -- (1, n_features) for a binary target, which is assumed
# here (TODO confirm `class` has two values) -- so row 0 is taken to get one
# flat vector per fold. Without it the list stacks into a 3-D array and
# `pd.DataFrame` raises "Must pass 2-d input".
coefs = [estimator[-1].coef_[0] for estimator in cv_results["estimator"]]

# One row per fold, one column per feature.
coefs = pd.DataFrame(coefs, columns=feature_names)

# Smallest and largest value of every coefficient across the folds.
coefs.describe().loc[["min", "max"]]
That is all