The result I got for Q6 was:
The model using all features performed better than the model using only numerical features in 5 out of 10 cross-validation folds.
I used the code supplied in the solutions to Q5 and Q6 verbatim — literally cut and pasted it. I still got Q6 wrong, and I don't know why, so there is no opportunity for me to learn from this.
Here’s my complete code:
import pandas as pd

# Load the Ames housing dataset (a variant with missing values already
# handled upstream).
ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")

# Split predictors from the target column.
target_name = "SalePrice"
data = ames_housing.drop(columns=target_name)
target = ames_housing[target_name]

# Turn the regression target into a binary classification problem:
# 1 when the house sold for more than $200,000, else 0.
target = (target > 200_000).astype(int)
from sklearn.compose import make_column_selector as selector

# Partition the columns by dtype: object columns are treated as
# categorical, everything else as numerical.
numerical_features = selector(dtype_exclude=object)(data)
categorical_features = selector(dtype_include=object)(data)

# View of the data restricted to the numerical columns only.
data_numerical = data[numerical_features]
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn import set_config

# Render estimators as HTML diagrams in notebook output.
set_config(display='diagram')

# NOTE(review): the original cell re-imported StandardScaler,
# LogisticRegression, make_pipeline and cross_validate, and re-created
# data_numerical — all exact duplicates of lines above; removed.

# Baseline model: standardize the numerical features, then fit a logistic
# regression (scaling helps the solver converge).
model = make_pipeline(StandardScaler(), LogisticRegression())

# 10-fold cross-validation using only the numerical columns.
cv_results_num = cross_validate(model, data_numerical, target, cv=10)

# The original ended with a bare `...mean()` expression, whose value is
# silently discarded when run as a plain script — print it instead.
print("Mean accuracy (numerical features only):",
      cv_results_num["test_score"].mean())
from sklearn.compose import make_column_transformer

# Categorical columns are everything that is not numerical.  Note that
# Index.difference returns the names sorted alphabetically; that is fine
# for a ColumnTransformer, which selects columns by name, not position.
categorical_features = data.columns.difference(numerical_features)

categorical_processor = OneHotEncoder(handle_unknown="ignore")
numerical_processor = StandardScaler()

# One-hot encode the categorical columns and standardize the numerical
# ones before the classifier.  max_iter is raised because the much wider
# one-hot encoded design matrix needs more iterations to converge.
preprocessor = make_column_transformer(
    (categorical_processor, categorical_features),
    (numerical_processor, numerical_features),
)
model = make_pipeline(preprocessor, LogisticRegression(max_iter=1_000))

# 10-fold cross-validation using all features.
cv_results_all = cross_validate(model, data, target, cv=10)

# Print the mean instead of leaving a bare expression whose value is
# discarded when run as a script (OneHotEncoder re-import also removed:
# it is already imported above).
print("Mean accuracy (all features):", cv_results_all["test_score"].mean())
# Count the folds in which the all-features model beats the numerical-only
# model.  The original also computed the REVERSED comparison
# (num > all) on its own line and discarded it — a dead expression that
# invites misreading which direction is being counted; removed.
wins_all = sum(cv_results_num['test_score'] < cv_results_all['test_score'])
print("The model using all features is performing better "
      f"{wins_all} "
      "times out of 10 than the model using only numerical features.")