So happy to manage questions 1 to 3…but 4 ?
running my code, I receive a “nan” at the end
I understood there are null values in dataset but was not able to fix this.
import pandas as pd
ames_housing = pd.read_csv("../datasets/house_prices.csv", na_values="?")
target_name = "SalePrice"
data = ames_housing.drop(columns=target_name)
target = ames_housing[target_name]
numerical_features = [
"LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
"BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
"GrLivArea", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
"GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
"3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]
data_numerical = data[numerical_features]
from sklearn.compose import make_column_selector as selector
categorical_columns_selector = selector(dtype_include=object)
data_categorical = categorical_columns_selector(data)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = SimpleImputer(strategy=“mean”)
from sklearn.compose import ColumnTransformer
preprocessor = ColumnTransformer([
('one-hot-encoder', categorical_preprocessor, data_categorical),
('simpleimputer', numerical_preprocessor, data_numerical)])
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
model = Pipeline([
("preprocessor", preprocessor),
("classifier",
DecisionTreeRegressor(random_state=0))])
from sklearn.model_selection import cross_validate
cv_results_tree = cross_validate(
model, data, target, cv=10, n_jobs=2
)
cv_results_tree["test_score"].mean()