Question 6 Cross Validation Issue

Alvin19 · 6 June 2021 07:23

Hi all,

I have an error.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in
----> 1 cv_result4 = cross_validate(model4, data, target, error_score='raise')

/opt/conda/lib/python3.9/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     61             extra_args = len(args) - len(all_args)
     62             if extra_args <= 0:
---> 63                 return f(*args, **kwargs)
     64 
     65             # extra_args > 0

/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
    248     parallel = Parallel(n_jobs=n_jobs, verbose=verbose,
    249                         pre_dispatch=pre_dispatch)
--> 250     results = parallel(
    251         delayed(_fit_and_score)(
    252             clone(estimator), X, y, scorers, train, test, verbose, None,

/opt/conda/lib/python3.9/site-packages/joblib/parallel.py in __call__(self, iterable)
   1039             # remaining jobs.
   1040             self._iterating = False
-> 1041             if self.dispatch_one_batch(iterator):
   1042                 self._iterating = self._original_iterator is not None
   1043 

/opt/conda/lib/python3.9/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
    857                 return False
    858             else:
--> 859                 self._dispatch(tasks)
    860                 return True
    861 

/opt/conda/lib/python3.9/site-packages/joblib/parallel.py in _dispatch(self, batch)
    775         with self._lock:
    776             job_idx = len(self._jobs)
--> 777             job = self._backend.apply_async(batch, callback=cb)
    778             # A job can complete so quickly than its callback is
    779             # called before we get here, causing self._jobs to

/opt/conda/lib/python3.9/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
    206     def apply_async(self, func, callback=None):
    207         """Schedule a func to be run"""
--> 208         result = ImmediateResult(func)
    209         if callback:
    210             callback(result)

/opt/conda/lib/python3.9/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
    570         # Don't delay the application, to avoid keeping the input
    571         # arguments in memory
--> 572         self.results = batch()
    573 
    574     def get(self):

/opt/conda/lib/python3.9/site-packages/joblib/parallel.py in __call__(self)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

/opt/conda/lib/python3.9/site-packages/joblib/parallel.py in <listcomp>(.0)
    260         # change the default number of processes to -1
    261         with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262             return [func(*args, **kwargs)
    263                     for func, args, kwargs in self.items]
    264 

/opt/conda/lib/python3.9/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
    220     def __call__(self, *args, **kwargs):
    221         with config_context(**self.config):
--> 222             return self.function(*args, **kwargs)

/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    596             estimator.fit(X_train, **fit_params)
    597         else:
--> 598             estimator.fit(X_train, y_train, **fit_params)
    599 
    600     except Exception as e:

/opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
    339         """
    340         fit_params_steps = self._check_fit_params(**fit_params)
--> 341         Xt = self._fit(X, y, **fit_params_steps)
    342         with _print_elapsed_time('Pipeline',
    343                                  self._log_message(len(self.steps) - 1)):

/opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    301                 cloned_transformer = clone(transformer)
    302             # Fit or load from cache the current transformer
--> 303             X, fitted_transformer = fit_transform_one_cached(
    304                 cloned_transformer, X, y, None,
    305                 message_clsname='Pipeline',

/opt/conda/lib/python3.9/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

/opt/conda/lib/python3.9/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    752     with _print_elapsed_time(message_clsname, message):
    753         if hasattr(transformer, 'fit_transform'):
--> 754             res = transformer.fit_transform(X, y, **fit_params)
    755         else:
    756             res = transformer.fit(X, y, **fit_params).transform(X)

/opt/conda/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
    503         self._validate_transformers()
    504         self._validate_column_callables(X)
--> 505         self._validate_remainder(X)
    506 
    507         result = self._fit_transform(X, y, _fit_transform_one)

/opt/conda/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in _validate_remainder(self, X)
    322 
    323         # Make it possible to check for reordered named columns on transform
--> 324         self._has_str_cols = any(_determine_key_type(cols) == 'str'
    325                                  for cols in self._columns)
    326         if hasattr(X, 'columns'):

/opt/conda/lib/python3.9/site-packages/sklearn/compose/_column_transformer.py in <genexpr>(.0)
    322 
    323         # Make it possible to check for reordered named columns on transform
--> 324         self._has_str_cols = any(_determine_key_type(cols) == 'str'
    325                                  for cols in self._columns)
    326         if hasattr(X, 'columns'):

/opt/conda/lib/python3.9/site-packages/sklearn/utils/__init__.py in _determine_key_type(key, accept_slice)
    266         except KeyError:
    267             raise ValueError(err_msg)
--> 268     raise ValueError(err_msg)
    269 
    270 

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed

I have ensured that numerical_fea and categorical_fea are in DataFrame format.

from sklearn.compose import make_column_selector as selector, ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

categorical = selector(dtype_include=object)
categorical_fea = categorical(data)

numerical = selector(dtype_exclude=object)
numerical_fea = numerical(data)

numerical_fea_df = pd.DataFrame(data[numerical_fea])
numerical_fea_df.head(2)

categorical_fea_df = pd.DataFrame(data[categorical_fea])
categorical_fea_df.head(2)

numerical_preprocessor = make_pipeline(StandardScaler(), SimpleImputer())

categorical_preprocessor = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

preprocessor = ColumnTransformer(transformers=[('nume', numerical_preprocessor, numerical_fea_df), ('cate', categorical_preprocessor, categorical_fea_df)])

model4 = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))

cv_result4 = cross_validate(model4, data, target, error_score='raise')

Can anyone let me know where the error is and how I can fix it so that the cross-validation step runs?

ThomasLoock · 6 June 2021 10:20

Hi Alvin,

you need to rethink the creation of your categorical features.
You already have the variable “numerical_features”, so no need to recreate this.
In Q6 it is said “the left-out columns should be treated as categorical variables”
so you need to compute the complement or the difference of the already known numerical features to get the wanted result.

But the cause of your error is creating new DataFrames using the features.

glemaitre58 · 6 June 2021 19:52

I did not look specifically at which columns you are using. I will only comment on the error raised by scikit-learn.

You are passing numerical_fea_df and categorical_fea_df to ColumnTransformer. However, ColumnTransformer does not expect dataframes containing the data, but only the names of the columns (scikit-learn will take care of selecting the columns).

So replacing the above variables with numerical_fea and categorical_fea should work (if you properly selected the right column names).

Alvin19 · 7 June 2021 05:31

Thanks @ThomasLoock and @glemaitre58

I have followed the question's requirement and obtained the categorical columns using the .columns.difference(numerical_column) method. The result is displayed in data frame format, so for cross-validation I convert it to a list with list(numerical_column).

So, it is like this:

preprocessor = ColumnTransformer(transformers=[('num', num_preprocessor, list(num_features)), ('cat', cat_preprocessor, cat_features)])

May I know what the purpose of computing the standard deviation is, given that the output is 0.0?

3*score_all.std()

Lastly, will we be able to access all the course information and Jupyter notebooks after this course has ended?

glemaitre58 · 7 June 2021 08:24

I would not expect the standard deviation to be 0 indeed.

I am linking to the answer to a similar question: Access to the course - #3 by lfarhi

Alvin19 · 7 June 2021 08:35

@glemaitre58 Can you point out where the error might be that results in a standard deviation of 0.0?

Here is my code:

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from sklearn. pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
 
cat_features = data.columns.difference(list(num_features)) 

num_preprocessor = make_pipeline(StandardScaler(), SimpleImputer())
cat_preprocessor = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore'))

preprocessor = ColumnTransformer(transformers=[('num', num_preprocessor, list(num_features)), ('cat', cat_preprocessor, cat_features)]) 
 
model = make_pipeline(preprocessor, LogisticRegression(max_iter=1000))
 
cv_result5 = cross_validate(model, data, target, error_score='raise')
score_all = cv_result5['test_score'].mean()
# output is 0.923972602739726

improvement_result = score_all - cv_results_num["test_score"].mean()
# output is 0.004109589041095929

3*score_all.std()
# output is 0.0

glemaitre58 · 7 June 2021 08:40

The issue is that score_all corresponds to the mean value already:

So this is a single number, and thus its std. dev. is 0. I assume that you wanted to compute instead:

cv_result5["test_score"].std()

Alvin19 · 7 June 2021 08:57

@glemaitre58, I see what you mean: I was computing the standard deviation of the mean.
After updating the code, I got 0.030059052407522862.

The lower the standard deviation, the better, since most of the data lies around the average value and the dataset can therefore give reliable insight. Is this why we want to compute the standard deviation?

glemaitre58 · 7 June 2021 09:13

Yes. It gives a piece of information regarding the distribution of the scores.

Alvin19 · 7 June 2021 09:18

@glemaitre58 Thank you very much for your guidance.