Source code for python.model.sklearn

import joblib
import numpy as np
import pandas as pd

from .base import  BaseModel, Task, _check

# `shap` is an optional dependency: within this file it is only referenced by
# `SklearnModel.explain`. If the package is not installed the ImportError is
# swallowed here and the name `shap` is simply left undefined in this module.
try:
    import shap
except ImportError:
    pass


class SklearnModel(BaseModel):
    """Class that handles the loaded model.

    This class can handle models that respect the scikit-learn API. This
    includes `sklearn.pipeline.Pipeline
    <https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html>`_.

    The data coming from a request is validated using the metadata stored
    with the model. The data fed to `predict`, `predict_proba`, `explain`
    and `preprocess` should be a dictionary that contains one key per
    feature, or a list of such dictionaries (records).
    Example: `{'feature1': 5, 'feature2': 'A', 'feature3': 10}`

    Args:
        file_name (str): File path of the serialized model. It must be a
            file that can be loaded using :mod:`joblib`.
    """

    family = 'SKLEARN_MODEL'

    # Class names of the predictors for which `explain` is supported
    _explainable_models = (
        # Sklearn
        'DecisionTreeClassifier', 'DecisionTreeRegressor',
        'RandomForestClassifier', 'RandomForestRegressor',
        # XGBoost
        'XGBClassifier', 'XGBRegressor', 'Booster',
        # CatBoost
        'CatBoostClassifier', 'CatBoostRegressor',
        # LightGBM
        'LGBMClassifier', 'LGBMRegressor')

    # Private
    def _load(self):
        """Load the serialized model file and hand its parts to `_hydrate`.

        The file is expected to hold a dict with a ``'model'`` and a
        ``'metadata'`` entry.
        """
        loaded = joblib.load(self._file_name)
        self._hydrate(loaded['model'], loaded['metadata'])

    @_check()
    def _get_predictor(self):
        """Return the innermost predictor wrapped by the loaded model."""
        return SklearnModel._extract_base_predictor(self._model)

    @_check(task='classification')
    def _get_class_names(self):
        """Return the predictor's ``classes_`` as a numpy array of strings."""
        return np.array(self._get_predictor().classes_, str)

    # Private (static)
    @staticmethod
    def _extract_base_predictor(model):
        """Recursively unwrap pipelines and calibrated classifiers.

        Args:
            model: Any object. Dispatch is done on its class name.

        Returns:
            The last step of a ``Pipeline``, or the estimator wrapped by a
            calibrated classifier, unwrapped recursively. Any other object
            is returned unchanged. ``None`` is returned when a calibrated
            classifier carries no wrapped estimator.
        """
        model_name = type(model).__name__
        if model_name == 'Pipeline':
            # The predictor is the estimator of the last (name, step) pair.
            return SklearnModel._extract_base_predictor(model.steps[-1][1])
        if 'CalibratedClassifier' in model_name:
            # scikit-learn renamed `base_estimator` to `estimator`
            # (deprecated in 1.2, removed in 1.4). Try the new name first and
            # fall back to the old one so both generations keep working.
            # During the deprecation window the unused attribute holds the
            # string sentinel 'deprecated', which must not be mistaken for an
            # estimator.
            # NOTE(review): this is the estimator as configured on the
            # wrapper; confirm it is the fitted one for the models served.
            for attr in ('estimator', 'base_estimator'):
                wrapped = getattr(model, attr, None)
                if wrapped is not None and not isinstance(wrapped, str):
                    return SklearnModel._extract_base_predictor(wrapped)
            return None
        return model

    # Public
    @_check()
    def preprocess(self, features):
        """Preprocess data.

        This function is used before prediction or interpretation.

        Args:
            features (dict): The expected object must contain one key per
                feature.
                Example: `{'feature1': 5, 'feature2': 'A', 'feature3': 10}`

        Returns:
            The validated input passed through the model's `transform`
            method when the model exposes one; otherwise the validated
            input itself.

        Raises:
            RuntimeError: If the model is not ready.
        """
        validated = self._validate(features)
        if hasattr(self._model, 'transform'):
            return self._model.transform(validated)
        return validated

    @_check()
    def predict(self, features):
        """Make a prediction.

        Prediction function that returns the predicted class. The returned
        value is an integer when the class names are not specified in the
        model's metadata.

        Args:
            features (dict): Record to be used as input data to make
                predictions. The expected object must contain one key per
                feature.
                Example: `{'feature1': 5, 'feature2': 'A', 'feature3': 10}`

        Returns:
            The output of the underlying model's `predict` for the
            validated input.

        Raises:
            RuntimeError: If the model is not ready.
        """
        validated = self._validate(features)
        return self._model.predict(validated)

    @_check(task='classification')
    def predict_proba(self, features):
        """Make a prediction.

        Prediction function that returns the probability of the predicted
        classes. The returned object contains one value per class. The keys
        of each dictionary are the classes of the model.

        Args:
            features (dict): Record to be used as input data to make
                predictions. The expected object must contain one key per
                feature.
                Example: `{'feature1': 5, 'feature2': 'A', 'feature3': 10}`

        Returns:
            list of dict: One dictionary per input record, mapping each
            class name to its predicted probability.

        Raises:
            RuntimeError: If the model isn't ready or the task isn't
                classification.
        """
        validated = self._validate(features)
        probabilities = self._model.predict_proba(validated)
        frame = pd.DataFrame(probabilities, columns=self._get_class_names())
        return frame.to_dict(orient='records')

    @_check(explainable=True)
    def explain(self, features, samples=None):
        """Explain the prediction of a model.

        Explanation function that returns the SHAP value for each feature.
        The returned object contains one value per feature of the model.

        If `samples` is not given, then the explanations are the raw output
        of the trees, which varies by model (for binary classification in
        XGBoost this is the log odds ratio). On the contrary, if `samples`
        is given, then the explanations are the output of the model
        transformed into probability space (note that this means the SHAP
        values now sum to the probability output of the model).
        See the `SHAP documentation
        <https://shap.readthedocs.io/en/latest/#shap.TreeExplainer>`_ for
        details.

        Args:
            features (dict): Record to be used as input data to explain the
                model. The expected object must contain one key per feature.
                Example: `{'feature1': 5, 'feature2': 'A', 'feature3': 10}`
            samples (dict): Records to be used as a sample pool for the
                explanations. It must have the same structure as the
                `features` parameter. According to the SHAP documentation,
                anywhere from 100 to 1000 random background samples are
                good sizes to use.

        Returns:
            For classification, a dict mapping each class name to a list
            with one `{feature: shap_value}` dictionary per input record.
            For regression, that list of dictionaries directly.

        Raises:
            RuntimeError: If the model is not ready.
            ValueError: If the model's predictor doesn't support SHAP
                explanations or the model is not already loaded. Or if the
                explainer outputs an unknown object.
        """
        # Process input
        preprocessed = self.preprocess(features)

        # Explainer parameters: without a background sample the raw tree
        # output is explained; with one, the probability output is.
        if samples is None:
            params = {
                'feature_dependence': 'tree_path_dependent',
                'model_output': 'margin'}
        else:
            params = {
                'data': self.preprocess(self._validate(samples)),
                'feature_dependence': 'independent',
                'model_output': 'probability'}

        # Explainer
        explainer = shap.TreeExplainer(self._get_predictor(), **params)
        colnames = self._feature_names()
        shap_values = explainer.shap_values(preprocessed[colnames].values)

        # Reuse the input index so multi-record inputs stay aligned
        index = preprocessed.index

        if not self._is_classification:
            # Regression: a single set of explanations
            return pd.DataFrame(
                shap_values, index=index, columns=colnames
            ).to_dict(orient='records')

        class_names = self._get_class_names()
        if isinstance(shap_values, list):
            # The result is one set of explanations per target class
            single_array = False
        elif (isinstance(shap_values, np.ndarray)
                and self._is_binary_classification):
            # The result is one ndarray set of explanations for one class.
            # Expected only for binary classification for some models.
            # Ex: LGBMClassifier
            single_array = True
        else:
            raise ValueError('Unknown objet class for shap_values variable')

        # Format output
        result = {}
        for position, class_name in enumerate(class_names):
            if single_array:
                # The single array is reported as-is for the second class
                # and with the sign flipped for the first one.
                class_values = shap_values * (-1 if position == 0 else 1)
            else:
                class_values = shap_values[position]
            result[class_name] = pd.DataFrame(
                class_values, index=index, columns=colnames
            ).to_dict(orient='records')
        return result