# Source code for python.model.base

import joblib
import numpy as np
import pandas as pd

from pandas.api.types import CategoricalDtype
from threading import Thread
from copy import deepcopy
from functools import wraps
from enum import Enum


# SHAP is an optional dependency: remember whether the import succeeded so
# that explanation support can be disabled when the package is missing
# (the flag is consulted when a model is hydrated).
try:
    import shap
except ImportError:
    SHAP_AVAILABLE = False
else:
    SHAP_AVAILABLE = True


def _check(ready=True, explainable=False, task=None):
    def actual_decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            self = args[0]
            # Check rediness
            if ready and not self.is_ready():
                raise RuntimeError('Model is not ready yet.')
            # Check explainable
            if explainable and not self._is_explainable:
                model_name = type(self._model).__name__
                raise ValueError('Model not supported for explanations: {}'.format(model_name))
            # Check for task
            if task is not None:
                self_task = self.task_type()
                if not getattr(self_task, '__ge__' if task.upper() == 'CLASSIFICATION' else '__eq__')(Task(task)):
                    raise RuntimeError('This method is not available for {} tasks'.format(self_task.name.lower()))
            # Execute function
            return func(*args, **kwargs)
        return wrapper
    return actual_decorator


class Task(int):
    """Integer-valued identifier of a model task, built from its name.

    The integer value of each task is taken from the class attribute called
    ``_<NAME>``. Because instances are integers they can be ordered: every
    classification flavour compares greater than or equal to
    ``Task('CLASSIFICATION')``.

    Args:
        name (str): Task name (case-insensitive): 'regression',
            'classification', 'binary_classification' or
            'multilabel_classification'.

    Attributes:
        name (str): Upper-cased task name.

    Raises:
        TypeError: If `name` is not a string.
        AttributeError: If `name` is not a known task name.
    """
    _REGRESSION, _CLASSIFICATION = 0, 1
    _BINARY_CLASSIFICATION, _MULTILABEL_CLASSIFICATION = 2, 3

    def __new__(cls, name):
        # Explicit check instead of `assert`, which is stripped under `-O`.
        if not isinstance(name, str):
            raise TypeError('Task name must be a string, got: {}'.format(type(name).__name__))
        try:
            val = getattr(cls, '_{}'.format(name.upper()))
        except AttributeError:
            raise AttributeError('Unknown task-name: {}'.format(name))
        else:
            return super(Task, cls).__new__(cls, val)

    def __init__(self, name):
        self.name = name.upper()
        self._id = int(self)

    def __getnewargs__(self):
        # `int.__getnewargs__` would hand the integer value back to
        # `__new__`, which only accepts names; return the name so that
        # copy/pickle can rebuild the instance.
        return (self.name,)

    def __repr__(self):
        return "Task('{}')".format(self.name)


class BaseModel(object):
    """Abstract class that handles the loaded model.

    The methods raising :class:`NotImplementedError` (`_load`,
    `_get_predictor`, `_get_class_names`, `preprocess`, `predict`,
    `predict_proba` and `explain`) are left for subclasses to implement.
    """
    # Name of the model family, reported by `info`
    family = ''
    # Explainable models: class names of the predictors that can be explained
    _explainable_models = tuple()

    def __init__(self, file_name):
        """Create an empty (not ready) model handler.

        Args:
            file_name: Location of the model. It is only stored here.
        """
        self._file_name = file_name
        self._is_ready = False
        self._model = None
        self._metadata = None
        self._task_type = None
        self._is_explainable = False

    # Abstract
    def _load(self):
        """Abstract method"""
        raise NotImplementedError()

    @_check()
    def _get_predictor(self):
        """Abstract method"""
        raise NotImplementedError()

    @_check(task='classification')
    def _get_class_names(self):
        """Abstract method"""
        raise NotImplementedError()

    @_check()
    def preprocess(self, features):
        """Abstract method"""
        raise NotImplementedError()

    @_check()
    def predict(self, features):
        """Abstract method"""
        raise NotImplementedError()

    @_check(task='classification')
    def predict_proba(self, features):
        """Abstract method"""
        raise NotImplementedError()

    @_check(explainable=True)
    def explain(self, features, samples=None):
        """Abstract method"""
        raise NotImplementedError()

    # Private
    def _hydrate(self, model, metadata):
        """Store the loaded objects and derive the cached model properties.

        Sets the explainability flag and the task type of the model and, when
        the predictor exposes `feature_importances_`, adds an `importance`
        entry to each record of ``metadata['features']`` (in place).

        Args:
            model: Loaded model object.
            metadata (dict): Model metadata.

        Raises:
            Exception: Whatever the hydration raises; in that case the model
                is flagged as not ready again before re-raising.
        """
        # Fill attributes
        self._model = model
        self._metadata = metadata
        # The flag is raised before hydrating because `_get_predictor` is
        # guarded by the readiness check.
        self._is_ready = True
        try:
            # Hydrate class
            clf = self._get_predictor()
            # SHAP
            model_name = type(clf).__name__
            self._is_explainable = SHAP_AVAILABLE and (model_name in self._explainable_models)
            # Feature importances
            if hasattr(clf, 'feature_importances_'):
                importance = clf.feature_importances_
                for imp, feat in zip(importance, metadata['features']):
                    feat['importance'] = imp
            # Set model task type
            if not hasattr(clf, 'classes_'):
                self._task_type = Task('REGRESSION')
            elif len(clf.classes_) <= 2:
                self._task_type = Task('BINARY_CLASSIFICATION')
            else:
                self._task_type = Task('MULTILABEL_CLASSIFICATION')
        except Exception:
            # Do not advertise a half-initialised model as ready
            self._is_ready = False
            raise

    @_check()
    def _feature_names(self):
        """Return the names of the model features, in metadata order."""
        return [variable['name'] for variable in self.features()]

    @_check()
    def _validate(self, input):
        """Build a typed :class:`pandas.DataFrame` from the raw input.

        Args:
            input: A single sample or a list-like of samples.

        Returns:
            pandas.DataFrame:
                One row per sample and one column per feature declared in
                the metadata, each converted to its declared type.

        Raises:
            AttributeError: If the metadata has no 'features' key.
            ValueError: If a feature declares an unknown type (this includes
                'category' features without 'categories').
            RuntimeError: If the model is not ready.
        """
        features = self.metadata.get('features')
        if features is None:
            raise AttributeError("Missing key 'features' in model's metadata")

        # Ensure input is list-like shaped
        input = self._get_list_from(input)
        # Get feature names in order
        feature_names = [f['name'] for f in features]
        # Create an index to handle multiple samples input
        index = list(range(len(input)))
        # Create DataFrame
        df = pd.DataFrame(input, index=index, columns=feature_names)
        # Convert features to expected types
        for feature in features:
            name, var_type = feature['name'], feature['type']
            default = feature.get('default', None)
            categories = feature.get('categories', None)
            if name not in df.columns:
                # `is None` (not truthiness) so that falsy defaults such as
                # 0 or '' are honoured.
                df[name] = np.nan if default is None else default
            else:
                if var_type == 'numeric':
                    var_type = float
                elif var_type == 'string':
                    var_type = str
                elif (var_type == 'category') and (categories is not None):
                    var_type = CategoricalDtype(categories=categories,
                                                ordered=True)
                else:
                    msg = 'Unknown variable type: {}'.format(var_type)
                    raise ValueError(msg)

                column = df[name]
                if default is not None:
                    column = column.fillna(default)
                df[name] = column.astype(var_type)
            # TO DO: add more validation logic
        return df

    @property
    @_check()
    def _is_classification(self):
        """bool: Whether the task is any kind of classification."""
        return self._task_type >= Task('CLASSIFICATION')

    @property
    @_check()
    def _is_binary_classification(self):
        """bool: Whether the task is a binary classification."""
        return self._task_type == Task('BINARY_CLASSIFICATION')

    @property
    @_check()
    def _is_multilabel_classification(self):
        """bool: Whether the task is a multilabel classification."""
        return self._task_type == Task('MULTILABEL_CLASSIFICATION')

    @property
    @_check()
    def _is_regression(self):
        """bool: Whether the task is a regression."""
        return self._task_type == Task('REGRESSION')

    # Private (static)
    @staticmethod
    def _get_list_from(data):
        """Return `data` unchanged if it is list-like, else wrapped in a list.

        A :class:`dict` is treated as a single sample and therefore wrapped.
        """
        # The dict test comes first: pandas considers dicts list-like.
        if isinstance(data, dict) or not pd.api.types.is_list_like(data):
            return [data]
        return data

    # Public
    def load(self):
        """Launch model loading in a separate thread.

        Once it finishes, the instance `_is_ready` parameter is set to `True`.

        The loaded object is expected to be a :class:`dict` containing the
        following keys: `model` (model object) and `metadata` (:class:`dict`).
        The latter contains one or two elements: `features`
        (:class:`list` of :class:`dict`) with at least the `name` and `type` of
        the variables and optional `class_names` (:class:`list` of :class:`str`)
        with the list of class-names in order (for classification).
        """
        Thread(target=self._load).start()

    def is_ready(self):
        """Check if model is already loaded.

        Returns:
            bool:
                Is the model already loaded and ready for predictions?
        """
        return self._is_ready

    @property
    @_check()
    def metadata(self):
        """Get metadata of the model.

        Returns:
            dict:
                Metadata of the model containing information about the features
                and classes (optional)

        Raises:
            RuntimeError: If the model is not ready.
        """
        return self._metadata

    @_check()
    def task_type(self, as_text=False):
        """Get task type of the model

        Either 'REGRESSION', 'BINARY_CLASSIFICATION' or
        'MULTILABEL_CLASSIFICATION'.

        Args:
            as_text (bool): Return the name of the task instead of the
                :class:`Task` instance.

        Returns:
            :class:`Task` or :class:`str`:
                If `as_text=False`, returns the task of the model
                (classification, regression, etc.) as a :class:`Task` class
                instance. If `as_text=True`, returns the task of the model as
                text.

        Raises:
            RuntimeError: If the model is not ready.
        """
        return self._task_type.name if as_text else self._task_type

    @_check()
    def features(self):
        """Get the features of the model

        The returned list contains records. Each record contains (at least)
        the `name` and `type` of the variable. If the model supports
        feature importances calculation (if the classifier has
        `feature_importances_` attribute), they will also be present.

        The list is a deep copy, so it can be modified without affecting the
        metadata of the model.

        Returns:
            list[dict]:
                Model features.

        Raises:
            RuntimeError: If the model is not ready.
        """
        return deepcopy(self.metadata['features'])

    @property
    @_check()
    def info(self):
        """Get model information.

        This function gives complete description of the model.
        The returned object contains the following keys:

            metadata (:class:`dict`): Model metadata (see :func:`~python.model.base.BaseModel.metadata`).

            model (:class:`dict`): Context information of the learnt model.
                type (:class:`str`):
                    Type of the underlying model object.
                predictor_type (:class:`str`):
                    It could be the same as 'type'. However, for
                    `sklearn.pipeline.Pipeline <https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html>`_
                    it will output the class of the predictor inside it.
                is_explainable (:class:`bool`):
                    `True` if the model class allows SHAP explanations to be
                    computed.
                task (:class:`str`):
                    Task type. Either 'BINARY_CLASSIFICATION',
                    'MULTILABEL_CLASSIFICATION' or 'REGRESSION'
                family (:class:`str`):
                    Value of the `family` class attribute.
                class_names (:class:`list` or :class:`None`):
                    Class names if defined (for classification only).

        Returns:
            dict:
                Information about the model.

        Raises:
            RuntimeError: If the model is not ready.
        """
        result = {}
        # Metadata
        result['metadata'] = self._metadata
        # Info from model
        model_info = {
            'type': str(type(self._model)),
            'predictor_type': str(type(self._get_predictor())),
            'is_explainable': self._is_explainable,
            'task': self.task_type(as_text=True),
            'family': self.family
        }
        if self._is_classification:
            model_info['class_names'] = self._get_class_names()
        result['model'] = model_info
        return result