
thoth.utils

ScikitModel

Bases: Protocol

A protocol for compatible scikit-learn models

See the scikit-learn documentation for details: https://scikit-learn.org/stable/developers/develop.html#apis-of-scikit-learn-objects

Source code in thoth/utils.py
class ScikitModel(Protocol):
    """A protocol for compatible scikit-learn models

    See the scikit-learn documentation for details
    [here](https://scikit-learn.org/stable/developers/develop.html#apis-of-scikit-learn-objects)
    """

    def fit(
        self: "ScikitModelT",
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.Series, pd.DataFrame, np.ndarray],
    ) -> "ScikitModelT":
        """Fits the model to a dataset"""
        ...

    def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """Predict with the model on a dataset"""
        ...

fit(X, y)

Fits the model to a dataset

Source code in thoth/utils.py
def fit(
    self: "ScikitModelT",
    X: Union[pd.DataFrame, np.ndarray],
    y: Union[pd.Series, pd.DataFrame, np.ndarray],
) -> "ScikitModelT":
    """Fits the model to a dataset"""
    ...

predict(X)

Predict with the model on a dataset

Source code in thoth/utils.py
def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
    """Predict with the model on a dataset"""
    ...
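
Because ScikitModel is a structural Protocol, any estimator that exposes matching fit and predict methods conforms without subclassing. A minimal sketch, assuming scikit-learn is installed; the fit_and_predict helper and the random data below are illustrative, not part of thoth:

import numpy as np
from sklearn.linear_model import LogisticRegression

from thoth.utils import ScikitModel


def fit_and_predict(model: ScikitModel, X: np.ndarray, y: np.ndarray) -> np.ndarray:
    # Any scikit-learn style estimator satisfies the protocol structurally.
    return model.fit(X, y).predict(X)


rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
y = rng.integers(0, 2, size=50)
predictions = fit_and_predict(LogisticRegression(), X, y)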

get_metrics(clf, x, y)

Evaluate the performance of a scikit-learn predictor on a given dataset

Parameters:

    clf (ScikitModel): The trained classifier to evaluate. Required.
    x (pd.DataFrame): The input data. Required.
    y (pd.Series): Labels for each sample in the input data. Required.

Returns:

    pd.DataFrame: A DataFrame containing the Precision, Recall and F1 scores.
        Macro average is used for multiclass datasets, and micro average is
        used for binary classification.

Source code in thoth/utils.py
@st.cache
def get_metrics(
    clf: ScikitModel,
    x: Union[pd.DataFrame, np.ndarray],
    y: Union[pd.Series, np.ndarray],
) -> pd.DataFrame:
    """Evaluate the performance of a scikit-learn predictor on a given dataset

    Args:
        clf: The trained classifier to evaluate
        x (pd.DataFrame): The input data
        y (pd.Series): Labels for each sample in the input data

    Returns:
        pd.DataFrame: A DataFrame containing the Precision, Recall and F1 scores.
            Macro average is used for multiclass datasets, and micro average is used
            for binary classification.
    """
    average = "macro" if len(np.unique(y)) > 2 else "micro"
    metrics = {
        "Precision": precision_score(y, clf.predict(x), average=average),
        "Recall": recall_score(y, clf.predict(x), average=average),
        "F1": f1_score(y, clf.predict(x), average=average),
    }
    return pd.DataFrame(metrics, index=[0])
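
A usage sketch for get_metrics; the classifier and dataset below are placeholders, and since the function is wrapped in st.cache it is normally called from within the Streamlit app:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from thoth.utils import get_metrics

X, y = load_iris(return_X_y=True, as_frame=True)
clf = LogisticRegression(max_iter=1000).fit(X, y)

# Iris has three classes, so the macro-averaged scores are reported.
metrics = get_metrics(clf, X, y)
print(metrics)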

load_process_data(dataset_name)

Loads and formats a dataset based on its name

Parameters:

    dataset_name (str): The name of the dataset to load and process. Required.

Returns:

    Tuple[dict, pd.DataFrame]: A tuple of the dataset metadata dict and the data.

Source code in thoth/utils.py
@st.cache(show_spinner=False)
def load_process_data(dataset_name: str) -> Tuple[dict, pd.DataFrame]:
    """Loads and formats a dataset based on its name

    Args:
        dataset_name (str): The name of the dataset to load and process

    Returns:
        Tuple[dict, pd.DataFrame]: A tuple of the dataset metadata dict and the data
    """
    dataloaders = {
        "Breast Cancer": load_breast_cancer,
        "Iris": load_iris,
        "Wine": load_wine,
    }
    dataloader = dataloaders[dataset_name]
    dataset = cast(Dict[str, Any], dataloader())
    dataset["DESCR"] = dataset["DESCR"].split(":", 1)[1]
    data = pd.DataFrame(dataset.pop("data"), columns=dataset["feature_names"])
    labels = pd.Series(dataset.pop("target")).map(
        dict(enumerate(dataset["target_names"]))
    )
    data = pd.DataFrame(labels, columns=["label"]).join(data)
    return (
        dataset,
        data,
    )
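
A usage sketch showing how the returned pair might be unpacked; the variable names are illustrative:

from thoth.utils import load_process_data

metadata, data = load_process_data("Wine")

# metadata keeps keys such as "DESCR", "feature_names" and "target_names",
# while data holds a "label" column followed by the feature columns.
features = data.drop(columns=["label"])
labels = data["label"]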

train_model(model, params, train_x, train_y)

Initialise and train a given scikit-learn model with the provided parameters and data

Parameters:

    model (Type[ScikitModelT]): The model architecture to use. Required.
    params (dict): A parameter dictionary containing parameter_name: value pairs. Required.
    train_x (pd.DataFrame): The training data; should be of shape (n_samples, n_features). Required.
    train_y (pd.Series): The training labels; should be of shape (n_samples,). Required.

Returns:

    model (ScikitModelT): The trained model.

Source code in thoth/utils.py
@st.cache
def train_model(
    model: Type[ScikitModelT],
    params: Dict[str, Any],
    train_x: Union[pd.DataFrame, np.ndarray],
    train_y: Union[pd.Series, np.ndarray],
) -> ScikitModelT:
    """Initialise and train a given scikit-learn model with the provided parameters and data

    Args:
        model: The model architecture to use
        params (dict): A parameter dictionary containing parameter_name: value pairs
        train_x (pd.DataFrame): The training data, should be of shape (n_samples, n_features)
        train_y (pd.Series): The training labels, should be of shape (n_samples)

    Returns:
        model: The trained model
    """
    return model(**params).fit(train_x, train_y)  # type: ignore
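
A usage sketch combining load_process_data and train_model; the estimator class and parameter values below are illustrative choices, not defaults from thoth:

from sklearn.ensemble import RandomForestClassifier

from thoth.utils import load_process_data, train_model

_, data = load_process_data("Iris")
train_x = data.drop(columns=["label"])
train_y = data["label"]

# The estimator class is instantiated with the given parameters, then fitted.
clf = train_model(
    RandomForestClassifier,
    {"n_estimators": 100, "max_depth": 3},
    train_x,
    train_y,
)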