
thoth.utils

ScikitModel

Bases: Protocol

A protocol for compatible scikit-learn models

See the scikit-learn documentation for details: https://scikit-learn.org/stable/developers/develop.html#apis-of-scikit-learn-objects

Source code in thoth/utils.py
class ScikitModel(Protocol):
    """A protocol for compatible scikit-learn models

    See the scikit-learn documentation for details
    [here](https://scikit-learn.org/stable/developers/develop.html#apis-of-scikit-learn-objects)
    """

    def fit(
        self: "ScikitModelT",
        X: Union[pd.DataFrame, np.ndarray],
        y: Union[pd.Series, pd.DataFrame, np.ndarray],
    ) -> "ScikitModelT":
        """Fits the model to a dataset"""
        ...

    def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
        """Predict with the model on a dataset"""
        ...

fit(X, y)

Fits the model to a dataset

Source code in thoth/utils.py
def fit(
    self: "ScikitModelT",
    X: Union[pd.DataFrame, np.ndarray],
    y: Union[pd.Series, pd.DataFrame, np.ndarray],
) -> "ScikitModelT":
    """Fits the model to a dataset"""
    ...

predict(X)

Predict with the model on a dataset

Source code in thoth/utils.py
def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray:
    """Predict with the model on a dataset"""
    ...
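
Because ScikitModel is a structural Protocol, any estimator that exposes matching fit and predict methods conforms without subclassing. A minimal sketch, assuming scikit-learn is installed; the fit_and_predict helper and the random data below are illustrative, not part of thoth:

import numpy as np
from sklearn.linear_model import LogisticRegression

from thoth.utils import ScikitModel


def fit_and_predict(model: ScikitModel, X: np.ndarray, y: np.ndarray) -> np.ndarray:
    # Any scikit-learn style estimator satisfies the protocol structurally.
    return model.fit(X, y).predict(X)


rng = np.random.default_rng(0)
X = rng.normal(size=(50, 3))
y = rng.integers(0, 2, size=50)
predictions = fit_and_predict(LogisticRegression(), X, y)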

get_metrics(clf, x, y)

Evaluate the performance of a scikit-learn predictor on a given dataset

Parameters:

    clf (ScikitModel): The trained classifier to evaluate. Required.
    x (pd.DataFrame): The input data. Required.
    y (pd.Series): Labels for each sample in the input data. Required.

Returns:

    pd.DataFrame: A DataFrame containing the Precision, Recall and F1 scores.
        Macro average is used for multiclass datasets, and micro average is
        used for binary classification.

Source code in thoth/utils.py
@st.cache
def get_metrics(
    clf: ScikitModel,
    x: Union[pd.DataFrame, np.ndarray],
    y: Union[pd.Series, np.ndarray],
) -> pd.DataFrame:
    """Evaluate the performance of a scikit-learn predictor on a given dataset

    Args:
        clf: The trained classifier to evaluate
        x (pd.DataFrame): The input data
        y (pd.Series): Labels for each sample in the input data

    Returns:
        pd.DataFrame: A DataFrame containing the Precision, Recall and F1 scores.
            Macro average is used for multiclass datasets, and micro average is used
            for binary classification.
    """
    average = "macro" if len(np.unique(y)) > 2 else "micro"
    metrics = {
        "Precision": precision_score(y, clf.predict(x), average=average),
        "Recall": recall_score(y, clf.predict(x), average=average),
        "F1": f1_score(y, clf.predict(x), average=average),
    }
    return pd.DataFrame(metrics, index=[0])
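
A usage sketch for get_metrics; the classifier and dataset below are placeholders, and since the function is wrapped in st.cache it is normally called from within the Streamlit app:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from thoth.utils import get_metrics

X, y = load_iris(return_X_y=True, as_frame=True)
clf = LogisticRegression(max_iter=1000).fit(X, y)

# Iris has three classes, so the macro-averaged scores are reported.
metrics = get_metrics(clf, X, y)
print(metrics)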

load_process_data(dataset_name)

Loads and formats a dataset based on its name

Parameters:

    dataset_name (str): The name of the dataset to load and process. Required.

Returns:

    Tuple[dict, pd.DataFrame]: A tuple of the dataset metadata dict and the data.

Source code in thoth/utils.py
@st.cache(show_spinner=False)
def load_process_data(dataset_name: str) -> Tuple[dict, pd.DataFrame]:
    """Loads and formats a dataset based on its name

    Args:
        dataset_name (str): The name of the dataset to load and process

    Returns:
        Tuple[dict, pd.DataFrame]: A tuple of the dataset metadata dict and the data
    """
    dataloaders = {
        "Breast Cancer": load_breast_cancer,
        "Iris": load_iris,
        "Wine": load_wine,
    }
    dataloader = dataloaders[dataset_name]
    dataset = cast(Dict[str, Any], dataloader())
    dataset["DESCR"] = dataset["DESCR"].split(":", 1)[1]
    data = pd.DataFrame(dataset.pop("data"), columns=dataset["feature_names"])
    labels = pd.Series(dataset.pop("target")).map(
        dict(enumerate(dataset["target_names"]))
    )
    data = pd.DataFrame(labels, columns=["label"]).join(data)
    return (
        dataset,
        data,
    )
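
A usage sketch showing how the returned pair might be unpacked; the variable names are illustrative:

from thoth.utils import load_process_data

metadata, data = load_process_data("Wine")

# metadata keeps keys such as "DESCR", "feature_names" and "target_names",
# while data holds a "label" column followed by the feature columns.
features = data.drop(columns=["label"])
labels = data["label"]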

train_model(model, params, train_x, train_y)

Initialise and train a given scikit-learn model with the provided parameters and data

Parameters:

    model (Type[ScikitModelT]): The model architecture to use. Required.
    params (dict): A parameter dictionary containing parameter_name: value pairs. Required.
    train_x (pd.DataFrame): The training data; should be of shape (n_samples, n_features). Required.
    train_y (pd.Series): The training labels; should be of shape (n_samples,). Required.

Returns:

    model (ScikitModelT): The trained model.

Source code in thoth/utils.py
@st.cache
def train_model(
    model: Type[ScikitModelT],
    params: Dict[str, Any],
    train_x: Union[pd.DataFrame, np.ndarray],
    train_y: Union[pd.Series, np.ndarray],
) -> ScikitModelT:
    """Initialise and train a given scikit-learn model with the provided parameters and data

    Args:
        model: The model architecture to use
        params (dict): A parameter dictionary containing parameter_name: value pairs
        train_x (pd.DataFrame): The training data, should be of shape (n_samples, n_features)
        train_y (pd.Series): The training labels, should be of shape (n_samples)

    Returns:
        model: The trained model
    """
    return model(**params).fit(train_x, train_y)  # type: ignore
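
A usage sketch combining load_process_data and train_model; the estimator class and parameter values below are illustrative choices, not defaults from thoth:

from sklearn.ensemble import RandomForestClassifier

from thoth.utils import load_process_data, train_model

_, data = load_process_data("Iris")
train_x = data.drop(columns=["label"])
train_y = data["label"]

# The estimator class is instantiated with the given parameters, then fitted.
clf = train_model(
    RandomForestClassifier,
    {"n_estimators": 100, "max_depth": 3},
    train_x,
    train_y,
)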