# Source code for reval.relative_validation

import numpy as np
from sklearn.metrics import zero_one_loss
from reval.utils import kuhn_munkres_algorithm
import logging

# Configure the root logger when this module is imported: INFO threshold,
# time-of-day timestamps, and a "<time>, <LEVEL> <message>" record layout.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%H:%M:%S',
    format='%(asctime)s, %(levelname)s %(message)s',
)


class RelativeValidation:
    """
    This class allows to perform the relative clustering validation procedure.
    A supervised algorithm is required to test cluster stability.
    Labels output from a clustering algorithm are used as true labels.

    :param s: initialized class for the supervised method. It must expose
        ``fit(X, y)`` and ``predict(X)``.
    :type s: class
    :param c: initialized class for clustering algorithm. It must expose
        ``fit_predict(X)``.
    :type c: class
    :param nrand: number of iterations to normalize cluster stability.
    :type nrand: int
    """

    def __init__(self, s, c, nrand=10):
        """
        Construct method. Stores the supervised method, the clustering method
        and the number of random-labeling iterations.
        """
        self.class_method = s
        self.clust_method = c
        self.nrand = nrand

    @staticmethod
    def _no_clusters_found(labels):
        """
        Private helper that tells whether a label vector contains no
        non-negative label.

        :param labels: cluster labels.
        :type labels: list or ndarray, (n_samples,)
        :return: True if no label is >= 0 (an empty vector also yields True).
        :rtype: bool
        """
        return not np.any(np.asarray(labels) >= 0)

    def train(self, train_data, tr_lab=None):
        """
        Method that performs training. It compares the clustering labels on training set
        (i.e., A(X) computed by :class:`reval.relative_validation.RelativeValidation.clust_method`) against
        the labels obtained from the classification algorithm
        (i.e., f(X), computed by :class:`reval.relative_validation.RelativeValidation.class_method`).
        It returns the misclassification error, the supervised model fitted to the data,
        and the clustering labels.

        :param train_data: training dataset.
        :type train_data: ndarray, (n_samples, n_features)
        :param tr_lab: cluster labels found during CV for clustering methods with no `n_clusters` parameter.
            If not None the clustering method is not run on the training set and these labels
            are used instead. Default None.
        :type tr_lab: list
        :return: misclassification error, object returned by the supervised method's ``fit``,
            and clustering labels of the training set; None if no label is non-negative
            (no cluster found).
        :rtype: (float, object, ndarray (n_samples,)) or None
        """
        if tr_lab is None:
            clustlab_tr = self.clust_method.fit_predict(train_data)  # A_k(X)
        else:
            clustlab_tr = tr_lab
        # Bail out before fitting the classifier when every label is negative.
        if self._no_clusters_found(clustlab_tr):
            logging.info(f"No clusters found during training with {self.clust_method}.")
            return None
        fitclass_tr = self.class_method.fit(train_data, clustlab_tr)
        classlab_tr = fitclass_tr.predict(train_data)  # f(X)
        misclass = zero_one_loss(clustlab_tr, classlab_tr)
        return misclass, fitclass_tr, clustlab_tr

    def test(self, test_data, fit_model):
        """
        Method that compares test set clustering labels (i.e., A(X'), computed by
        :class:`reval.relative_validation.RelativeValidation.clust_method`) against
        the labels obtained through the classification algorithm fitted to the training set
        (i.e., f(X'), computed by
        :class:`reval.relative_validation.RelativeValidation.class_method`).
        Labels are matched with :func:`reval.utils.kuhn_munkres_algorithm` before the
        misclassification error is computed.

        :param test_data: test dataset.
        :type test_data: ndarray, (n_samples, n_features)
        :param fit_model: fitted supervised model.
        :type fit_model: class
        :return: misclassification error and the label vector returned by
            ``kuhn_munkres_algorithm(classlab_ts, clustlab_ts)``; None if no clustering
            label is non-negative (no cluster found).
        :rtype: (float, ndarray (n_samples,)) or None
        """
        clustlab_ts = self.clust_method.fit_predict(test_data)  # A_k(X')
        # Bail out before predicting when every label is negative.
        if self._no_clusters_found(clustlab_ts):
            logging.info(f"No clusters found during testing with {self.clust_method}")
            return None
        classlab_ts = fit_model.predict(test_data)  # f(X')
        bestperm = kuhn_munkres_algorithm(classlab_ts, clustlab_ts)  # array of integers
        misclass = zero_one_loss(classlab_ts, bestperm)
        return misclass, bestperm

    def rndlabels_traineval(self, train_data, test_data, train_labels, test_labels):
        """
        Method that performs random labeling on the training set
        (N times according to
        :class:`reval.relative_validation.RelativeValidation.nrand` instance attribute) and evaluates
        the fitted models on test set.

        The random permutations are drawn from NumPy's global generator seeded with 0,
        so results are reproducible. The caller's global random state is saved
        beforehand and restored afterwards, so this method does not alter it.

        :param train_data: training dataset.
        :type train_data: ndarray, (n_samples, n_features)
        :param test_data: test dataset.
        :type test_data: ndarray, (n_samples, n_features)
        :param train_labels: training set clustering labels.
        :type train_labels: ndarray, (n_samples,)
        :param test_labels: test set clustering labels.
        :type test_labels: ndarray, (n_samples,)
        :return: averaged misclassification error on the test set.
        :rtype: float
        """
        saved_state = np.random.get_state()
        np.random.seed(0)
        try:
            # Draw every permutation before any fit so the permutation stream
            # does not depend on what the classifier draws while fitting.
            shuf_tr = [np.random.permutation(train_labels)
                       for _ in range(self.nrand)]
            misclass_ts = [self._rescale_score_(train_data, test_data, rand_lab, test_labels)
                           for rand_lab in shuf_tr]
        finally:
            # Do not leak the fixed seed into the caller's random stream.
            np.random.set_state(saved_state)
        return np.mean(misclass_ts)

    def _rescale_score_(self, xtr, xts, randlabtr, labts):
        """
        Private method that computes the misclassification error when predicting test labels
        with classification model fitted on training set with random labels.

        NOTE(review): this calls ``fit`` on ``self.class_method`` itself. If the
        supervised method fits in place (presumably the case for scikit-learn
        estimators, to be confirmed), a model previously returned by ``train`` is
        overwritten by this call.

        :param xtr: training dataset.
        :type xtr: ndarray, (n_samples, n_features)
        :param xts: test dataset.
        :type xts: ndarray, (n_samples, n_features)
        :param randlabtr: random labels.
        :type randlabtr: ndarray, (n_samples,)
        :param labts: test set labels.
        :type labts: ndarray, (n_samples,)
        :return: misclassification error.
        :rtype: float
        """
        self.class_method.fit(xtr, randlabtr)
        pred_lab = self.class_method.predict(xts)
        me_ts = zero_one_loss(pred_lab, kuhn_munkres_algorithm(pred_lab, labts))
        return me_ts