Source code for reval.relative_validation

import numpy as np
from sklearn.metrics import zero_one_loss
from reval.utils import kuhn_munkres_algorithm
import logging

# Configure the root logger when this module is imported: INFO level and
# above, each record prefixed with a HH:MM:SS timestamp and its level name.
# NOTE: ``logging.basicConfig`` is a no-op when the root logger already has
# handlers, so an application that set up logging earlier keeps its own setup.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s, %(levelname)s %(message)s',
    datefmt='%H:%M:%S',
)


class RelativeValidation:
    """
    This class allows to perform the relative clustering validation procedure.

    A supervised algorithm is required to test cluster stability.
    Labels output from a clustering algorithm are used as true labels.

    :param s: initialized class for the supervised method.
    :type s: class
    :param c: initialized class for clustering algorithm.
    :type c: class
    :param nrand: number of iterations to normalize cluster stability.
    :type nrand: int
    """

    def __init__(self, s, c, nrand=10):
        """
        Construct method.
        """
        self.class_method = s
        self.clust_method = c
        self.nrand = nrand

    @staticmethod
    def _has_assigned_labels(labels):
        """
        Tell whether at least one label is non-negative.

        :param labels: cluster labels.
        :type labels: iterable of int
        :return: True if any label is >= 0, False otherwise (also for an
            empty input).
        :rtype: bool
        """
        return any(lab >= 0 for lab in labels)

    def train(self, train_data, tr_lab=None):
        """
        Method that performs training. It compares the clustering labels on
        training set (i.e., A(X) computed by
        :class:`reval.relative_validation.RelativeValidation.clust_method`)
        against the labels obtained from the classification algorithm
        (i.e., f(X), computed by
        :class:`reval.relative_validation.RelativeValidation.class_method`).
        It returns the misclassification error, the supervised model fitted
        to the data, and the clustering labels.

        :param train_data: training dataset.
        :type train_data: ndarray, (n_samples, n_features)
        :param tr_lab: cluster labels found during CV for clustering methods
            with no `n_clusters` parameter. If not None, these labels are used
            as they are and the clustering method is not run on
            ``train_data``. Default None.
        :type tr_lab: list
        :return: misclassification error, fitted supervised model object and
            clustering labels of the training set; ``None`` (a single value,
            not a tuple) if no label is non-negative.
        :rtype: (float, object, ndarray (n_samples,)) or None
        """
        if tr_lab is None:
            clustlab_tr = self.clust_method.fit_predict(train_data)  # A_k(X)
        else:
            clustlab_tr = tr_lab
        # No label >= 0 means there is nothing to train the classifier on.
        if not self._has_assigned_labels(clustlab_tr):
            logging.info(f"No clusters found during training with {self.clust_method}.")
            return None
        fitclass_tr = self.class_method.fit(train_data, clustlab_tr)
        classlab_tr = fitclass_tr.predict(train_data)  # f(X)
        misclass = zero_one_loss(clustlab_tr, classlab_tr)
        return misclass, fitclass_tr, clustlab_tr

    def test(self, test_data, fit_model):
        """
        Method that compares test set clustering labels (i.e., A(X'),
        computed by
        :class:`reval.relative_validation.RelativeValidation.clust_method`)
        against the (permuted) labels obtained through the classification
        algorithm fitted to the training set (i.e., f(X'), computed by
        :class:`reval.relative_validation.RelativeValidation.class_method`).
        It returns the misclassification error together with the permuted
        labels.

        :param test_data: test dataset.
        :type test_data: ndarray, (n_samples, n_features)
        :param fit_model: fitted supervised model.
        :type fit_model: class
        :return: misclassification error and the labels returned by
            ``kuhn_munkres_algorithm(f(X'), A(X'))``; ``None`` (a single
            value, not a tuple) if no clustering label is non-negative.
        :rtype: (float, ndarray (n_samples,)) or None
        """
        clustlab_ts = self.clust_method.fit_predict(test_data)  # A_k(X')
        if not self._has_assigned_labels(clustlab_ts):
            logging.info(f"No clusters found during testing with {self.clust_method}")
            return None
        classlab_ts = fit_model.predict(test_data)  # f(X')
        # Array of integers; presumably the clustering labels relabelled to
        # best match the classifier output -- confirm against reval.utils.
        bestperm = kuhn_munkres_algorithm(classlab_ts, clustlab_ts)
        misclass = zero_one_loss(classlab_ts, bestperm)
        return misclass, bestperm

    def rndlabels_traineval(self, train_data, test_data, train_labels, test_labels):
        """
        Method that performs random labeling on the training set (N times
        according to
        :class:`reval.relative_validation.RelativeValidation.nrand`
        instance attribute) and evaluates the fitted models on test set.

        Note that this method reseeds NumPy's global random generator with 0
        on every call, and that it refits ``self.class_method`` on the
        shuffled labels (see :meth:`_rescale_score_`).

        :param train_data: training dataset.
        :type train_data: ndarray, (n_samples, n_features)
        :param test_data: test dataset.
        :type test_data: ndarray, (n_samples, n_features)
        :param train_labels: training set clustering labels.
        :type train_labels: ndarray, (n_samples,)
        :param test_labels: test set clustering labels.
        :type test_labels: ndarray, (n_samples,)
        :return: averaged misclassification error on the test set.
        :rtype: float
        """
        np.random.seed(0)
        # Draw every permutation before fitting anything, so that the random
        # numbers consumed here do not interleave with whatever the
        # classifier may draw while fitting.
        shuf_tr = [np.random.permutation(train_labels)
                   for _ in range(self.nrand)]
        misclass_ts = [self._rescale_score_(train_data, test_data, rand_lab, test_labels)
                       for rand_lab in shuf_tr]
        return np.mean(misclass_ts)

    def _rescale_score_(self, xtr, xts, randlabtr, labts):
        """
        Private method that computes the misclassification error when
        predicting test labels with classification model fitted on training
        set with random labels.

        The model is fitted by calling ``self.class_method.fit`` directly, so
        the instance's classifier is refitted on ``randlabtr`` by this call.

        :param xtr: training dataset.
        :type xtr: ndarray, (n_samples, n_features)
        :param xts: test dataset.
        :type xts: ndarray, (n_samples, n_features)
        :param randlabtr: random labels.
        :type randlabtr: ndarray, (n_samples,)
        :param labts: test set labels.
        :type labts: ndarray, (n_samples,)
        :return: misclassification error.
        :rtype: float
        """
        self.class_method.fit(xtr, randlabtr)
        pred_lab = self.class_method.predict(xts)
        me_ts = zero_one_loss(pred_lab,
                              kuhn_munkres_algorithm(pred_lab, labts))
        return me_ts