1
0
Fork 0

region-based method

This commit is contained in:
Alejandro Moreo Fernandez 2022-01-21 09:53:10 +01:00
parent 3aed410722
commit 4d4cf6eb3f
1 changed file with 221 additions and 0 deletions

221
eDiscovery/method.py Normal file
View File

@ -0,0 +1,221 @@
from typing import Union
import numpy as np
from sklearn.base import BaseEstimator, clone
from sklearn.cluster import KMeans, OPTICS
from sklearn.decomposition import TruncatedSVD
from sklearn.mixture import GaussianMixture
from quapy.method.base import BaseQuantifier, BinaryQuantifier
from quapy.data import LabelledCollection
from quapy.method.aggregative import ACC, PACC
class RegionAdjustment(ACC):
    """
    Binary ACC variant that first splits each class into `k` regions (k-means
    clusters) and runs the adjusted-count correction at region level; the
    resulting 2k region prevalences are then folded back into the two binary
    classes.

    :param learner: the underlying sklearn-style classifier
    :param val_split: validation split passed through to ACC
    :param k: number of regions (k-means clusters) per class
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4, k=2):
        self.learner = learner
        self.val_split = val_split
        # k is the number of regions (clusters of k-means) for each class
        self.k = k

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
        """
        Relabels the training data by region id (negatives get ids 0..k-1,
        positives get ids k..2k-1) and fits ACC on the 2k region labels.
        """
        X, y = data.Xy
        Xp, Xn = X[y == 1], X[y == 0]
        kmeans = KMeans(n_clusters=self.k)
        rn = kmeans.fit_predict(Xn)           # negative regions: ids 0..k-1
        rp = kmeans.fit_predict(Xp) + self.k  # positive regions: ids k..2k-1
        classes = np.arange(self.k * 2)
        pos = LabelledCollection(Xp, rp, classes_=classes)
        neg = LabelledCollection(Xn, rn, classes_=classes)
        region_data = pos + neg
        super(RegionAdjustment, self).fit(region_data, fit_learner, val_split)
        # map each region id back to its binary class (first k regions -> class 0)
        self.reg2class = {r: (0 if r < self.k else 1) for r in range(2 * self.k)}
        return self

    def classify(self, data):
        # returns region ids (0..2k-1), not binary labels; aggregate() folds
        # them back into the two classes
        regions = super(RegionAdjustment, self).classify(data)
        return regions

    def aggregate(self, classif_predictions):
        """Aggregates the 2k region prevalences into a 2-dim binary prevalence."""
        region_prevalence = super(RegionAdjustment, self).aggregate(classif_predictions)
        # np.float was removed in NumPy 1.24; the builtin float is the correct dtype
        bin_prevalence = np.zeros(shape=2, dtype=float)
        for r, prev in enumerate(region_prevalence):
            bin_prevalence[self.reg2class[r]] += prev
        return bin_prevalence
class RegionProbAdjustment(PACC):
    """
    Binary PACC variant that splits the data into `k` regions overall
    (clusters allocated to each class proportionally to its prevalence) and
    runs probabilistic adjusted-count at region level, folding the region
    prevalences back into the two binary classes.

    :param learner: the underlying sklearn-style probabilistic classifier
    :param val_split: validation split passed through to PACC
    :param k: total number of regions across both classes
    """

    def __init__(self, learner: BaseEstimator, val_split=0.4, k=2):
        self.learner = learner
        self.val_split = val_split
        # k is the total number of regions (k-means clusters) for all classes
        self.k = k

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
        """
        Allocates clusters per class proportionally to the class prevalence,
        relabels instances by region id (negatives first, then positives) and
        fits PACC on the region labels.
        """
        X, y = data.Xy
        Xp, Xn = X[y == 1], X[y == 0]
        # clusters per class, proportional to class prevalence; note the
        # rounding may make the counts sum to k-1 or k+1 in edge cases
        nk_per_class = (data.prevalence() * self.k).round().astype(int)
        print(f'number of clusters per class {nk_per_class}')
        kmeans_neg = KMeans(n_clusters=nk_per_class[0])
        rn = kmeans_neg.fit_predict(Xn)  # negative regions: ids 0..nk[0]-1
        kmeans_pos = KMeans(n_clusters=nk_per_class[1])
        rp = kmeans_pos.fit_predict(Xp) + nk_per_class[0]  # positive regions follow
        classes = np.arange(self.k)
        pos = LabelledCollection(Xp, rp, classes_=classes)
        neg = LabelledCollection(Xn, rn, classes_=classes)
        region_data = pos + neg
        super(RegionProbAdjustment, self).fit(region_data, fit_learner, val_split)
        # only self.k region ids exist (the original iterated 2*k, leaving
        # dead keys); regions below nk_per_class[0] are negative
        self.reg2class = {r: (0 if r < nk_per_class[0] else 1) for r in range(self.k)}
        return self

    def classify(self, data):
        # returns region ids, not binary labels; aggregate() folds them back
        regions = super(RegionProbAdjustment, self).classify(data)
        return regions

    def aggregate(self, classif_predictions):
        """Aggregates the k region prevalences into a 2-dim binary prevalence."""
        region_prevalence = super(RegionProbAdjustment, self).aggregate(classif_predictions)
        # np.float was removed in NumPy 1.24; the builtin float is the correct dtype
        bin_prevalence = np.zeros(shape=2, dtype=float)
        for r, prev in enumerate(region_prevalence):
            bin_prevalence[self.reg2class[r]] += prev
        return bin_prevalence
class RegionProbAdjustmentGlobal(BaseQuantifier):
    """
    Region-based quantifier: clusters all instances (classes mixed), fits one
    quantifier per region, and combines the per-region prevalence estimates
    weighted by the region proportions observed at quantification time.

    :param quantifier_fn: zero-arg factory returning a fresh quantifier per region
    :param k: number of clusters (used by the 'kmeans' strategy)
    :param clustering: one of 'gmm', 'kmeans', 'optics'
    """

    def __init__(self, quantifier_fn: BaseQuantifier, k=5, clustering='gmm'):
        self.quantifier_fn = quantifier_fn
        self.k = k
        self.clustering = clustering

    def _find_regions(self, X):
        """Fits the chosen clustering model and returns the region id of each row of X."""
        if self.clustering == 'gmm':
            # reduce dimensionality first; GMM on very high-dim sparse data is impractical
            self.svd = TruncatedSVD(n_components=500)
            X = self.svd.fit_transform(X)
            # model selection by BIC over a small range of component counts
            lowest_bic = np.inf  # np.infty is a deprecated alias of np.inf
            bic = []
            for n_components in range(3, 8):
                # Fit a Gaussian mixture with EM
                gmm = GaussianMixture(n_components).fit(X)
                bic.append(gmm.bic(X))
                print(bic)
                if bic[-1] < lowest_bic:
                    lowest_bic = bic[-1]
                    best_gmm = gmm
            print(f'choosen GMM with {len(best_gmm.weights_)} components')
            self.cluster = best_gmm
            regions = self.cluster.predict(X)
        elif self.clustering == 'kmeans':
            print(f'kmeans with k={self.k}')
            self.cluster = KMeans(n_clusters=self.k)
            regions = self.cluster.fit_predict(X)
        elif self.clustering == 'optics':
            print('optics')
            self.svd = TruncatedSVD(n_components=500)
            X = self.svd.fit_transform(X)
            self.cluster = OPTICS()
            regions = self.cluster.fit_predict(X)
        else:
            raise NotImplementedError
        return regions

    def _get_regions(self, X):
        """Assigns new instances to the regions learned in :meth:`_find_regions`."""
        if self.clustering == 'gmm':
            return self.cluster.predict(self.svd.transform(X))
        elif self.clustering == 'kmeans':
            return self.cluster.predict(X)
        elif self.clustering == 'optics':
            # NOTE(review): sklearn's OPTICS has no predict method, so this
            # branch raises AttributeError at quantification time — confirm
            # whether the 'optics' strategy was ever exercised end-to-end
            return self.cluster.predict(self.svd.transform(X))
        else:
            raise NotImplementedError

    def fit(self, data: LabelledCollection, fit_learner=True, val_split: Union[float, int, LabelledCollection] = None):
        """Clusters all instances and fits one quantifier per region."""
        self.classes = data.classes_
        # first clustering (all classes involved), then one quantifier local to each cluster
        g = self._find_regions(data.instances)
        X, y = data.Xy
        self.g_quantifiers = {}
        trivial = 0
        for gi in np.unique(g):
            qi_data = LabelledCollection(X[g == gi], y[g == gi], classes_=data.classes_)
            if qi_data.counts()[1] <= 1:
                # check for <= 1 instead of prevalence==0, since PACC requires at least two
                # examples for performing stratified split
                # some class is (almost) empty: all negatives -> always reject
                self.g_quantifiers[gi] = TrivialRejectorQuantifier()
                trivial += 1
            elif qi_data.counts()[0] <= 1:  # (almost) all positives -> always accept
                self.g_quantifiers[gi] = TrivialAcceptorQuantifier()
                trivial += 1
            else:
                self.g_quantifiers[gi] = self.quantifier_fn().fit(qi_data)
        print(f'trivials={trivial}')
        return self

    @property
    def classes_(self):
        return self.classes

    def quantify(self, instances):
        """Weighted combination of per-region prevalence estimates."""
        g = self._get_regions(instances)
        # np.float was removed in NumPy 1.24; the builtin float is the correct dtype
        prevalence = np.zeros(len(self.classes_), dtype=float)
        for gi in np.unique(g):
            # weight each region's estimate by its share of the test instances;
            # a region unseen at fit time would raise KeyError here
            proportion_gi = (g == gi).mean()
            prev_gi = self.g_quantifiers[gi].quantify(instances[g == gi])
            prevalence += prev_gi * proportion_gi
        return prevalence

    def get_params(self, deep=True):
        pass

    def set_params(self, **parameters):
        pass
class TrivialRejectorQuantifier(BinaryQuantifier):
    """Degenerate binary quantifier whose estimate is constantly all-negative, i.e. [1, 0]."""

    def fit(self, data: LabelledCollection):
        # constant predictor: there is nothing to learn from the data
        return self

    def quantify(self, instances):
        # prevalence vector: 100% negative class, 0% positive class
        return np.asarray([1, 0])

    def set_params(self, **parameters):
        # no hyperparameters to set
        pass

    def get_params(self, deep=True):
        # no hyperparameters to report
        pass

    @property
    def classes_(self):
        # fixed binary label space
        return np.asarray([0, 1])
class TrivialAcceptorQuantifier(BinaryQuantifier):
    """Degenerate binary quantifier whose estimate is constantly all-positive, i.e. [0, 1]."""

    def fit(self, data: LabelledCollection):
        # constant predictor: there is nothing to learn from the data
        return self

    def quantify(self, instances):
        # prevalence vector: 0% negative class, 100% positive class
        return np.asarray([0, 1])

    def set_params(self, **parameters):
        # no hyperparameters to set
        pass

    def get_params(self, deep=True):
        # no hyperparameters to report
        pass

    @property
    def classes_(self):
        # fixed binary label space
        return np.asarray([0, 1])