distribution matching multiclass

Alejandro Moreo Fernandez 2023-07-20 09:03:22 +02:00
parent 4a7ffe5b50
commit 26de9d92eb
7 changed files with 335 additions and 67 deletions

laboratory/main_lequa.py Normal file
View File

@@ -0,0 +1,58 @@
import numpy as np
from sklearn.linear_model import LogisticRegression
import os
import sys
import pandas as pd
import quapy as qp
from quapy.method.aggregative import DistributionMatching
from method_kdey import KDEy
from quapy.model_selection import GridSearchQ

if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = qp.datasets.LEQUA2022_SAMPLE_SIZE['T1B']
    qp.environ['N_JOBS'] = -1

    method = 'KDE'
    param = 0.1
    div = 'topsoe'
    method_identifier = f'{method}_modsel_{div}'

    # the directory created here must match the one used in result_path below
    os.makedirs('results_LequaT2B', exist_ok=True)
    result_path = f'results_LequaT2B/{method_identifier}.csv'

    # if os.path.exists(result_path):
    #     print('Result already exists. Nothing to do')
    #     sys.exit(0)

    with open(result_path, 'wt') as csv:
        csv.write(f'Method\tDataset\tMAE\tMRAE\n')

        dataset = 'T1B'
        train, val_gen, test_gen = qp.datasets.fetch_lequa2022(dataset)

        if method == 'KDE':
            param_grid = {'bandwidth': np.linspace(0.001, 0.1, 11)}
            model = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn')
        else:
            raise NotImplementedError('unknown method')

        # model selection over the official validation sample generator
        modsel = GridSearchQ(model, param_grid, protocol=val_gen, refit=False, n_jobs=-1, verbose=1)
        modsel.fit(train)

        print(f'best params {modsel.best_params_}')

        quantifier = modsel.best_model()

        report = qp.evaluation.evaluation_report(quantifier, protocol=test_gen, error_metrics=['mae', 'mrae'], verbose=True)
        means = report.mean()
        csv.write(f'{method}\tLeQua-{dataset}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
        csv.flush()

    df = pd.read_csv(result_path, sep='\t')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
    print(pv)

laboratory/main_tweets.py Normal file
View File

@@ -0,0 +1,66 @@
import numpy as np
from sklearn.linear_model import LogisticRegression
import os
import sys
import pandas as pd
import quapy as qp
from quapy.method.aggregative import DistributionMatching
from method_kdey import KDEy
from quapy.model_selection import GridSearchQ
from quapy.protocol import UPP

if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = 100
    qp.environ['N_JOBS'] = -1

    method = 'KDE'
    param = 0.1
    target = 'max_likelihood'
    div = 'topsoe'
    method_identifier = f'{method}_modsel_{div if target=="min_divergence" else target}'

    os.makedirs('results', exist_ok=True)
    result_path = f'results/{method_identifier}.csv'

    # if os.path.exists(result_path):
    #     print('Result already exists. Nothing to do')
    #     sys.exit(0)

    with open(result_path, 'wt') as csv:
        csv.write(f'Method\tDataset\tMAE\tMRAE\n')

        for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
            print('init', dataset)

            # model selection is carried out on the dedicated model-selection split
            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=True)

            if method == 'KDE':
                param_grid = {'bandwidth': np.linspace(0.001, 0.2, 21)}
                model = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn', target=target)
            else:
                raise NotImplementedError('unknown method')

            protocol = UPP(data.test, repeats=100)
            modsel = GridSearchQ(model, param_grid, protocol, refit=False, n_jobs=-1, verbose=1)
            modsel.fit(data.training)

            print(f'best params {modsel.best_params_}')

            # the best configuration is retrained on the full training set and evaluated on test
            quantifier = modsel.best_model()

            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True, for_model_selection=False)
            quantifier.fit(data.training)
            protocol = UPP(data.test, repeats=100)
            report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True)
            means = report.mean()
            csv.write(f'{method_identifier}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
            csv.flush()

    df = pd.read_csv(result_path, sep='\t')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
    print(pv)

View File

@@ -0,0 +1,62 @@
from sklearn.linear_model import LogisticRegression
import os
import sys
import pandas as pd
import quapy as qp
from method.aggregative import DistributionMatching
from method_kdey import KDEy
from protocol import UPP

if __name__ == '__main__':

    qp.environ['SAMPLE_SIZE'] = 100
    qp.environ['N_JOBS'] = -1

    method = 'KDE'
    param = 0.1
    div = 'topsoe'
    method_identifier = f'{method}_{param}_{div}'

    # generates tuples (dataset, method, method_name)
    # (the dataset is needed for methods that process the dataset differently)
    def gen_methods():
        for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True)

            if method == 'KDE':
                kdey = KDEy(LogisticRegression(), divergence=div, bandwidth=param, engine='sklearn')
                yield data, kdey, method_identifier
            elif method == 'DM':
                # note: for 'DM', param is interpreted as the (integer) number of bins
                dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=param)
                yield data, dm, method_identifier
            else:
                raise NotImplementedError('unknown method')

    os.makedirs('results', exist_ok=True)
    result_path = f'results/{method_identifier}.csv'
    if os.path.exists(result_path):
        print('Result already exists. Nothing to do')
        sys.exit(0)

    with open(result_path, 'wt') as csv:
        csv.write(f'Method\tDataset\tMAE\tMRAE\n')

        for data, quantifier, quant_name in gen_methods():
            quantifier.fit(data.training)
            protocol = UPP(data.test, repeats=100)
            report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True)
            means = report.mean()
            csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
            csv.flush()

    df = pd.read_csv(result_path, sep='\t')
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
    print(pv)

View File

@@ -1,8 +1,11 @@
+import os
+import sys
 from typing import Union, Callable
 import numpy as np
 from sklearn.base import BaseEstimator
 from sklearn.linear_model import LogisticRegression
 import pandas as pd
+from sklearn.model_selection import GridSearchCV
 from sklearn.neighbors import KernelDensity
 import quapy as qp
@@ -12,56 +15,96 @@ from quapy.method.aggregative import AggregativeProbabilisticQuantifier, _traini
     DistributionMatching, _get_divergence
 import scipy
 from scipy import optimize
+from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional

 # TODO: optimize the bandwidth automatically
+# TODO: replace the l2 metric in the kernel with the EMD, try to visualize the difference between both criteria in a 3-simplex
 # TODO: think of a MMD-y variant, i.e., a MMD variant that uses the points in the simplex and possibly any non-linear kernel

+class SklearnKDE:
+    def __init__(self):
+        pass
+
+    def fit(self):
+        pass
+
+    def likelihood(self):
+        pass
+

 class KDEy(AggregativeProbabilisticQuantifier):

     BANDWIDTH_METHOD = ['auto', 'scott', 'silverman']
-    ENGINE = ['scipy', 'sklearn']
+    ENGINE = ['scipy', 'sklearn', 'statsmodels']
+    TARGET = ['min_divergence', 'max_likelihood']

     def __init__(self, classifier: BaseEstimator, val_split=0.4, divergence: Union[str, Callable]='HD',
-                 bandwidth_method='scott', engine='sklearn', n_jobs=None):
+                 bandwidth='scott', engine='sklearn', target='min_divergence', n_jobs=None):
+        assert bandwidth in KDEy.BANDWIDTH_METHOD or isinstance(bandwidth, float), \
+            f'unknown bandwidth, valid ones are {KDEy.BANDWIDTH_METHOD} or any float'
+        assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
+        assert target in KDEy.TARGET, f'unknown target, valid ones are {KDEy.TARGET}'
         self.classifier = classifier
         self.val_split = val_split
         self.divergence = divergence
-        self.bandwidth_method = bandwidth_method
+        self.bandwidth = bandwidth
         self.engine = engine
+        self.target = target
         self.n_jobs = n_jobs
-        assert bandwidth_method in KDEy.BANDWIDTH_METHOD, f'unknown bandwidth_method, valid ones are {KDEy.BANDWIDTH_METHOD}'
-        assert engine in KDEy.ENGINE, f'unknown engine, valid ones are {KDEy.ENGINE}'
+
+    def search_bandwidth_maxlikelihood(self, posteriors, labels):
+        grid = {'bandwidth': np.linspace(0.001, 0.2, 100)}
+        search = GridSearchCV(
+            KernelDensity(), param_grid=grid, n_jobs=-1, cv=50, verbose=1, refit=True
+        )
+        search.fit(posteriors, labels)
+        bandwidth = search.best_params_['bandwidth']
+        print(f'auto: bandwidth={bandwidth:.5f}')
+        return bandwidth

     def get_kde(self, posteriors):
+        # if self.bandwidth == 'auto':
+        #     print('adjusting bandwidth')
+        #
+        #     if self.engine == 'sklearn':
+        #         grid = {'bandwidth': np.linspace(0.001, 0.2, 41)}
+        #         search = GridSearchCV(
+        #             KernelDensity(), param_grid=grid, n_jobs=-1, cv=10, verbose=1, refit=True
+        #         )
+        #         search.fit(posteriors)
+        #         print(search.best_score_)
+        #         print(search.best_params_)
+        #
+        #         import pandas as pd
+        #         df = pd.DataFrame(search.cv_results_)
+        #         pd.set_option('display.max_columns', None)
+        #         pd.set_option('display.max_rows', None)
+        #         pd.set_option('expand_frame_repr', False)
+        #         print(df)
+        #         sys.exit(0)
+        #
+        #     kde = search
+        # else:
         if self.engine == 'scipy':
             # scipy treats columns as datapoints, and needs the datapoints not to lie in a lower-dimensional subspace,
             # which requires removing the last dimension, which is constrained
             posteriors = posteriors[:, :-1].T
             kde = scipy.stats.gaussian_kde(posteriors)
-            kde.set_bandwidth(self.bandwidth_method)
+            kde.set_bandwidth(self.bandwidth)
         elif self.engine == 'sklearn':
-            kde = KernelDensity(bandwidth=self.bandwidth_method).fit(posteriors)
+            kde = KernelDensity(bandwidth=self.bandwidth).fit(posteriors)
         return kde

     def pdf(self, kde, posteriors):
         if self.engine == 'scipy':
-            return kde(posteriors[:,:-1].T)
+            return kde(posteriors[:, :-1].T)
         elif self.engine == 'sklearn':
             return np.exp(kde.score_samples(posteriors))

     def fit(self, data: LabelledCollection, fit_classifier=True, val_split: Union[float, LabelledCollection] = None):
         """
-        Trains the classifier (if requested) and generates the validation distributions out of the training data.
-        The validation distributions have shape `(n, ch, nbins)`, with `n` the number of classes, `ch` the number of
-        channels (a channel is a description, in form of a histogram, of a specific class -- there are as many channels
-        as classes, although in the binary case one can use only one channel, since the other one is constrained),
-        and `nbins` the number of bins. In particular, let `V` be the validation distributions; `di=V[i]`
-        are the distributions obtained from training data labelled with class `i`; `dij = di[j]` is the discrete
-        distribution of posterior probabilities `P(Y=j|X=x)` for training data labelled with class `i`, and `dij[k]`
-        is the fraction of instances with a value in the `k`-th bin.

         :param data: the training set
         :param fit_classifier: set to False to bypass the training (the learner is assumed to be already fit)
@@ -77,6 +120,9 @@ class KDEy(AggregativeProbabilisticQuantifier):
             data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
         )

+        if self.bandwidth == 'auto':
+            self.bandwidth = self.search_bandwidth_maxlikelihood(posteriors, y)
+
         self.val_densities = [self.get_kde(posteriors[y == cat]) for cat in range(data.n_classes)]
         self.val_posteriors = posteriors
@@ -90,14 +136,18 @@
         """
         return lambda posteriors: sum(prev_i * self.pdf(kde_i, posteriors) for kde_i, prev_i in zip(self.val_densities, prev))

     def aggregate(self, posteriors: np.ndarray):
+        if self.target == 'min_divergence':
+            return self._target_divergence(posteriors)
+        elif self.target == 'max_likelihood':
+            return self._target_likelihood(posteriors)
+        else:
+            raise ValueError('unknown target')
+
+    def _target_divergence(self, posteriors):
         """
         Searches for the mixture model parameter (the sought prevalence values) that yields a validation distribution
         (the mixture) that best matches the test distribution, in terms of the divergence measure of choice.
-        In the multiclass case, with `n` the number of classes, the test and mixture distributions contain
-        `n` channels (proper distributions of binned posterior probabilities), on which the divergence is computed
-        independently. The matching is computed as an average of the divergence across all channels.

         :param instances: instances in the sample
         :return: a vector of class prevalence estimates
@@ -107,12 +157,14 @@
         test_likelihood = self.pdf(test_density, posteriors)
         divergence = _get_divergence(self.divergence)

         n_classes = len(self.val_densities)

         def match(prev):
             val_pdf = self.val_pdf(prev)
             val_likelihood = val_pdf(posteriors)
+            # for i, prev_i in enumerate(prev):
             return divergence(val_likelihood, test_likelihood)

         # the initial point is set as the uniform distribution
@@ -124,50 +176,27 @@
         r = optimize.minimize(match, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
         return r.x

-
-if __name__ == '__main__':
-
-    qp.environ['SAMPLE_SIZE'] = 100
-    qp.environ['N_JOBS'] = -1
-    div = 'HD'
-
-    # generates tuples (dataset, method, method_name)
-    # (the dataset is needed for methods that process the dataset differently)
-    def gen_methods():
-
-        for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
-
-            data = qp.datasets.fetch_twitter(dataset, min_df=3, pickle=True)
-
-            # kdey = KDEy(LogisticRegression(), divergence=div, bandwidth_method='scott')
-            # yield data, kdey, f'KDEy-{div}-scott'
-
-            kdey = KDEy(LogisticRegression(), divergence=div, bandwidth_method='silverman', engine='sklearn')
-            yield data, kdey, f'KDEy-{div}-silverman'
-
-            dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=5)
-            yield data, dm, f'DM-5b-{div}'
-
-            # dm = DistributionMatching(LogisticRegression(), divergence=div, nbins=10)
-            # yield data, dm, f'DM-10b-{div}'
-
-    result_path = 'results_kdey.csv'
-    with open(result_path, 'wt') as csv:
-        csv.write(f'Method\tDataset\tMAE\tMRAE\n')
-
-        for data, quantifier, quant_name in gen_methods():
-            quantifier.fit(data.training)
-            protocol = UPP(data.test, repeats=100)
-            report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae'], verbose=True)
-            means = report.mean()
-            csv.write(f'{quant_name}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\n')
-            csv.flush()
-
-    df = pd.read_csv(result_path, sep='\t')
-    # print(df)
-    pd.set_option('display.max_columns', None)
-    pd.set_option('display.max_rows', None)
-    pv = df.pivot_table(index='Dataset', columns="Method", values=["MAE", "MRAE"])
-    print(pv)
+    def _target_likelihood(self, posteriors):
+        """
+        Searches for the mixture model parameter (the sought prevalence values) under which the
+        posteriors of the test points attain maximum likelihood.
+
+        :param instances: instances in the sample
+        :return: a vector of class prevalence estimates
+        """
+        n_classes = len(self.val_densities)
+
+        def neg_loglikelihood(prev):
+            val_pdf = self.val_pdf(prev)
+            test_likelihood = val_pdf(posteriors)
+            test_loglikelihood = np.log(test_likelihood)
+            return -np.sum(test_loglikelihood)
+
+        # the initial point is set as the uniform distribution
+        uniform_distribution = np.full(fill_value=1 / n_classes, shape=(n_classes,))
+
+        # solutions are bounded to those contained in the unit-simplex
+        bounds = tuple((0, 1) for _ in range(n_classes))  # values in [0,1]
+        constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
+        r = optimize.minimize(neg_loglikelihood, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
+        return r.x
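For orientation, a minimal usage sketch of the refactored class (not part of the commit; the dataset name and hyperparameter values are illustrative). Both targets optimize the weights prev of the validation mixture density sum_i prev_i * KDE_i(x):

# minimal usage sketch of KDEy (illustrative configuration)
from sklearn.linear_model import LogisticRegression
import quapy as qp
from method_kdey import KDEy

data = qp.datasets.fetch_twitter('hcr', min_df=3, pickle=True)

# 'max_likelihood' maximizes the likelihood of the test posteriors under the
# training mixture; 'min_divergence' matches the two KDE distributions instead
quantifier = KDEy(LogisticRegression(), divergence='topsoe', bandwidth=0.1,
                  engine='sklearn', target='max_likelihood')
quantifier.fit(data.training)
print(quantifier.quantify(data.test.instances))  # estimated class prevalences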

View File

@@ -0,0 +1,32 @@
import sys
from pathlib import Path
import pandas as pd

result_dir = 'results'

dfs = []

pathlist = Path(result_dir).rglob('*.csv')
for path in pathlist:
    path_in_str = str(path)
    print(path_in_str)

    df = pd.read_csv(path_in_str, sep='\t')
    dfs.append(df)

df = pd.concat(dfs)

piv = df.pivot_table(index='Dataset', columns='Method', values='MRAE')
piv.loc['mean'] = piv.mean()

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('expand_frame_repr', False)

print(piv)

laboratory/todo.txt Normal file
View File

@@ -0,0 +1,21 @@
Fundamental idea:
KDE can be used to generate two distributions (one is a mixture model of KDEs fit on the training data
conditioned on each class, the other is a KDE fit on the test data), between which a divergence is then
computed (the objective to minimize). Another option is to generate a single distribution (the training
mixture model) and take the likelihood of the test points as the objective to maximize (see the sketch below).
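A toy sketch of the two objectives on one-dimensional posteriors (hypothetical data; plain numpy/scipy, outside quapy):

# toy 1-D illustration of the two objectives (hypothetical data)
import numpy as np
from scipy.stats import gaussian_kde
from scipy.optimize import minimize_scalar

train_pos = np.random.normal(0.8, 0.1, 100)  # posteriors of positive training instances
train_neg = np.random.normal(0.2, 0.1, 100)  # posteriors of negative training instances
test = np.random.normal(0.6, 0.2, 50)        # posteriors of an unlabelled test sample

kde_pos, kde_neg = gaussian_kde(train_pos), gaussian_kde(train_neg)

def mixture_pdf(prev, x):
    # mixture of class-conditional KDEs weighted by the candidate prevalence
    return prev * kde_pos(x) + (1 - prev) * kde_neg(x)

# option 1: minimize a divergence between the train mixture and a test KDE
kde_test = gaussian_kde(test)
grid = np.linspace(0, 1, 101)
def hellinger(prev):  # unnormalized Hellinger-like distance on a grid
    return np.sqrt(((np.sqrt(mixture_pdf(prev, grid)) - np.sqrt(kde_test(grid))) ** 2).sum())

# option 2: maximize the likelihood of the test points under the train mixture
def neg_loglik(prev):
    return -np.log(mixture_pdf(prev, test)).sum()

for objective, name in [(hellinger, 'min-divergence'), (neg_loglik, 'max-likelihood')]:
    prev = minimize_scalar(objective, bounds=(0, 1), method='bounded').x
    print(f'{name}: positive prevalence = {prev:.3f}')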
1) clarify: only test?
2) implement 'auto':
   - internal optimization for likelihood [none seems to work well]
   - over everything (e.g., the whole training set)?
   - independently for each labelled set (e.g., positives, negatives, neutrals, and test)?
   - optimization as a GridSearchQ parameter
3) clarify: topsoe?
4) some other kind of model selection?
5) increase the number of bags
6) optimize the C parameter? optimize the kernel? optimize the distance?
7) sklearn's KDE or statsmodels' multivariate KDE? also check what this is (it seems to yield P(Y|X), so it
might make the classifier unnecessary? see the sketch after this list):
https://www.statsmodels.org/dev/_modules/statsmodels/nonparametric/kernel_density.html#KDEMultivariateConditional
8) remove the last dimension in sklearn too?
9) optimize for RAE instead of AE?
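Regarding point 7, a minimal sketch of what statsmodels' conditional KDE estimates (toy data, not quapy code; this is the sense in which it might replace the classifier):

# KDEMultivariateConditional estimates the conditional density p(y|x) directly
import numpy as np
from statsmodels.nonparametric.kernel_density import KDEMultivariateConditional

x = np.random.normal(0, 1, 500)
y = 2 * x + np.random.normal(0, 0.5, 500)

kde = KDEMultivariateConditional(endog=[y], exog=[x], dep_type='c', indep_type='c',
                                 bw='normal_reference')
# conditional density of y=2.0 given x=1.0
print(kde.pdf(endog_predict=[2.0], exog_predict=[1.0]))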

View File

@@ -88,7 +88,7 @@ class GridSearchQ(BaseQuantifier):
         hyper = [dict({k: val[i] for i, k in enumerate(params_keys)}) for val in itertools.product(*params_values)]
         self._sout(f'starting model selection with {self.n_jobs =}')
-        #pass a seed to parallel so it is set in clild processes
+        # pass a seed to parallel so it is set in child processes
         scores = qp.util.parallel(
             self._delayed_eval,
             ((params, training) for params in hyper),
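As an aside on the seed comment above: the seed must travel to the child processes because each parallel worker draws from its own random state. A minimal illustration of the issue (not quapy code):

# without an explicit per-worker seed, parallel draws are not reproducible
import numpy as np
from joblib import Parallel, delayed

def sample(seed):
    rng = np.random.default_rng(seed)  # the seed is set inside the child process
    return rng.random()

print(Parallel(n_jobs=2)(delayed(sample)(s) for s in [0, 1]))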