forked from moreo/QuaPy

Commit 3243fd90f8 (parent: f08885dca3)
running final experiments, one dedicated DM for each divergence with fine-grained exploration of nbins
@@ -8,7 +8,7 @@ from distribution_matching.method_dirichlety import DIRy
from sklearn.linear_model import LogisticRegression
from method_kdey_closed_efficient import KDEyclosed_efficient

-METHODS = ['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+',
+METHODS = ['EMQ', 'EMQ-C', 'DM', 'DM-T', 'DM-HD', 'KDEy-DMhd3', 'DM-CS', 'KDEy-closed++', 'KDEy-ML'] #['ACC', 'PACC', 'HDy-OvA', 'DIR', 'DM', 'KDEy-DMhd3', 'KDEy-closed++', 'EMQ', 'KDEy-ML'] #, 'KDEy-DMhd2'] #, 'KDEy-DMhd2', 'DM-HD'] 'KDEy-DMjs', 'KDEy-DM', 'KDEy-ML+', 'KDEy-DMhd3+',
BIN_METHODS = [x.replace('-OvA', '') for x in METHODS]

@@ -34,7 +34,7 @@ def new_method(method, **lr_kwargs):
        param_grid = hyper_LR
        quantifier = PACC(lr)
    elif method == 'KDEy-ML':
-        method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='max_likelihood', val_split=10)
    elif method == 'KDEy-closed':
@@ -59,6 +59,10 @@ def new_method(method, **lr_kwargs):
    elif method == 'EMQ':
        param_grid = hyper_LR
        quantifier = EMQ(lr)
+    elif method == 'EMQ-C':
+        method_params = {'exact_train_prev': [False], 'recalib': ['bcts']}
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = EMQ(lr)
    elif method == 'HDy-OvA':
        param_grid = {'binary_quantifier__' + key: val for key, val in hyper_LR.items()}
        quantifier = OneVsAllAggregative(HDy(lr))
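Outside of the diff, the new 'EMQ-C' entry boils down to EMQ with BCTS recalibration and without the exact estimate of the training prevalence. A minimal usage sketch follows; the dataset and the direct instantiation are illustrative assumptions (the experiments instead explore these values through the hyperparameter grid above), and it assumes the EMQ signature of QuaPy 0.1.7.

# minimal sketch, not part of the commit
from sklearn.linear_model import LogisticRegression
import quapy as qp
from quapy.method.aggregative import EMQ

data = qp.datasets.fetch_twitter('semeval16', min_df=5, pickle=True)  # illustrative dataset
emq_c = EMQ(LogisticRegression(), exact_train_prev=False, recalib='bcts')
emq_c.fit(data.training)
print(emq_c.quantify(data.test.instances))  # estimated class prevalence on the test collection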
@@ -70,6 +74,30 @@ def new_method(method, **lr_kwargs):
        }
        param_grid = {**method_params, **hyper_LR}
        quantifier = DistributionMatching(lr)
+    elif method == 'DM-T':
+        method_params = {
+            'nbins': [2,3,4,5,6,7,8,9,10,12,14,16,18,20,22,24,26,28,30,32,64],
+            'val_split': [10],
+            'divergence': ['topsoe']
+        }
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = DistributionMatching(lr)
+    elif method == 'DM-HD':
+        method_params = {
+            'nbins': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64],
+            'val_split': [10],
+            'divergence': ['HD']
+        }
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = DistributionMatching(lr)
+    elif method == 'DM-CS':
+        method_params = {
+            'nbins': [2,3,4,5,6,7,8,9,10,12,14,16,18,20,22,24,26,28,30,32,64],
+            'val_split': [10],
+            'divergence': ['CS']
+        }
+        param_grid = {**method_params, **hyper_LR}
+        quantifier = DistributionMatching(lr)

    # experimental
    elif method in ['KDEy-DMkld']:
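For context, this is how one of these "dedicated DM" configurations is explored during model selection. The sketch below is illustrative only: the GridSearchQ call, the validation protocol, the dataset, and the classifier grid standing in for hyper_LR are assumptions about the surrounding experiment scripts (not shown in this hunk); the nbins/divergence grid mirrors the values above.

# illustrative sketch, assuming the GridSearchQ/UPP API of QuaPy 0.1.7
import numpy as np
import quapy as qp
from sklearn.linear_model import LogisticRegression
from quapy.method.aggregative import DistributionMatching
from quapy.protocol import UPP

qp.environ['SAMPLE_SIZE'] = 100
data = qp.datasets.fetch_twitter('gasp', min_df=5, pickle=True)   # illustrative dataset
train, val = data.training.split_stratified(train_prop=0.6)

param_grid = {
    'nbins': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 64],
    'val_split': [10],
    'divergence': ['CS'],                    # 'topsoe' and 'HD' give DM-T and DM-HD
    'classifier__C': np.logspace(-3, 3, 7),  # hypothetical stand-in for hyper_LR
}

modsel = qp.model_selection.GridSearchQ(
    DistributionMatching(LogisticRegression()),
    param_grid,
    protocol=UPP(val, repeats=25),   # validation protocol; an assumption
    error=qp.error.mae,
    refit=False,
    verbose=True,
).fit(train)

quantifier = modsel.best_model()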
@@ -95,7 +123,7 @@ def new_method(method, **lr_kwargs):
        # can be stored. This means that the reference distribution is V and not T. Then I have found that an
        # f-divergence is defined as D(p||q) = \int_{R^n} q(x) f(p(x)/q(x)) dx = E_{x~q}[f(p(x)/q(x))], so if I am sampling
        # V then I am computing D(T||V) (and not D(V||T) as I thought).
-        method_params = {'bandwidth': np.linspace(0.01, 0.3, 30)}
+        method_params = {'bandwidth': np.linspace(0.01, 0.2, 20)}
        param_grid = {**method_params, **hyper_LR}
        quantifier = KDEy(lr, target='min_divergence', divergence='HD', montecarlo_trials=5000, val_split=10)
    elif method == 'DM-HD':
@@ -21,7 +21,7 @@ import dirichlet

class DIRy(AggregativeProbabilisticQuantifier):

-    MAXITER = 10000
+    MAXITER = 100000

    def __init__(self, classifier: BaseEstimator, val_split=0.4, n_jobs=None, target='max_likelihood'):
        self.classifier = classifier
@@ -38,7 +38,12 @@ class DIRy(AggregativeProbabilisticQuantifier):
            data, self.classifier, val_split, probabilistic=True, fit_classifier=fit_classifier, n_jobs=self.n_jobs
        )

-        self.val_parameters = [dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) for cat in range(data.n_classes)]
+        self.val_parameters = []
+        for cat in range(data.n_classes):
+            dir_i = dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER)
+            self.val_parameters.append(dir_i)
+            # print(cat)
+        # self.val_parameters = [dirichlet.mle(posteriors[y == cat], maxiter=DIRy.MAXITER) for cat in range(data.n_classes)]

        return self
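The unrolled loop above fits one Dirichlet distribution per class over the validation posteriors, exactly as the replaced list comprehension did. A small self-contained sketch of that per-class step follows; the data is synthetic and only dirichlet.mle and the raised MAXITER come from the diff.

# toy sketch with synthetic data; dirichlet.mle is the same MLE routine DIRy calls per class
import numpy as np
import dirichlet

rng = np.random.default_rng(0)
# fake posterior probabilities for the validation documents of one class (rows sum to 1)
posteriors_cat = rng.dirichlet(alpha=[8, 1, 1], size=500)

# maximum-likelihood Dirichlet parameters for that class; MAXITER is 100000 after this commit
alpha_cat = dirichlet.mle(posteriors_cat, maxiter=100000)
print(alpha_cat)  # one alpha vector like this is stored per class in self.val_parameters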
@@ -24,6 +24,7 @@ if __name__ == '__main__':
    for method in METHODS:

        print('Init method', method)
+        if method == 'EMQ-C': continue

        global_result_path = f'{result_dir}/{method}'
@@ -36,7 +37,7 @@ if __name__ == '__main__':
        # this variable controls that the mod sel has already been done, and skip this otherwise
        semeval_trained = False

-        for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST[::-1]:
+        for dataset in qp.datasets.TWITTER_SENTIMENT_DATASETS_TEST:
            print('init', dataset)

            local_result_path = global_result_path + '_' + dataset
@@ -69,8 +69,8 @@ if __name__ == '__main__':

                quantifier = modsel.best_model()
            except:
-                print('something went wrong... reporting CC')
-                quantifier = qp.method.aggregative.CC(LR()).fit(train)
+                print('something went wrong... trying to fit the default model')
+                quantifier.fit(train)

            protocol = UPP(test, repeats=n_bags_test)
            report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
@@ -1,5 +1,8 @@
import pickle
import os

+from sklearn.linear_model import LogisticRegression
+
from distribution_matching.commons import METHODS, new_method, show_results

import quapy as qp
@@ -22,9 +25,9 @@ if __name__ == '__main__':
    os.makedirs(result_dir, exist_ok=True)

    for method in METHODS:
-        if method == 'HDy-OvA': continue
-        if method == 'DIR': continue
-        if method != 'KDEy-ML': continue
+        #if method == 'HDy-OvA': continue
+        #if method == 'DIR': continue
+        # if method != 'EMQ-C': continue

        print('Init method', method)

@@ -71,12 +74,14 @@ if __name__ == '__main__':

                quantifier = modsel.best_model()
            except:
-                print('something went wrong... reporting CC')
-                quantifier = qp.method.aggregative.CC(LR()).fit(train)
+                print('something went wrong... trying to fit the default model')
+                quantifier.fit(train)
+                # quantifier = qp.method.aggregative.CC(LogisticRegression()).fit(train)

            protocol = UPP(test, repeats=n_bags_test)
            report = qp.evaluation.evaluation_report(quantifier, protocol, error_metrics=['mae', 'mrae', 'kld'],
-                                                     verbose=True)
+                                                     verbose=True, n_jobs=-1)
            report.to_csv(f'{local_result_path}.dataframe')
            means = report.mean()
            csv.write(f'{method}\t{data.name}\t{means["mae"]:.5f}\t{means["mrae"]:.5f}\t{means["kld"]:.5f}\n')
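Condensed, the evaluation step that both experiment scripts share looks as follows. This is a self-contained sketch: the dataset, sample size, and the PACC quantifier are illustrative assumptions, while the UPP protocol and the evaluation_report call (including the n_jobs=-1 flag this fork adds) mirror the lines above.

# self-contained sketch; only UPP, evaluation_report and its arguments are taken from the diff
import quapy as qp
from quapy.protocol import UPP
from quapy.method.aggregative import PACC
from sklearn.linear_model import LogisticRegression

qp.environ['SAMPLE_SIZE'] = 100                   # assumption; the scripts set their own value
data = qp.datasets.fetch_UCIDataset('yeast')      # illustrative dataset
quantifier = PACC(LogisticRegression()).fit(data.training)

protocol = UPP(data.test, repeats=100)            # stand-in for n_bags_test
report = qp.evaluation.evaluation_report(quantifier, protocol,
                                         error_metrics=['mae', 'mrae', 'kld'],
                                         verbose=True, n_jobs=-1)
print(report.mean())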
@@ -9,8 +9,6 @@ Change Log 0.1.8
- qp.evaluation now runs in parallel <improve, remove or fix the ongoing error, put at the qp. level instead of
  qp.evaluation because I don't like the qp.evaluation.evaluate thing>

-- <fix> remove dependencies with LabelledCollection in the library.
-

Change Log 0.1.7
----------------
@@ -59,7 +59,7 @@ class RecalibratedProbabilisticClassifierBase(BaseEstimator, RecalibratedProbabilisticClassifier):
        elif isinstance(k, float):
            if not (0 < k < 1):
                raise ValueError('wrong value for val_split: the proportion of validation documents must be in (0,1)')
-            return self.fit_cv(X, y)
+            return self.fit_tr_val(X, y)

    def fit_cv(self, X, y):
        """
@@ -90,11 +90,32 @@ def TopsoeDistance(P, Q, epsilon=1e-20):

    :param P: real-valued array-like of shape `(k,)` representing a discrete distribution
    :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
+    :param epsilon: small value to smooth the distributions for numerical stability
    :return: float
    """
    return np.sum(P*np.log((2*P+epsilon)/(P+Q+epsilon)) + Q*np.log((2*Q+epsilon)/(P+Q+epsilon)))


+def CauchySchwarz(P, Q, epsilon=1e-20):
+    """
+    Cauchy-Schwarz divergence between two (discretized) distributions `P` and `Q`.
+    The Cauchy-Schwarz divergence for two discrete distributions of `k` bins is defined as:
+
+    .. math::
+        CS(P,Q) = -\\log \\frac{ \\sum_{i=1}^k p_i q_i }{
+            \\sqrt{ \\left( \\sum_{i=1}^k p^2_i \\right) \\left( \\sum_{i=1}^k q^2_i \\right) }
+        }
+
+    :param P: real-valued array-like of shape `(k,)` representing a discrete distribution
+    :param Q: real-valued array-like of shape `(k,)` representing a discrete distribution
+    :param epsilon: small value to smooth the distributions for numerical stability
+    :return: float
+    """
+    P += epsilon
+    Q += epsilon
+    return - np.log(sum(P * Q) / np.sqrt(sum(P ** 2) * sum(Q ** 2)))
+
+
def uniform_prevalence_sampling(n_classes, size=1):
    """
    Implements the `Kraemer algorithm <http://www.cs.cmu.edu/~nasmith/papers/smith+tromble.tr04.pdf>`_
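A quick numeric check of the newly added divergence; this is only a sketch, assuming the function ends up exposed as quapy.functional.CauchySchwarz (which is what the _get_divergence dispatch below suggests). Note that the epsilon smoothing is applied in place, so the call modifies its input arrays.

# minimal sanity check; values are illustrative
import numpy as np
import quapy.functional as F

P = np.array([0.7, 0.2, 0.1])
Q = np.array([0.5, 0.3, 0.2])

print(F.TopsoeDistance(P, Q))   # existing divergence, for comparison
print(F.CauchySchwarz(P, Q))    # > 0 for two different distributions
print(F.CauchySchwarz(P, P))    # ~ 0 when both arguments coincide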
@@ -276,3 +297,4 @@ def check_prevalence_vector(p, raise_exception=False, toleranze=1e-08):
        return False
    return True
+
@@ -604,10 +604,13 @@ class HDy(AggregativeProbabilisticQuantifier, BinaryQuantifier):

def _get_divergence(divergence: Union[str, Callable]):
    if isinstance(divergence, str):
-        if divergence=='HD':
+        divergence = divergence.lower()
+        if divergence=='hd':
            return F.HellingerDistance
        elif divergence=='topsoe':
            return F.TopsoeDistance
+        elif divergence.lower()=='cs':
+            return F.CauchySchwarz
        elif divergence.lower()=='l2':
            return lambda a,b: np.linalg.norm(a-b)
        elif divergence.lower()=='l1':
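As a sketch of the string dispatch after this change (assuming _get_divergence stays a module-level helper in quapy.method.aggregative; importing a private helper like this is for illustration only):

# illustration of the dispatch, not part of the commit
import quapy.functional as F
from quapy.method.aggregative import _get_divergence

assert _get_divergence('CS') is F.CauchySchwarz        # new Cauchy-Schwarz branch
assert _get_divergence('HD') is F.HellingerDistance    # matching is now case-insensitive
assert _get_divergence('topsoe') is F.TopsoeDistance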