forked from moreo/QuaPy
making everything work like in the wiki
parent 70da8f7925 · commit 854d759dc4
@@ -75,7 +75,7 @@ if __name__ == '__main__':
     nexp_methods = len(experimental_methods)

     # fill data table
-    table = Table(rows=datasets, cols=methods)
+    table = Table(benchmarks=datasets, methods=methods)
     for dataset in datasets:
         for method in methods:
             if method in experimental_methods:
@@ -94,7 +94,7 @@ if __name__ == '__main__':
        rowreplace={dataset: nice.get(dataset, dataset.upper()) for dataset in datasets}
        colreplace={method:'\side{' + nice.get(method, method.upper()) +'$^{' + nicerm(eval_name) + '}$} ' for method in methods}

-       tabular += table.latexTabular(rowreplace=rowreplace, colreplace=colreplace)
+       tabular += table.latexTabular(benchmark_replace=rowreplace, method_replace=colreplace)
        tabular += "\n\end{tabularx}"

        save_table(f'./tables/tab_results_{eval_name}.new.tex', tabular)
@@ -102,7 +102,7 @@ if __name__ == '__main__':
     # Tables ranks for AE and RAE (two tables)
     # ----------------------------------------------------
     # fill the data table
-    ranktable = Table(rows=datasets, cols=methods, missing='--')
+    ranktable = Table(benchmarks=datasets, methods=methods, missing='--')
     for dataset in datasets:
         for method in methods:
             ranktable.add(dataset, method, values=table.get(dataset, method, 'rank'))
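For reference, a standalone sketch of the two replacement dictionaries built above. The nice mapping, dataset and method names are invented, and nicerm(eval_name) from the script is stood in for by a plain dictionary lookup, so this only mirrors the pattern, not the project's actual tables:

nice = {'mae': 'AE', 'mrae': 'RAE', 'sld': 'SLD'}    # hypothetical pretty-name mapping
datasets = ['hp', 'kindle', 'imdb']                  # hypothetical benchmark names
methods = ['cc', 'acc', 'sld']                       # hypothetical method names
eval_name = 'mae'

# row (benchmark) names fall back to upper case when no nice name is registered
rowreplace = {dataset: nice.get(dataset, dataset.upper()) for dataset in datasets}
# column (method) headers are rotated LaTeX cells carrying the error measure as a superscript
colreplace = {method: r'\side{' + nice.get(method, method.upper()) + '$^{' + nice.get(eval_name, eval_name) + '}$} '
              for method in methods}

print(rowreplace)
print(colreplace)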
TODO.txt
@@ -1,20 +1,15 @@
 Documentation with sphinx
-Add quantification_report (akin to classification_report from sklearn) (?)
 Add NAE, NRAE
 Add "measures for evaluating ordinal"?
 Document methods with paper references
-The parallel training in svmperf seems not to work (not sure...)
 In binary quantification (hp, kindle, imdb) we used F1 in the minority class (which in kindle and hp happens to be the
 negative class). This is not covered in this new implementation, in which the binary case is not treated as such, but as
 an instance of single-label with 2 labels. Check
 Add classnames to LabelledCollection ?
-Check the overhead in OneVsAll for SVMperf-based (?)
-Add HDy to QuaNet? if so, wrap HDy into OneVsAll in case the dataset is not binary.
+Add automatic reindex of class labels in LabelledCollection (currently, class indexes should be ordered and with no gaps)
 Add datasets for topic.
 Clarify whether QuaNet is an aggregative method or not.
-Add medium swap method
 Explore the hyperparameter "number of bins" in HDy
-Implement HDy for single-label?
 Rename EMQ to SLD ?
 How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up
 to one always?
@@ -16,7 +16,8 @@ class Table:
         self.methods = np.asarray(methods)
         self.method_index = {col:j for j, col in enumerate(methods)}

-        self.map = {} # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
+        self.map = {}
+        # keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
         self._addmap('values', dtype=object)
         self.lower_is_better = lower_is_better
         self.ttest = ttest
@@ -28,6 +29,7 @@ class Table:
         self.missing = missing
         self.missing_str = missing_str
         self.color = color

         self.touch()

     @property
@@ -39,10 +41,10 @@ class Table:
         return len(self.methods)

     def touch(self):
-        self.modif = True
+        self._modif = True

     def update(self):
-        if self.modif:
+        if self._modif:
             self.compute()

     def _getfilled(self):
@@ -61,8 +63,6 @@ class Table:
             return
         m = self.map[map]
         f = func
-        if f is None:
-            return
         indexes = self._indexes() if map == 'fill' else self._getfilled()
         for i, j in indexes:
             m[i, j] = f(self.values[i, j])
@@ -95,7 +95,6 @@ class Table:
                 normval = 1 - normval
             self.map['color'][i, col_idx] = color_red2green_01(normval)

-
     def _run_ttest(self, row, col1, col2):
         mean1 = self.map['mean'][row, col1]
         std1 = self.map['std'][row, col1]
@@ -112,7 +111,7 @@ class Table:
         _, p_val = wilcoxon(values1, values2)
         return p_val

-    def _addttest(self):
+    def _add_statistical_test(self):
         if self.ttest is None:
             return
         self.some_similar = [False]*self.nmethods
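As background, the statistical-test machinery renamed above appears to rely on two standard tests: an independent t-test computed from the stored mean/std of each cell, and the Wilcoxon signed-rank test shown in the hunk. A self-contained sketch with invented error samples:

import numpy as np
from scipy.stats import ttest_ind_from_stats, wilcoxon

rng = np.random.default_rng(0)
values1 = rng.normal(loc=0.10, scale=0.02, size=30)   # e.g. per-sample errors of method 1
values2 = rng.normal(loc=0.12, scale=0.02, size=30)   # e.g. per-sample errors of method 2

# t-test computed only from each cell's stored mean, std and sample count
_, p_t = ttest_ind_from_stats(values1.mean(), values1.std(ddof=1), len(values1),
                              values2.mean(), values2.std(ddof=1), len(values2))

# non-parametric alternative on the paired raw values, as in the wilcoxon branch above
_, p_w = wilcoxon(values1, values2)

print(f't-test p={p_t:.4f}  wilcoxon p={p_w:.4f}')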
@@ -147,10 +146,10 @@ class Table:
         self._addmap('latex', dtype=object, func=None)
         self._addrank()
         self._addcolor()
-        self._addttest()
+        self._add_statistical_test()
         if self.add_average:
             self._addave()
-        self.modif = False
+        self._modif = False

     def _is_column_full(self, col):
         return all(self.map['fill'][:, self.method_index[col]])
@@ -189,11 +188,11 @@ class Table:
         else:
             return self.missing

-    def _coordinates(self, row, col):
-        assert row in self.benchmark_index, f'row {row} out of range'
-        assert col in self.method_index, f'col {col} out of range'
-        rid = self.benchmark_index[row]
-        cid = self.method_index[col]
+    def _coordinates(self, benchmark, method):
+        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
+        assert method in self.method_index, f'method {method} out of range'
+        rid = self.benchmark_index[benchmark]
+        cid = self.method_index[method]
         return rid, cid

     def get_average(self, method, attr='mean'):
@@ -283,7 +282,6 @@ class Table:
         t.compute()
         return t

-
     def dropMethods(self, methods):
         drop_index = [self.method_index[m] for m in methods]
         new_methods = np.delete(self.methods, drop_index)
@@ -295,8 +293,6 @@ class Table:
         self.touch()


-
-
 def pval_interpretation(p_val):
     if 0.005 >= p_val:
         return 'Diff'
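The modif to _modif rename just makes the dirty flag private; for context, a minimal standalone sketch of the touch/update pattern it supports (the class below is illustrative, not the project's Table):

class LazyTable:
    """Recomputes derived values only when the underlying data has changed."""

    def __init__(self):
        self.values = {}
        self.derived = {}
        self._modif = True          # private dirty flag, as in the renamed attribute

    def touch(self):
        self._modif = True          # mark as dirty; recomputation is deferred until needed

    def add(self, key, value):
        self.values[key] = value
        self.touch()

    def update(self):
        if self._modif:             # recompute only if something changed since the last compute
            self.derived = {k: v * 100 for k, v in self.values.items()}
            self._modif = False

t = LazyTable()
t.add('acc', 0.93)
t.update()
print(t.derived)    # {'acc': 93.0}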
@@ -32,7 +32,7 @@ class NeuralClassifierTrainer:
         super().__init__()

         assert isinstance(net, TextClassifierNet), f'net is not an instance of {TextClassifierNet.__name__}'
-        self.net = net
+        self.net = net.to(device)
         self.vocab_size = self.net.vocabulary_size
         self.trainer_hyperparams={
             'lr': lr,
@@ -50,10 +50,12 @@ class NeuralClassifierTrainer:
         self.classes_ = np.asarray([0, 1])

         print(f'[NeuralNetwork running on {device}]')

         os.makedirs(Path(checkpointpath).parent, exist_ok=True)

     def reset_net_params(self, vocab_size, n_classes):
         self.net = self.net.__class__(vocab_size, n_classes, **self.learner_hyperparams)
+        self.net = self.net.to(self.trainer_hyperparams['device'])
         self.net.xavier_uniform()

     def get_params(self):
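The net.to(device) change moves the model parameters onto the requested device as soon as the trainer is built (and again after a reset); a small self-contained illustration, where the linear model and the shapes are made up:

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

net = torch.nn.Linear(300, 2).to(device)     # move parameters to the target device
x = torch.randn(8, 300, device=device)       # inputs must live on the same device

logits = net(x)
# next(net.parameters()).device mirrors the trainer's device property
print(next(net.parameters()).device, logits.shape)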
@@ -65,7 +67,7 @@ class NeuralClassifierTrainer:
         for key, val in params.items():
             if key in trainer_hyperparams and key in learner_hyperparams:
                 raise ValueError(f'the use of parameter {key} is ambiguous since it can refer to '
-                                 f'a parameters of the Trainer or the learner {self.netclass.__name__}')
+                                 f'a parameters of the Trainer or the learner {self.net.__name__}')
             elif key not in trainer_hyperparams and key not in learner_hyperparams:
                 raise ValueError(f'parameter {key} is not valid')

@@ -81,17 +83,7 @@ class NeuralClassifierTrainer:
     def device(self):
         return next(self.net.parameters()).device

-    def __update_progress_bar(self, pbar):
-        pbar.set_description(f'[{self.net.__class__.__name__}] training epoch={self.current_epoch} '
-                             f'tr-loss={self.status["tr"]["loss"]:.5f} '
-                             f'tr-acc={100 * self.status["tr"]["acc"]:.2f}% '
-                             f'tr-macroF1={100 * self.status["tr"]["f1"]:.2f}% '
-                             f'patience={self.early_stop.patience}/{self.early_stop.PATIENCE_LIMIT} '
-                             f'val-loss={self.status["va"]["loss"]:.5f} '
-                             f'val-acc={100 * self.status["va"]["acc"]:.2f}% '
-                             f'macroF1={100 * self.status["va"]["f1"]:.2f}%')
-
-    def _train_epoch(self, data, status, pbar):
+    def _train_epoch(self, data, status, pbar, epoch):
         self.net.train()
         criterion = torch.nn.CrossEntropyLoss()
         losses, predictions, true_labels = [], [], []
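The per-epoch status fields are plain scikit-learn metrics; for reference, a standalone example with invented labels showing the accuracy and macro-averaged F1 used above:

from sklearn.metrics import accuracy_score, f1_score

true_labels = [0, 0, 1, 1, 1, 0]
predictions = [0, 1, 1, 1, 0, 0]

acc = accuracy_score(true_labels, predictions)
macro_f1 = f1_score(true_labels, predictions, average='macro')   # unweighted mean of per-class F1
print(f'acc={acc:.2f} macroF1={macro_f1:.2f}')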
@@ -109,9 +101,9 @@ class NeuralClassifierTrainer:
            true_labels.extend(yi.detach().cpu().numpy().tolist())
            status["acc"] = accuracy_score(true_labels, predictions)
            status["f1"] = f1_score(true_labels, predictions, average='macro')
-           self.__update_progress_bar(pbar)
+           self.__update_progress_bar(pbar, epoch)

-    def _test_epoch(self, data, status, pbar):
+    def _test_epoch(self, data, status, pbar, epoch):
         self.net.eval()
         criterion = torch.nn.CrossEntropyLoss()
         losses, predictions, true_labels = [], [], []
@@ -127,7 +119,17 @@ class NeuralClassifierTrainer:
            status["loss"] = np.mean(losses)
            status["acc"] = accuracy_score(true_labels, predictions)
            status["f1"] = f1_score(true_labels, predictions, average='macro')
-           self.__update_progress_bar(pbar)
+           self.__update_progress_bar(pbar, epoch)

+    def __update_progress_bar(self, pbar, epoch):
+        pbar.set_description(f'[{self.net.__class__.__name__}] training epoch={epoch} '
+                             f'tr-loss={self.status["tr"]["loss"]:.5f} '
+                             f'tr-acc={100 * self.status["tr"]["acc"]:.2f}% '
+                             f'tr-macroF1={100 * self.status["tr"]["f1"]:.2f}% '
+                             f'patience={self.early_stop.patience}/{self.early_stop.PATIENCE_LIMIT} '
+                             f'val-loss={self.status["va"]["loss"]:.5f} '
+                             f'val-acc={100 * self.status["va"]["acc"]:.2f}% '
+                             f'macroF1={100 * self.status["va"]["f1"]:.2f}%')
+
     def fit(self, instances, labels, val_split=0.3):
         train, val = LabelledCollection(instances, labels).split_stratified(1-val_split)
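The epoch counter is now a local passed down to the helpers rather than a self.current_epoch attribute; a minimal sketch of refreshing a tqdm description per epoch (the loss value is a dummy):

from tqdm import tqdm
import time

with tqdm(range(1, 6)) as pbar:
    for epoch in pbar:                      # the loop variable is a plain local, not an attribute
        tr_loss = 1.0 / epoch               # placeholder for the real training loss
        pbar.set_description(f'training epoch={epoch} tr-loss={tr_loss:.5f}')
        time.sleep(0.1)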
@@ -147,11 +149,11 @@ class NeuralClassifierTrainer:
         self.early_stop = EarlyStop(opt['patience'], lower_is_better=False)

         with tqdm(range(1, opt['epochs'] + 1)) as pbar:
-            for self.current_epoch in pbar:
-                self._train_epoch(train_generator, self.status['tr'], pbar)
-                self._test_epoch(valid_generator, self.status['va'], pbar)
+            for epoch in pbar:
+                self._train_epoch(train_generator, self.status['tr'], pbar, epoch)
+                self._test_epoch(valid_generator, self.status['va'], pbar, epoch)

-                self.early_stop(self.status['va']['f1'], self.current_epoch)
+                self.early_stop(self.status['va']['f1'], epoch)
                 if self.early_stop.IMPROVED:
                     torch.save(self.net.state_dict(), checkpoint)
                 elif self.early_stop.STOP:
@@ -161,7 +163,7 @@ class NeuralClassifierTrainer:
                     break

         print('performing one training pass over the validation set...')
-        self._train_epoch(valid_generator, self.status['tr'], pbar)
+        self._train_epoch(valid_generator, self.status['tr'], pbar, epoch=0)
         print('[done]')

         return self
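The early-stopping branch above checkpoints only the model weights; as a reminder, the usual save/restore pattern looks like this (the model and path are illustrative):

import torch

net = torch.nn.Linear(10, 2)
checkpoint = 'checkpoint.pt'                      # hypothetical path

torch.save(net.state_dict(), checkpoint)          # save weights when validation F1 improves
net.load_state_dict(torch.load(checkpoint))       # restore the best weights afterwards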
@@ -170,9 +172,6 @@ class NeuralClassifierTrainer:
         return np.argmax(self.predict_proba(instances), axis=-1)

     def predict_proba(self, instances):
-        return self.net.predict_proba(instances)
-
-    def predict_probability_positive(self, instances):
         self.net.eval()
         opt = self.trainer_hyperparams
         with torch.no_grad():
@@ -185,9 +184,10 @@ class NeuralClassifierTrainer:
     def transform(self, instances):
         self.net.eval()
         embeddings = []
+        opt = self.trainer_hyperparams
         with torch.no_grad():
             for xi in TorchDataset(instances).asDataloader(
-                    self.batch_size_test, shuffle=False, pad_length=self.padding_length, device=self.device):
+                    opt['batch_size_test'], shuffle=False, pad_length=opt['padding_length'], device=opt['device']):
                 embeddings.append(self.net.document_embedding(xi).detach().cpu().numpy())
         return np.concatenate(embeddings)

@@ -233,7 +233,7 @@ class TextClassifierNet(torch.nn.Module, metaclass=ABCMeta):

     def predict_proba(self, x):
         logits = self(x)
-        return torch.softmax(logits).detach().cpu().numpy()
+        return torch.softmax(logits, dim=1).detach().cpu().numpy()

     def xavier_uniform(self):
         for p in self.parameters():
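The dim=1 argument is required: torch.softmax must be told along which axis to normalise, and for (batch, n_classes) logits that is axis 1. A quick check:

import torch

logits = torch.randn(4, 3)                        # (batch, n_classes)
probs = torch.softmax(logits, dim=1)              # normalise across classes, per instance
print(probs.sum(dim=1))                           # each row sums to 1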
@@ -1,9 +1,10 @@
 import random
 import subprocess
 import tempfile
-from os import remove
+from os import remove, makedirs
 from os.path import join, exists
 from subprocess import PIPE, STDOUT
+import shutil

 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
@@ -22,7 +23,6 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         self.verbose = verbose
         self.loss = loss

-
     def set_params(self, **parameters):
         assert list(parameters.keys()) == ['C'], 'currently, only the C parameter is supported'
         self.C = parameters['C']
@@ -42,10 +42,15 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         local_random = random.Random()
         # this would allow to run parallel instances of predict
         random_code = '-'.join(str(local_random.randint(0,1000000)) for _ in range(5))
-        self.tmpdir = tempfile.TemporaryDirectory(suffix=random_code)
+        # self.tmpdir = tempfile.TemporaryDirectory(suffix=random_code)
+        # tmp dir are removed after the fit terminates in multiprocessing... moving to regular directories + __del__
+        self.tmpdir = '.svmperf-' + random_code
+        makedirs(self.tmpdir, exist_ok=True)

-        self.model = join(self.tmpdir.name, 'model-'+random_code)
-        traindat = join(self.tmpdir.name, f'train-{random_code}.dat')
+        # self.model = join(self.tmpdir.name, 'model-'+random_code)
+        # traindat = join(self.tmpdir.name, f'train-{random_code}.dat')
+        self.model = join(self.tmpdir, 'model-'+random_code)
+        traindat = join(self.tmpdir, f'train-{random_code}.dat')

         dump_svmlight_file(X, y, traindat, zero_based=False)

@@ -75,8 +80,10 @@ class SVMperf(BaseEstimator, ClassifierMixin):
         # in order to allow for parallel runs of predict, a random code is assigned
         local_random = random.Random()
         random_code = '-'.join(str(local_random.randint(0, 1000000)) for _ in range(5))
-        predictions_path = join(self.tmpdir.name, 'predictions'+random_code+'.dat')
-        testdat = join(self.tmpdir.name, 'test'+random_code+'.dat')
+        # predictions_path = join(self.tmpdir.name, 'predictions'+random_code+'.dat')
+        # testdat = join(self.tmpdir.name, 'test'+random_code+'.dat')
+        predictions_path = join(self.tmpdir, 'predictions' + random_code + '.dat')
+        testdat = join(self.tmpdir, 'test' + random_code + '.dat')
         dump_svmlight_file(X, y, testdat, zero_based=False)

         cmd = ' '.join([self.svmperf_classify, testdat, self.model, predictions_path])
@@ -93,4 +100,7 @@ class SVMperf(BaseEstimator, ClassifierMixin):

         return scores

+    def __del__(self):
+        if hasattr(self, 'tmpdir'):
+            shutil.rmtree(self.tmpdir)

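The commented-out TemporaryDirectory and the new __del__ follow the note in the diff: under multiprocessing the temporary directory can be removed once fit returns, so the code switches to a plain directory that is deleted explicitly when the owner object dies. A standalone sketch of that pattern, with invented names:

import shutil
from os import makedirs
from os.path import join

class ScratchSpace:
    def __init__(self, code='1234'):
        self.tmpdir = '.scratch-' + code          # a regular directory, not a TemporaryDirectory
        makedirs(self.tmpdir, exist_ok=True)
        self.model = join(self.tmpdir, 'model')

    def __del__(self):
        # clean up explicitly when the owner object is finally destroyed
        if hasattr(self, 'tmpdir'):
            shutil.rmtree(self.tmpdir, ignore_errors=True)

s = ScratchSpace()
print(s.model)
del s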
@@ -149,7 +149,7 @@ class IndexTransformer:

     def index(self, documents):
         vocab = self.vocabulary_.copy()
-        return [[vocab.getscore(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]
+        return [[vocab.get(word, self.unk) for word in self.analyzer(doc)] for doc in tqdm(documents, 'indexing')]

     def fit_transform(self, X, n_jobs=-1):
         return self.fit(X).transform(X, n_jobs=n_jobs)
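The fix swaps vocab.getscore for the standard dict.get lookup, so out-of-vocabulary words fall back to the unk index (assuming vocabulary_ behaves like a plain dict). For example:

vocab = {'the': 0, 'cat': 1, 'sat': 2}      # hypothetical word-to-index vocabulary
UNK = len(vocab)                            # index reserved for unknown words

tokens = ['the', 'dog', 'sat']
indexed = [vocab.get(word, UNK) for word in tokens]
print(indexed)                              # [0, 3, 2]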
@@ -580,7 +580,9 @@ class OneVsAll(AggregativeQuantifier):

     def __parallel(self, func, *args, **kwargs):
         return np.asarray(
-            Parallel(n_jobs=self.n_jobs)(
+            # some quantifiers (in particular, ELM-based ones) cannot be run with multiprocess, since the temp dir they
+            # create during the fit will be removed and be no longer available for the predict...
+            Parallel(n_jobs=self.n_jobs, backend='threading')(
                 delayed(func)(c, *args, **kwargs) for c in self.classes
             )
         )
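The backend='threading' switch keeps all workers inside the same process, which is what the new comment is about: files written during fit stay visible at predict time. A minimal joblib illustration with a placeholder worker function:

from joblib import Parallel, delayed
import numpy as np

def quantify_one_class(c):
    # placeholder for the per-class work done by the delayed quantifier call
    return c * 0.1

classes = [0, 1, 2, 3]
# threads share the process, so per-object state (temp dirs, open files) is not lost between calls
results = np.asarray(Parallel(n_jobs=2, backend='threading')(
    delayed(quantify_one_class)(c) for c in classes
))
print(results)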
@@ -1,5 +1,7 @@
 from copy import deepcopy
 from typing import Union

+from sklearn.metrics import f1_score, make_scorer, accuracy_score
 from tqdm import tqdm

 import numpy as np
@@ -267,7 +269,11 @@ def _instantiate_ensemble(learner, base_quantifier_class, param_grid, optim, par
     if optim is None:
         base_quantifier = base_quantifier_class(learner)
     elif optim in qp.error.CLASSIFICATION_ERROR:
-        learner = GridSearchCV(learner, param_grid)
+        if optim == qp.error.f1e:
+            scoring = make_scorer(f1_score)
+        elif optim == qp.error.acce:
+            scoring = make_scorer(accuracy_score)
+        learner = GridSearchCV(learner, param_grid, scoring=scoring)
         base_quantifier = base_quantifier_class(learner)
     else:
         base_quantifier = GridSearchQ(base_quantifier_class(learner),
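With this change the classifier-level grid search optimises the requested metric (F1 for f1e, accuracy for acce) instead of the estimator's default score. A self-contained scikit-learn sketch of the same make_scorer + GridSearchCV pattern on toy data:

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, random_state=0)

param_grid = {'C': [0.1, 1, 10]}
scoring = make_scorer(f1_score)                      # score candidates by F1 rather than accuracy
learner = GridSearchCV(LogisticRegression(max_iter=1000), param_grid, scoring=scoring)
learner.fit(X, y)
print(learner.best_params_, round(learner.best_score_, 3))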
@@ -79,13 +79,6 @@ class QuaNetTrainer(BaseQuantifier):
         self.tr_prev = data.prevalence()

         self.learner.fit(*classifier_data.Xy)
-        self.quantifiers = {
-            'cc': CC(self.learner).fit(classifier_data, fit_learner=False),
-            'acc': ACC(self.learner).fit(classifier_data, fit_learner=False, val_split=valid_data),
-            'pcc': PCC(self.learner).fit(classifier_data, fit_learner=False),
-            'pacc': PACC(self.learner).fit(classifier_data, fit_learner=False, val_split=valid_data),
-            'emq': EMQ(self.learner).fit(classifier_data, fit_learner=False),
-        }

         # compute the posterior probabilities of the instances
         valid_posteriors = self.learner.predict_proba(valid_data.instances)
@@ -95,6 +88,14 @@ class QuaNetTrainer(BaseQuantifier):
         valid_data.instances = self.learner.transform(valid_data.instances)
         train_data.instances = self.learner.transform(train_data.instances)

+        self.quantifiers = {
+            'cc': CC(self.learner).fit(classifier_data, fit_learner=False),
+            'acc': ACC(self.learner).fit(classifier_data, fit_learner=False, val_split=valid_data),
+            'pcc': PCC(self.learner).fit(classifier_data, fit_learner=False),
+            'pacc': PACC(self.learner).fit(classifier_data, fit_learner=False, val_split=valid_data),
+            'emq': EMQ(self.learner).fit(classifier_data, fit_learner=False),
+        }
+
         self.status = {
             'tr-loss': -1,
             'va-loss': -1,