From 8b9b8957f5113902056a4c72e38cd6df301d2021 Mon Sep 17 00:00:00 2001
From: Alejandro Moreo
Date: Fri, 8 Mar 2024 12:32:45 +0100
Subject: [PATCH] bugfix, some methods modified h

---
 ClassifierAccuracy/experiments.py       |  8 +++--
 ClassifierAccuracy/models_multiclass.py |  6 ++--
 ClassifierAccuracy/util/commons.py      | 46 +++++++++++++++++--------
 quapy/data/base.py                      |  2 +-
 4 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/ClassifierAccuracy/experiments.py b/ClassifierAccuracy/experiments.py
index b4e976d..238a5e2 100644
--- a/ClassifierAccuracy/experiments.py
+++ b/ClassifierAccuracy/experiments.py
@@ -1,8 +1,8 @@
 from ClassifierAccuracy.util.commons import *
 from ClassifierAccuracy.util.plotting import plot_diagonal
 
-PROBLEM = 'multiclass'
-ORACLE = True
+PROBLEM = 'binary'
+ORACLE = False
 
 basedir = PROBLEM+('-oracle' if ORACLE else '')
 
@@ -14,6 +14,10 @@ elif PROBLEM == 'multiclass':
     qp.environ['SAMPLE_SIZE'] = 250
     NUM_TEST = 1000
     gen_datasets = gen_multi_datasets
+elif PROBLEM == 'tweet':
+    qp.environ['SAMPLE_SIZE'] = 100
+    NUM_TEST = 1000
+    gen_datasets = gen_tweet_datasets
 
 
 for (cls_name, h), (dataset_name, (L, V, U)) in itertools.product(gen_classifiers(), gen_datasets()):
diff --git a/ClassifierAccuracy/models_multiclass.py b/ClassifierAccuracy/models_multiclass.py
index a3bc98d..09e4da5 100644
--- a/ClassifierAccuracy/models_multiclass.py
+++ b/ClassifierAccuracy/models_multiclass.py
@@ -65,7 +65,7 @@ class CAPContingencyTable(ClassifierAccuracyPrediction):
             the errors in quantification performance
         :return: float
         """
-        cont_table = self.predict_ct(X, oracle)
+        cont_table = self.predict_ct(X, oracle_prev)
         raw_acc = self.acc(cont_table)
         norm_acc = np.clip(raw_acc, 0, 1)
         return norm_acc
@@ -140,7 +140,7 @@ class ContTableTransferCAP(CAPContingencyTableQ):
     def fit(self, val: LabelledCollection):
         y_hat = self.h.predict(val.X)
         y_true = val.y
-        self.cont_table = confusion_matrix(y_true, y_pred=y_hat, labels=val.classes_, normalize='all')
+        self.cont_table = confusion_matrix(y_true=y_true, y_pred=y_hat, labels=val.classes_, normalize='all')
         self.train_prev = val.prevalence()
         self.quantifier_fit(val)
         return self
@@ -332,7 +332,7 @@ class PabloCAP(ClassifierAccuracyPrediction):
     def __init__(self, h, acc_fn, q_class, n_val_samples=100, aggr='mean'):
         self.h = h
         self.acc = acc_fn
-        self.q = q_class(h)
+        self.q = q_class(deepcopy(h))
         self.n_val_samples = n_val_samples
         self.aggr = aggr
         assert aggr in ['mean', 'median'], 'unknown aggregation function, use mean or median'
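
The deepcopy(h) fix in PabloCAP.__init__ addresses the bug named in the subject line ("some methods modified h"): a quantifier built as q_class(h) keeps a reference to the classifier and may refit it, silently mutating the very model whose accuracy is being predicted. A minimal sketch of the hazard; the Quantifier class below is a hypothetical stand-in, not QuaPy's API:

    from copy import deepcopy
    from sklearn.linear_model import LogisticRegression

    h = LogisticRegression().fit([[0], [1], [2], [3]], [0, 0, 1, 1])

    class Quantifier:                     # hypothetical stand-in for EMQ et al.
        def __init__(self, classifier):
            self.classifier = classifier  # stores a reference, not a copy

        def fit(self, X, y):
            self.classifier.fit(X, y)     # refitting mutates the shared object

    Quantifier(h).fit([[9], [8], [7], [6]], [0, 0, 1, 1])            # h is refit in place
    Quantifier(deepcopy(h)).fit([[9], [8], [7], [6]], [0, 0, 1, 1])  # h stays untouched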
diff --git a/ClassifierAccuracy/util/commons.py b/ClassifierAccuracy/util/commons.py
index 55f8aa6..67df860 100644
--- a/ClassifierAccuracy/util/commons.py
+++ b/ClassifierAccuracy/util/commons.py
@@ -17,7 +17,7 @@ from ClassifierAccuracy.util.tabular import Table
 
 from quapy.method.aggregative import EMQ, ACC, KDEyML
 from quapy.data import LabelledCollection
-from quapy.data.datasets import fetch_UCIMulticlassLabelledCollection, UCI_MULTICLASS_DATASETS, fetch_lequa2022
+from quapy.data.datasets import fetch_UCIMulticlassLabelledCollection, UCI_MULTICLASS_DATASETS, fetch_lequa2022, TWITTER_SENTIMENT_DATASETS_TEST
 from quapy.data.datasets import fetch_reviews
 
 
@@ -45,21 +45,37 @@ def gen_multi_datasets(only_names=False)-> [str,[LabelledCollection,LabelledColl
         yield dataset_name, split(dataset)
 
     # yields the 20 newsgroups dataset
-    train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
-    test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
-    tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True)
-    Xtr = tfidf.fit_transform(train.data)
-    Xte = tfidf.transform((test.data))
-    train = LabelledCollection(instances=Xtr, labels=train.target)
-    U = LabelledCollection(instances=Xte, labels=test.target)
-    T, V = train.split_stratified(train_prop=0.5, random_state=0)
-    yield "20news", (T, V, U)
+    if only_names:
+        yield "20news", None
+    else:
+        train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
+        test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))
+        tfidf = TfidfVectorizer(min_df=5, sublinear_tf=True)
+        Xtr = tfidf.fit_transform(train.data)
+        Xte = tfidf.transform((test.data))
+        train = LabelledCollection(instances=Xtr, labels=train.target)
+        U = LabelledCollection(instances=Xte, labels=test.target)
+        T, V = train.split_stratified(train_prop=0.5, random_state=0)
+        yield "20news", (T, V, U)
 
     # yields the T1B@LeQua2022 (training) dataset
-    train, _, _ = fetch_lequa2022(task='T1B')
-    yield "T1B-LeQua2022", split(train)
+    if only_names:
+        yield "T1B-LeQua2022", None
+    else:
+        train, _, _ = fetch_lequa2022(task='T1B')
+        yield "T1B-LeQua2022", split(train)
+
+
+def gen_tweet_datasets(only_names=False)-> [str,[LabelledCollection,LabelledCollection,LabelledCollection]]:
+    for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST:
+        if only_names:
+            yield dataset_name, None
+        else:
+            data = qp.datasets.fetch_twitter(dataset_name, min_df=3, pickle=True)
+            T, V = data.training.split_stratified(0.5, random_state=0)
+            U = data.test
+            yield dataset_name, (T, V, U)
 
 
 def gen_bin_datasets(only_names=False) -> [str,[LabelledCollection,LabelledCollection,LabelledCollection]]:
     if only_names:
@@ -91,7 +107,7 @@ def gen_CAP(h, acc_fn, with_oracle=False)->[str, ClassifierAccuracyPrediction]:
     #yield 'PabCAP', PabloCAP(h, acc_fn, ACC)
     yield 'PabCAP-SLD-median', PabloCAP(h, acc_fn, EMQ, aggr='median')
     yield 'ATC-MC', ATC(h, acc_fn, scoring_fn='maxconf')
-    #yield 'ATC-NE', ATC(h, acc_fn, scoring_fn='neg_entropy')
+    # yield 'ATC-NE', ATC(h, acc_fn, scoring_fn='neg_entropy')
     yield 'DoC', DoC(h, acc_fn, sample_size=qp.environ['SAMPLE_SIZE'])
 
 
@@ -99,12 +115,12 @@ def gen_CAP_cont_table(h)->[str,CAPContingencyTable]:
     acc_fn = None
     yield 'Naive', NaiveCAP(h, acc_fn)
     yield 'CT-PPS-EMQ', ContTableTransferCAP(h, acc_fn, EMQ(LogisticRegression()))
-    #yield 'CT-PPS-KDE', ContTableTransferCAP(h, acc_fn, KDEyML(LogisticRegression(class_weight='balanced'), bandwidth=0.01))
+    # yield 'CT-PPS-KDE', ContTableTransferCAP(h, acc_fn, KDEyML(LogisticRegression(class_weight='balanced'), bandwidth=0.01))
     yield 'CT-PPS-KDE05', ContTableTransferCAP(h, acc_fn, KDEyML(LogisticRegression(class_weight='balanced'), bandwidth=0.05))
     #yield 'QuAcc(EMQ)nxn-noX', QuAccNxN(h, acc_fn, EMQ(LogisticRegression()), add_posteriors=True, add_X=False)
     #yield 'QuAcc(EMQ)nxn', QuAccNxN(h, acc_fn, EMQ(LogisticRegression()))
     #yield 'QuAcc(EMQ)nxn-MC', QuAccNxN(h, acc_fn, EMQ(LogisticRegression()), add_maxconf=True)
-    yield 'QuAcc(EMQ)nxn-NE', QuAccNxN(h, acc_fn, EMQ(LogisticRegression()), add_negentropy=True)
+    # yield 'QuAcc(EMQ)nxn-NE', QuAccNxN(h, acc_fn, EMQ(LogisticRegression()), add_negentropy=True)
     #yield 'QuAcc(EMQ)nxn-MIS', QuAccNxN(h, acc_fn, EMQ(LogisticRegression()), add_maxinfsoft=True)
     #yield 'QuAcc(EMQ)1xn2', QuAcc1xN2(h, acc_fn, EMQ(LogisticRegression()))
     #yield 'QuAcc(EMQ)1xn2', QuAcc1xN2(h, acc_fn, EMQ(LogisticRegression()))
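
The only_names branches added above let callers enumerate dataset names without paying for downloads or TF-IDF vectorization; only the else branches load data. A sketch of the intended call pattern (run_experiment is a hypothetical driver, not part of the patch):

    # Cheap enumeration: yields (name, None) and fetches nothing.
    for dataset_name, _ in gen_multi_datasets(only_names=True):
        print(dataset_name)

    # Full run: the same generator yields (name, (T, V, U)) splits.
    for dataset_name, (T, V, U) in gen_multi_datasets(only_names=False):
        run_experiment(dataset_name, T, V, U)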
diff --git a/quapy/data/base.py b/quapy/data/base.py
index cb695be..55fcc83 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -123,7 +123,7 @@ class LabelledCollection:
         if len(prevs) == self.n_classes - 1:
             prevs = prevs + (1 - sum(prevs),)
         assert len(prevs) == self.n_classes, 'unexpected number of prevalences'
-        assert sum(prevs) == 1, f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
+        assert np.isclose(sum(prevs), 1), f'prevalences ({prevs}) wrong range (sum={sum(prevs)})'
 
         # Decide how many instances should be taken for each class in order to satisfy the requested prevalence
         # accurately, and the number of instances in the sample (exactly). If int(size * prevs[i]) (which is
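
The np.isclose change replaces an exact floating-point equality that rejects perfectly valid prevalence vectors. A quick illustration on IEEE-754 doubles:

    import numpy as np

    prevs = (0.1,) * 10                 # uniform prevalence over 10 classes
    print(sum(prevs))                   # 0.9999999999999999, not 1.0
    print(sum(prevs) == 1)              # False: exact test rejects a valid vector
    print(np.isclose(sum(prevs), 1))    # True: tolerance-based comparison passes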