"""Generators enumerating the classifiers, datasets, CAP methods and accuracy
measures used by the experiments, plus a helper to detect missing results."""

import os
from typing import Iterator, List, Optional, Tuple

import numpy as np
import quapy as qp
from quapy.data.base import LabelledCollection
from quapy.data.datasets import (
    TWITTER_SENTIMENT_DATASETS_TEST,
    UCI_MULTICLASS_DATASETS,
)
from quapy.method.aggregative import EMQ
from sklearn.linear_model import LogisticRegression

from quacc.dataset import DatasetProvider as DP
from quacc.error import macrof1_fn, vanilla_acc_fn
from quacc.models.base import ClassifierAccuracyPrediction
from quacc.models.baselines import ATC, DoC
from quacc.models.cont_table import CAPContingencyTable, ContTableTransferCAP, NaiveCAP
from quacc.utils.commons import get_results_path

# Shape of a loaded dataset, as declared by the original annotations of the
# dataset generators.
# NOTE(review): confirm against what DatasetProvider actually returns.
_DatasetSplits = Tuple[LabelledCollection, LabelledCollection, LabelledCollection]
# A dataset name paired with its data, or with None when only names are requested.
_NamedDataset = Tuple[str, Optional[_DatasetSplits]]


def gen_classifiers():
    """Yield ``(name, classifier)`` pairs for the classifiers under evaluation.

    Only the default ``LogisticRegression`` is currently enabled; the other
    candidates are kept below as disabled alternatives.
    """
    # Hyper-parameter grid for the disabled 'LR-opt' variant below. The exponent
    # range is -4..4: the previous (-4, -4) produced nine identical values of C.
    param_grid = {"C": np.logspace(-4, 4, 9), "class_weight": ["balanced", None]}

    yield "LR", LogisticRegression()
    # yield 'LR-opt', GridSearchCV(LogisticRegression(), param_grid, cv=5, n_jobs=-1)
    # yield 'NB', GaussianNB()
    # yield 'SVM(rbf)', SVC()
    # yield 'SVM(linear)', LinearSVC()


def gen_multi_datasets(only_names=False) -> Iterator[_NamedDataset]:
    """Yield ``(name, data)`` pairs for the multiclass datasets.

    Covers every entry of ``UCI_MULTICLASS_DATASETS`` except ``"wine-quality"``
    (in the sorted order produced by ``np.setdiff1d``), followed by
    ``"20news"`` and ``"T1B-LeQua2022"``.

    :param only_names: when true, no dataset is loaded and ``data`` is ``None``.
    """
    for dataset_name in np.setdiff1d(UCI_MULTICLASS_DATASETS, ["wine-quality"]):
        yield dataset_name, (None if only_names else DP.uci_multiclass(dataset_name))

    # yields the 20 newsgroups dataset
    yield "20news", (None if only_names else DP.news20())

    # yields the T1B@LeQua2022 (training) dataset
    yield "T1B-LeQua2022", (None if only_names else DP.t1b_lequa2022())


def gen_tweet_datasets(only_names=False) -> Iterator[_NamedDataset]:
    """Yield ``(name, data)`` pairs for ``TWITTER_SENTIMENT_DATASETS_TEST``.

    :param only_names: when true, no dataset is loaded and ``data`` is ``None``.
    """
    for dataset_name in TWITTER_SENTIMENT_DATASETS_TEST:
        yield dataset_name, (None if only_names else DP.twitter(dataset_name))


def gen_bin_datasets(only_names=False) -> Iterator[_NamedDataset]:
    """Yield ``(name, data)`` pairs for the binary datasets.

    Yields ``"imdb"`` first, then the RCV1 targets ``"CCAT"``, ``"GCAT"`` and
    ``"MCAT"``.

    :param only_names: when true, no dataset is loaded and ``data`` is ``None``.
    """
    _IMDB = [
        "imdb",
    ]
    _RCV1 = [
        "CCAT",
        "GCAT",
        "MCAT",
    ]

    for dn in _IMDB:
        yield dn, (None if only_names else DP.imdb())

    for dn in _RCV1:
        yield dn, (None if only_names else DP.rcv1(dn))


def gen_CAP(
    h, acc_fn, with_oracle=False
) -> Iterator[Tuple[str, ClassifierAccuracyPrediction]]:
    """Yield ``(name, method)`` pairs for the enabled CAP baselines.

    :param h: classifier handed to every method constructor.
    :param acc_fn: accuracy function handed to every method constructor.
    :param with_oracle: referenced only by the disabled 'SebCAP-SLD' variant;
        it has no effect on the methods currently yielded.

    ``DoC`` is built with ``qp.environ["SAMPLE_SIZE"]``, which is read when the
    generator reaches that entry.
    """
    ### CAP methods ###
    # yield 'SebCAP', SebastianiCAP(h, acc_fn, ACC)
    # yield 'SebCAP-SLD', SebastianiCAP(h, acc_fn, EMQ, predict_train_prev=not with_oracle)
    # yield 'SebCAP-KDE', SebastianiCAP(h, acc_fn, KDEyML)
    # yield 'SebCAPweight', SebastianiCAP(h, acc_fn, ACC, alpha=0)
    # yield 'PabCAP', PabloCAP(h, acc_fn, ACC)
    # yield 'PabCAP-SLD-median', PabloCAP(h, acc_fn, EMQ, aggr='median')

    ### baselines ###
    yield "ATC-MC", ATC(h, acc_fn, scoring_fn="maxconf")
    # yield 'ATC-NE', ATC(h, acc_fn, scoring_fn='neg_entropy')
    yield "DoC", DoC(h, acc_fn, sample_size=qp.environ["SAMPLE_SIZE"])


def gen_CAP_cont_table(h) -> Iterator[Tuple[str, CAPContingencyTable]]:
    """Yield ``(name, method)`` pairs for the contingency-table CAP methods.

    Every method is constructed with ``acc_fn=None``. Only ``NaiveCAP`` is
    currently enabled; the other candidates are kept below as disabled
    alternatives.

    :param h: classifier handed to every method constructor.
    """
    acc_fn = None
    yield "Naive", NaiveCAP(h, acc_fn)
    # yield "CT-PPS-EMQ", ContTableTransferCAP(h, acc_fn, EMQ(LogisticRegression()))
    # yield 'CT-PPS-KDE', ContTableTransferCAP(h, acc_fn, KDEyML(LogisticRegression(class_weight='balanced'), bandwidth=0.01))
    # yield 'CT-PPS-KDE05', ContTableTransferCAP(h, acc_fn, KDEyML(LogisticRegression(class_weight='balanced'), bandwidth=0.05))
    # yield 'QuAcc(EMQ)nxn-noX', QuAccNxN(h, acc_fn, EMQ(LogisticRegression()), add_posteriors=True, add_X=False)
    # yield 'QuAcc(EMQ)nxn', QuAccNxN(h, acc_fn, EMQ(LogisticRegression()))
    # yield 'QuAcc(EMQ)nxn-MC', QuAccNxN(h, acc_fn, EMQ(LogisticRegression()), add_maxconf=True)
    # yield 'QuAcc(EMQ)nxn-NE', QuAccNxN(h, acc_fn, EMQ(LogisticRegression()), add_negentropy=True)
    # yield 'QuAcc(EMQ)nxn-MIS', QuAccNxN(h, acc_fn, EMQ(LogisticRegression()), add_maxinfsoft=True)
    # yield 'QuAcc(EMQ)1xn2', QuAcc1xN2(h, acc_fn, EMQ(LogisticRegression()))
    # yield 'QuAcc(EMQ)1xn2', QuAcc1xN2(h, acc_fn, EMQ(LogisticRegression()))
    # yield 'CT-PPSh-EMQ', ContTableTransferCAP(h, acc_fn, EMQ(LogisticRegression()), reuse_h=True)
    # yield 'Equations-ACCh', NsquaredEquationsCAP(h, acc_fn, ACC, reuse_h=True)
    # yield 'Equations-ACC', NsquaredEquationsCAP(h, acc_fn, ACC)
    # yield 'Equations-SLD', NsquaredEquationsCAP(h, acc_fn, EMQ)


def get_method_names() -> List[str]:
    """Return the names of all enabled methods.

    Names come from ``gen_CAP`` first and ``gen_CAP_cont_table`` second. The
    generators are driven with a throwaway ``LogisticRegression`` and
    ``acc_fn=None`` because only the names are kept; the methods are still
    instantiated along the way.
    """
    mock_h = LogisticRegression()
    cap_names = [name for name, _ in gen_CAP(mock_h, None)]
    cont_table_names = [name for name, _ in gen_CAP_cont_table(mock_h)]
    return cap_names + cont_table_names


def gen_acc_measure():
    """Yield ``(name, function)`` pairs for the accuracy measures in use."""
    yield "vanilla_accuracy", vanilla_acc_fn
    yield "macro-F1", macrof1_fn


def any_missing(basedir, cls_name, dataset_name, method_name) -> bool:
    """Return True if a results file is absent for at least one accuracy measure.

    For each measure yielded by ``gen_acc_measure``, the path returned by
    ``get_results_path`` is checked with ``os.path.exists``; the check stops at
    the first path that does not exist.
    """
    return any(
        not os.path.exists(
            get_results_path(basedir, cls_name, acc_name, dataset_name, method_name)
        )
        for acc_name, _ in gen_acc_measure()
    )