diff --git a/TODO.txt b/TODO.txt
index 70b4c44..1c58be2 100644
--- a/TODO.txt
+++ b/TODO.txt
@@ -9,4 +9,18 @@ negative class). This is not covered in this new implementation, in which the bi
 an instance of single-label with 2 labels.
 Check Add classnames to LabelledCollection ?
 Check the overhead in OneVsAll for SVMperf-based (?)
-
+Add HDy to QuaNet? if so, wrap HDy into OneVsAll in case the dataset is not binary.
+Plots (one for binary -- the "diagonal", or for a specific class), another for the error as a function of drift.
+Add datasets for topic.
+Add other methods
+Clarify whether QuaNet is an aggregative method or not.
+Add datasets from Pérez-Gallego et al. 2017, 2019
+Add ensemble models from Pérez-Gallego et al. 2017, 2019
+Add plots like those in Pérez-Gallego et al. 2017 (error boxes)
+Add support for CV prediction in ACC and PACC for tpr, fpr
+Add medium swap method
+Explore the hyperparameter "number of bins" in HDy
+Implement HDy for single-label?
+Rename EMQ to SLD ?
+How many times is the system of equations for ACC and PACC not solved? How many times is it clipped? Do they sum up
+ to one always?
\ No newline at end of file
diff --git a/quapy/__init__.py b/quapy/__init__.py
index e324da1..ac09f24 100644
--- a/quapy/__init__.py
+++ b/quapy/__init__.py
@@ -1,8 +1,8 @@
-from . import data
+from . import error
 from .data import datasets
 from . import functional
 from . import method
-from . import error
+from . import data
 from . import evaluation
 
 from method.aggregative import isaggregative, isprobabilistic
diff --git a/quapy/classification/methods.py b/quapy/classification/methods.py
new file mode 100644
index 0000000..997b3bc
--- /dev/null
+++ b/quapy/classification/methods.py
@@ -0,0 +1,38 @@
+from sklearn.decomposition import TruncatedSVD
+from sklearn.linear_model import LogisticRegression
+
+
+class PCALR:
+
+    def __init__(self, n_components=300, C=10, class_weight=None):
+        self.n_components = n_components
+        self.learner = LogisticRegression(C=C, class_weight=class_weight, max_iter=1000)
+
+    def get_params(self):
+        params = {'n_components': self.n_components}
+        params.update(self.learner.get_params())
+        return params
+
+    def set_params(self, **params):
+        if 'n_components' in params:
+            self.n_components = params['n_components']
+            del params['n_components']
+        self.learner.set_params(**params)
+
+    def fit(self, documents, labels):
+        self.pca = TruncatedSVD(self.n_components)
+        embedded = self.pca.fit_transform(documents, labels)
+        self.learner.fit(embedded, labels)
+        self.classes_ = self.learner.classes_
+        return self
+
+    def predict(self, documents):
+        embedded = self.transform(documents)
+        return self.learner.predict(embedded)
+
+    def predict_proba(self, documents):
+        embedded = self.transform(documents)
+        return self.learner.predict_proba(embedded)
+
+    def transform(self, documents):
+        return self.pca.transform(documents)
diff --git a/quapy/data/base.py b/quapy/data/base.py
index 6879a75..c75804e 100644
--- a/quapy/data/base.py
+++ b/quapy/data/base.py
@@ -1,11 +1,9 @@
 import numpy as np
 from scipy.sparse import issparse
 from sklearn.model_selection import train_test_split
-from quapy.functional import artificial_prevalence_sampling
+from quapy.functional import artificial_prevalence_sampling, strprev
 from scipy.sparse import vstack
 
-from util import temp_seed
-
 
 class LabelledCollection:
@@ -130,6 +128,21 @@
         labels = np.concatenate([self.labels, other.labels])
         return LabelledCollection(join_instances, labels)
 
+    @property
+    def Xy(self):
+        return self.instances, self.labels
+
+    def stats(self):
+        ninstances = len(self)
+        instance_type = type(self.instances[0])
+        if instance_type == list:
+            nfeats = len(self.instances[0])
+        elif instance_type == np.ndarray:
+            nfeats = self.instances.shape[1]
+        else:
+            nfeats = '?'
+        print(f'#instances={ninstances}, type={instance_type}, features={nfeats}, n_classes={self.n_classes}, '
+              f'prevs={strprev(self.prevalence())}')
 
 
 class Dataset:
@@ -153,7 +166,7 @@ class Dataset:
         return self.training.binary
 
     @classmethod
-    def load(cls, train_path, test_path, loader_func:callable):
+    def load(cls, train_path, test_path, loader_func: callable):
         training = LabelledCollection.load(train_path, loader_func)
         test = LabelledCollection.load(test_path, loader_func)
         return Dataset(training, test)
diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index e2def20..0ef233e 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -2,13 +2,15 @@ import zipfile
 from util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
 import os
 from os.path import join
-from data.base import Dataset
-from data.reader import from_text, from_sparse
+from data.base import Dataset, LabelledCollection
+from data.reader import *
 from data.preprocessing import text2tfidf, reduce_columns
+import pandas as pd
 
 
 REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
-TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders', 'semeval13', 'semeval14', 'semeval15', 'semeval16',
+TWITTER_SENTIMENT_DATASETS = ['gasp', 'hcr', 'omd', 'sanders',
+                              'semeval13', 'semeval14', 'semeval15', 'semeval16',
                               'sst', 'wa', 'wb']
@@ -117,4 +119,88 @@ def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_hom
     return data
 
 
+UCI_DATASETS = ['acute.a', 'acute.b',
+                'balance.1', 'balance.2', 'balance.3']
+
+
+def fetch_UCIDataset(dataset_name, data_home=None, verbose=False):
+
+    assert dataset_name in UCI_DATASETS, \
+        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
+        f'Valid ones are {UCI_DATASETS}'
+    if data_home is None:
+        data_home = get_quapy_home()
+
+    identifier_map = {
+        'acute.a': 'acute',
+        'acute.b': 'acute',
+        'balance.1': 'balance-scale',
+        'balance.2': 'balance-scale',
+        'balance.3': 'balance-scale',
+    }
+
+    dataset_fullname = {
+        'acute.a': 'Acute Inflammations (urinary bladder)',
+        'acute.b': 'Acute Inflammations (renal pelvis)',
+        'balance.1': 'Balance Scale Weight & Distance Database (left)',
+        'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
+        'balance.3': 'Balance Scale Weight & Distance Database (right)',
+    }
+
+    data_folder = {
+        'acute': 'diagnosis',
+        'balance-scale': 'balance-scale',
+    }
+
+    identifier = identifier_map[dataset_name]
+    URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
+    data_path = join(data_home, 'uci_datasets', identifier)
+    download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.data', f'{data_path}/{identifier}.data')
+    download_file_if_not_exists(f'{URL}/{data_folder[identifier]}.names', f'{data_path}/{identifier}.names')
+
+    if verbose:
+        print(open(f'{data_path}/{identifier}.names', 'rt').read())
+
+    print(f'Loading {dataset_name} ({dataset_fullname[dataset_name]})')
+    if identifier == 'acute':
+        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, encoding='utf-16', sep='\t')
+        if dataset_name == 'acute.a':
+            y = binarize(df[6], pos_class='yes')
+        elif dataset_name == 'acute.b':
+            y = binarize(df[7], pos_class='yes')
+
+        mintemp, maxtemp = 35, 42
+        df[0] = df[0].apply(lambda x:(float(x.replace(',','.'))-mintemp)/(maxtemp-mintemp)).astype(float, copy=False)
+        [df_replace(df, col) for col in range(1, 6)]
+        X = df.loc[:, 0:5].values
+
+    if identifier == 'balance-scale':
+        df = pd.read_csv(f'{data_path}/{identifier}.data', header=None, sep=',')
+        if dataset_name == 'balance.1':
+            y = binarize(df[0], pos_class='L')
+        elif dataset_name == 'balance.2':
+            y = binarize(df[0], pos_class='B')
+        elif dataset_name == 'balance.3':
+            y = binarize(df[0], pos_class='R')
+        X = df.loc[:, 1:].astype(float).values
+
+    data = LabelledCollection(X, y)
+    data.stats()
+    #print(df)
+    #print(df.loc[:, 0:5].values)
+    #print(y)
+
+# X = __read_csv(f'{data_path}/{identifier}.data', separator='\t')
+# print(X)
+
+    #X, y = from_csv(f'{data_path}/{dataset_name}.data')
+    #y, classnames = reindex_labels(y)
+
+
+#def __read_csv(path, separator=','):
+#    x = []
+#    for instance in tqdm(open(path, 'rt', encoding='utf-16').readlines(), desc=f'reading {path}'):
+#        x.append(instance.strip().split(separator))
+#    return x
+
+
+def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
+    df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
\ No newline at end of file
diff --git a/quapy/data/reader.py b/quapy/data/reader.py
index 84550c6..7597fd0 100644
--- a/quapy/data/reader.py
+++ b/quapy/data/reader.py
@@ -1,6 +1,7 @@
 import numpy as np
 from scipy.sparse import dok_matrix
 from tqdm import tqdm
+import pandas as pd
 
 
 def from_text(path):
@@ -55,3 +56,42 @@ def from_sparse(path):
     y = np.asarray(all_labels) + 1
     return X, y
+
+def from_csv(path):
+    """
+    Reads a csv file in which columns are separated by ','.
+    File format
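
Below is a minimal usage sketch for the new PCALR wrapper added in quapy/classification/methods.py. It assumes quapy is importable and uses synthetic arrays in place of real document vectors; the shapes and hyperparameter values (n_components=50, C=1) are illustrative only, not taken from the patch. Note that fetch_UCIDataset, as committed, builds a LabelledCollection and prints its stats but does not yet return it, so the sketch exercises PCALR directly.

    import numpy as np
    from quapy.classification.methods import PCALR

    # synthetic stand-in for document vectors: 200 instances, 500 features
    rng = np.random.RandomState(0)
    X = rng.rand(200, 500)
    y = rng.randint(0, 2, size=200)

    # fit() applies TruncatedSVD to reduce 500 features to n_components,
    # then trains the LogisticRegression on the embedded instances
    clf = PCALR(n_components=50, C=1).fit(X, y)
    posteriors = clf.predict_proba(X)  # shape (200, 2), columns ordered as clf.classes_
    print(clf.get_params()['n_components'], posteriors.shape)

Since PCALR exposes fit/predict/predict_proba plus get_params/set_params, it should drop in wherever a scikit-learn-style learner is expected, e.g., as the base classifier of an aggregative quantification method.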