diff --git a/quapy/data/datasets.py b/quapy/data/datasets.py
index 63a179e..d489b39 100644
--- a/quapy/data/datasets.py
+++ b/quapy/data/datasets.py
@@ -3,6 +3,7 @@ def warn(*args, **kwargs):
 import warnings
 warnings.warn = warn
 import os
+from contextlib import contextmanager
 import zipfile
 from os.path import join
 import pandas as pd
@@ -269,323 +270,298 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=Fals
     :param dataset_name: a dataset name
     :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
         ~/quay_data/ directory)
-    :param test_split: proportion of documents to be included in the test set. The rest conforms the training set
     :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
     :return: a :class:`quapy.data.base.LabelledCollection` instance
     """
-
-    assert dataset_name in UCI_BINARY_DATASETS, \
-        f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
-        f'Valid ones are {UCI_BINARY_DATASETS}'
+    assert dataset_name in UCI_BINARY_DATASETS, (
+        f"Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. "
+        f"Valid ones are {UCI_BINARY_DATASETS}"
+    )
     if data_home is None:
         data_home = get_quapy_home()
 
-    dataset_fullname = {
-        'acute.a': 'Acute Inflammations (urinary bladder)',
-        'acute.b': 'Acute Inflammations (renal pelvis)',
-        'balance.1': 'Balance Scale Weight & Distance Database (left)',
-        'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
-        'balance.3': 'Balance Scale Weight & Distance Database (right)',
-        'breast-cancer': 'Breast Cancer Wisconsin (Original)',
-        'cmc.1': 'Contraceptive Method Choice (no use)',
-        'cmc.2': 'Contraceptive Method Choice (long term)',
-        'cmc.3': 'Contraceptive Method Choice (short term)',
-        'ctg.1': 'Cardiotocography Data Set (normal)',
-        'ctg.2': 'Cardiotocography Data Set (suspect)',
-        'ctg.3': 'Cardiotocography Data Set (pathologic)',
-        'german': 'Statlog German Credit Data',
-        'haberman': "Haberman's Survival Data",
-        'ionosphere': 'Johns Hopkins University Ionosphere DB',
-        'iris.1': 'Iris Plants Database(x)',
-        'iris.2': 'Iris Plants Database(versicolour)',
-        'iris.3': 'Iris Plants Database(virginica)',
-        'mammographic': 'Mammographic Mass',
-        'pageblocks.5': 'Page Blocks Classification (5)',
-        'semeion': 'Semeion Handwritten Digit (8)',
-        'sonar': 'Sonar, Mines vs. Rocks',
-        'spambase': 'Spambase Data Set',
-        'spectf': 'SPECTF Heart Data',
-        'tictactoe': 'Tic-Tac-Toe Endgame Database',
-        'transfusion': 'Blood Transfusion Service Center Data Set',
-        'wdbc': 'Wisconsin Diagnostic Breast Cancer',
-        'wine.1': 'Wine Recognition Data (1)',
-        'wine.2': 'Wine Recognition Data (2)',
-        'wine.3': 'Wine Recognition Data (3)',
-        'wine-q-red': 'Wine Quality Red (6-10)',
-        'wine-q-white': 'Wine Quality White (6-10)',
-        'yeast': 'Yeast',
+    # mapping between dataset names and UCI API ids
+    identifiers = {
+        "acute.a": 184,
+        "acute.b": 184,
+        "balance.1": 12,
+        "balance.2": 12,
+        "balance.3": 12,
+        "breast-cancer": 15,
+        "cmc.1": 30,
+        "cmc.2": 30,
+        "cmc.3": 30,
+        # "ctg.1": ,  # not python importable
+        # "ctg.2": ,  # not python importable
+        # "ctg.3": ,  # not python importable
+        # "german": ,  # not python importable
+        "haberman": 43,
+        "ionosphere": 52,
+        "iris.1": 53,
+        "iris.2": 53,
+        "iris.3": 53,
+        "mammographic": 161,
+        "pageblocks.5": 78,
+        # "semeion": ,  # not python importable
+        "sonar": 151,
+        "spambase": 94,
+        "spectf": 96,
+        "tictactoe": 101,
+        "transfusion": 176,
+        "wdbc": 17,
+        "wine.1": 109,
+        "wine.2": 109,
+        "wine.3": 109,
+        "wine-q-red": 186,
+        "wine-q-white": 186,
+        "yeast": 110,
     }
 
-    # the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
-    # to download the raw dataset
-    identifier_map = {
-        'acute.a': 'acute',
-        'acute.b': 'acute',
-        'balance.1': 'balance-scale',
-        'balance.2': 'balance-scale',
-        'balance.3': 'balance-scale',
-        'breast-cancer': 'breast-cancer-wisconsin',
-        'cmc.1': 'cmc',
-        'cmc.2': 'cmc',
-        'cmc.3': 'cmc',
-        'ctg.1': '00193',
-        'ctg.2': '00193',
-        'ctg.3': '00193',
-        'german': 'statlog/german',
-        'haberman': 'haberman',
-        'ionosphere': 'ionosphere',
-        'iris.1': 'iris',
-        'iris.2': 'iris',
-        'iris.3': 'iris',
-        'mammographic': 'mammographic-masses',
-        'pageblocks.5': 'page-blocks',
-        'semeion': 'semeion',
-        'sonar': 'undocumented/connectionist-bench/sonar',
-        'spambase': 'spambase',
-        'spectf': 'spect',
-        'tictactoe': 'tic-tac-toe',
-        'transfusion': 'blood-transfusion',
-        'wdbc': 'breast-cancer-wisconsin',
-        'wine-q-red': 'wine-quality',
-        'wine-q-white': 'wine-quality',
-        'wine.1': 'wine',
-        'wine.2': 'wine',
-        'wine.3': 'wine',
-        'yeast': 'yeast',
+    # mapping between dataset names and dataset groups
+    groups = {
+        "acute.a": "acute",
+        "acute.b": "acute",
+        "balance.1": "balance",
+        "balance.2": "balance",
+        "balance.3": "balance",
+        "breast-cancer": "breast-cancer",
+        "cmc.1": "cmc",
+        "cmc.2": "cmc",
+        "cmc.3": "cmc",
+        "ctg.1": "ctg",
+        "ctg.2": "ctg",
+        "ctg.3": "ctg",
+        "german": "german",
+        "haberman": "haberman",
+        "ionosphere": "ionosphere",
+        "iris.1": "iris",
+        "iris.2": "iris",
+        "iris.3": "iris",
+        "mammographic": "mammographic",
+        "pageblocks.5": "pageblocks",
+        "semeion": "semeion",
+        "sonar": "sonar",
+        "spambase": "spambase",
+        "spectf": "spectf",
+        "tictactoe": "tictactoe",
+        "transfusion": "transfusion",
+        "wdbc": "wdbc",
+        "wine-q-red": "wine-quality",
+        "wine-q-white": "wine-quality",
+        "wine.1": "wine",
+        "wine.2": "wine",
+        "wine.3": "wine",
+        "yeast": "yeast",
     }
 
-    # the filename is the name of the file within the data_folder indexed by the identifier
-    file_name = {
-        'acute': 'diagnosis.data',
-        '00193': 'CTG.xls',
-        'statlog/german': 'german.data-numeric',
-        'mammographic-masses': 'mammographic_masses.data',
-        'page-blocks': 'page-blocks.data.Z',
-        'undocumented/connectionist-bench/sonar': 'sonar.all-data',
-        'spect': ['SPECTF.train', 'SPECTF.test'],
-        'blood-transfusion': 'transfusion.data',
-        'wine-quality': ['winequality-red.csv', 'winequality-white.csv'],
-        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data'
+    # mapping between dataset short names and full names
+    full_names = {
+        "acute.a": "Acute Inflammations (urinary bladder)",
+        "acute.b": "Acute Inflammations (renal pelvis)",
+        "balance.1": "Balance Scale Weight & Distance Database (left)",
+        "balance.2": "Balance Scale Weight & Distance Database (balanced)",
+        "balance.3": "Balance Scale Weight & Distance Database (right)",
+        "breast-cancer": "Breast Cancer Wisconsin (Original)",
+        "cmc.1": "Contraceptive Method Choice (no use)",
+        "cmc.2": "Contraceptive Method Choice (long term)",
+        "cmc.3": "Contraceptive Method Choice (short term)",
+        "ctg.1": "Cardiotocography Data Set (normal)",
+        "ctg.2": "Cardiotocography Data Set (suspect)",
+        "ctg.3": "Cardiotocography Data Set (pathologic)",
+        "german": "Statlog German Credit Data",
+        "haberman": "Haberman's Survival Data",
+        "ionosphere": "Johns Hopkins University Ionosphere DB",
+        "iris.1": "Iris Plants Database(x)",
+        "iris.2": "Iris Plants Database(versicolour)",
+        "iris.3": "Iris Plants Database(virginica)",
+        "mammographic": "Mammographic Mass",
+        "pageblocks.5": "Page Blocks Classification (5)",
+        "semeion": "Semeion Handwritten Digit (8)",
+        "sonar": "Sonar, Mines vs. Rocks",
+        "spambase": "Spambase Data Set",
+        "spectf": "SPECTF Heart Data",
+        "tictactoe": "Tic-Tac-Toe Endgame Database",
+        "transfusion": "Blood Transfusion Service Center Data Set",
+        "wdbc": "Wisconsin Diagnostic Breast Cancer",
+        "wine.1": "Wine Recognition Data (1)",
+        "wine.2": "Wine Recognition Data (2)",
+        "wine.3": "Wine Recognition Data (3)",
+        "wine-q-red": "Wine Quality Red (6-10)",
+        "wine-q-white": "Wine Quality White (6-10)",
+        "yeast": "Yeast",
     }
 
-    # the filename containing the dataset description (if any)
-    desc_name = {
-        'acute': 'diagnosis.names',
-        '00193': None,
-        'statlog/german': 'german.doc',
-        'mammographic-masses': 'mammographic_masses.names',
-        'undocumented/connectionist-bench/sonar': 'sonar.names',
-        'spect': 'SPECTF.names',
-        'blood-transfusion': 'transfusion.names',
-        'wine-quality': 'winequality.names',
-        'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names'
+    # mapping between dataset names and values of positive class
+    pos_class = {
+        "acute.a": "yes",
+        "acute.b": "yes",
+        "balance.1": "L",
+        "balance.2": "B",
+        "balance.3": "R",
+        "breast-cancer": 2,
+        "cmc.1": 1,
+        "cmc.2": 2,
+        "cmc.3": 3,
+        "ctg.1": 1,  # 1==Normal
+        "ctg.2": 2,  # 2==Suspect
+        "ctg.3": 3,  # 3==Pathologic
+        "german": 1,
+        "haberman": 2,
+        "ionosphere": "b",
+        "iris.1": "Iris-setosa",  # 1==Setosa
+        "iris.2": "Iris-versicolor",  # 2==Versicolor
+        "iris.3": "Iris-virginica",  # 3==Virginica
+        "mammographic": 1,
+        "pageblocks.5": 5,  # 5==block "graphic"
+        "semeion": 1,
+        "sonar": "R",
+        "spambase": 1,
+        "spectf": 0,
+        "tictactoe": "negative",
+        "transfusion": 1,
+        "wdbc": "M",
+        "wine.1": 1,
+        "wine.2": 2,
+        "wine.3": 3,
+        "wine-q-red": 1,
+        "wine-q-white": 1,
+        "yeast": "NUC",
     }
 
-    identifier = identifier_map[dataset_name]
-    filename = file_name.get(identifier, f'{identifier}.data')
-    descfile = desc_name.get(identifier, f'{identifier}.names')
-    fullname = dataset_fullname[dataset_name]
-
-    URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
-    data_dir = join(data_home, 'uci_datasets', identifier)
-    if isinstance(filename, str):  # filename could be a list of files, in which case it will be processed later
-        data_path = join(data_dir, filename)
-        download_file_if_not_exists(f'{URL}/{filename}', data_path)
-
-    if descfile:
-        try:
-            download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
-            if verbose:
-                print(open(f'{data_dir}/{descfile}', 'rt').read())
-        except Exception:
-            print('could not read the description file')
-    elif verbose:
-        print('no file description available')
+    identifier = identifiers.get(dataset_name, None)
+    dataset_group = groups[dataset_name]
+    fullname = full_names[dataset_name]
 
     if verbose:
-        print(f'Loading {dataset_name} ({fullname})')
-    if identifier == 'acute':
-        df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
+        print(f"Loading UCI Binary {dataset_name} ({fullname})")
 
-        df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False)
-        [_df_replace(df, col) for col in range(1, 6)]
-        X = df.loc[:, 0:5].values
-        if dataset_name == 'acute.a':
-            y = binarize(df[6], pos_class='yes')
-        elif dataset_name == 'acute.b':
-            y = binarize(df[7], pos_class='yes')
+    file = join(data_home, "uci_datasets", dataset_group + ".pkl")
 
-    if identifier == 'balance-scale':
-        df = pd.read_csv(data_path, header=None, sep=',')
-        if dataset_name == 'balance.1':
-            y = binarize(df[0], pos_class='L')
-        elif dataset_name == 'balance.2':
-            y = binarize(df[0], pos_class='B')
-        elif dataset_name == 'balance.3':
-            y = binarize(df[0], pos_class='R')
-        X = df.loc[:, 1:].astype(float).values
+    @contextmanager
+    def download_tmp_file(url_group: str, filename: str):
+        """
+        Download a data file for a group of datasets temporarily.
+        When used as a context, the file is removed once the context exits.
 
-    if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer':
-        df = pd.read_csv(data_path, header=None, sep=',')
-        Xy = df.loc[:, 1:10]
-        Xy[Xy=='?']=np.nan
-        Xy = Xy.dropna(axis=0)
-        X = Xy.loc[:, 1:9]
-        X = X.astype(float).values
-        y = binarize(Xy[10], pos_class=2)
-
-    if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
-        df = pd.read_csv(data_path, header=None, sep=',')
-        X = df.loc[:, 2:32].astype(float).values
-        y = df[1].values
-        y = binarize(y, pos_class='M')
-
-    if identifier == 'cmc':
-        df = pd.read_csv(data_path, header=None, sep=',')
-        X = df.loc[:, 0:8].astype(float).values
-        y = df[9].astype(int).values
-        if dataset_name == 'cmc.1':
-            y = binarize(y, pos_class=1)
-        elif dataset_name == 'cmc.2':
-            y = binarize(y, pos_class=2)
-        elif dataset_name == 'cmc.3':
-            y = binarize(y, pos_class=3)
-
-    if identifier == '00193':
-        df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
-        df = df[list(range(1,24))]  # select columns numbered (number 23 is the target label)
-        # replaces the header with the first row
-        new_header = df.iloc[0]  # grab the first row for the header
-        df = df[1:]  # take the data less the header row
-        df.columns = new_header  # set the header row as the df header
-        X = df.iloc[:, 0:22].astype(float).values
-        y = df['NSP'].astype(int).values
-        if dataset_name == 'ctg.1':
-            y = binarize(y, pos_class=1)  # 1==Normal
-        elif dataset_name == 'ctg.2':
-            y = binarize(y, pos_class=2)  # 2==Suspect
-        elif dataset_name == 'ctg.3':
-            y = binarize(y, pos_class=3)  # 3==Pathologic
-
-    if identifier == 'statlog/german':
-        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
-        X = df.iloc[:, 0:24].astype(float).values
-        y = df[24].astype(int).values
-        y = binarize(y, pos_class=1)
-
-    if identifier == 'haberman':
-        df = pd.read_csv(data_path, header=None)
-        X = df.iloc[:, 0:3].astype(float).values
-        y = df[3].astype(int).values
-        y = binarize(y, pos_class=2)
-
-    if identifier == 'ionosphere':
-        df = pd.read_csv(data_path, header=None)
-        X = df.iloc[:, 0:34].astype(float).values
-        y = df[34].values
-        y = binarize(y, pos_class='b')
-
-    if identifier == 'iris':
-        df = pd.read_csv(data_path, header=None)
-        X = df.iloc[:, 0:4].astype(float).values
-        y = df[4].values
-        if dataset_name == 'iris.1':
-            y = binarize(y, pos_class='Iris-setosa')  # 1==Setosa
-        elif dataset_name == 'iris.2':
-            y = binarize(y, pos_class='Iris-versicolor')  # 2==Versicolor
-        elif dataset_name == 'iris.3':
-            y = binarize(y, pos_class='Iris-virginica')  # 3==Virginica
-
-    if identifier == 'mammographic-masses':
-        df = pd.read_csv(data_path, header=None, sep=',')
-        df[df == '?'] = np.nan
-        Xy = df.dropna(axis=0)
-        X = Xy.iloc[:, 0:5]
-        X = X.astype(float).values
-        y = binarize(Xy.iloc[:,5], pos_class=1)
-
-    if identifier == 'page-blocks':
-        data_path_ = data_path.replace('.Z', '')
-        if not os.path.exists(data_path_):
-            raise FileNotFoundError(f'Warning: file {data_path_} does not exist. If this is the first time you '
-                                    f'attempt to load this dataset, then you have to manually unzip the {data_path} '
-                                    f'and name the extracted file {data_path_} (unfortunately, neither zipfile, nor '
-                                    f'gzip can handle unix compressed files automatically -- there is a repo in GitHub '
-                                    f'https://github.com/umeat/unlzw where the problem seems to be solved anyway).')
-        df = pd.read_csv(data_path_, header=None, delim_whitespace=True)
-        X = df.iloc[:, 0:10].astype(float).values
-        y = df[10].values
-        y = binarize(y, pos_class=5)  # 5==block "graphic"
-
-    if identifier == 'semeion':
-        df = pd.read_csv(data_path, header=None, delim_whitespace=True )
-        X = df.iloc[:, 0:256].astype(float).values
-        y = df[263].values  # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
-        y = binarize(y, pos_class=1)
-
-    if identifier == 'undocumented/connectionist-bench/sonar':
-        df = pd.read_csv(data_path, header=None, sep=',')
-        X = df.iloc[:, 0:60].astype(float).values
-        y = df[60].values
-        y = binarize(y, pos_class='R')
-
-    if identifier == 'spambase':
-        df = pd.read_csv(data_path, header=None, sep=',')
-        X = df.iloc[:, 0:57].astype(float).values
-        y = df[57].values
-        y = binarize(y, pos_class=1)
-
-    if identifier == 'spect':
-        dfs = []
-        for file in filename:
-            data_path = join(data_dir, file)
-            download_file_if_not_exists(f'{URL}/{file}', data_path)
-            dfs.append(pd.read_csv(data_path, header=None, sep=','))
-        df = pd.concat(dfs)
-        X = df.iloc[:, 1:45].astype(float).values
-        y = df[0].values
-        y = binarize(y, pos_class=0)
-
-    if identifier == 'tic-tac-toe':
-        df = pd.read_csv(data_path, header=None, sep=',')
-        X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
-        y = df[9].values
-        y = binarize(y, pos_class='negative')
-
-    if identifier == 'blood-transfusion':
-        df = pd.read_csv(data_path, sep=',')
-        X = df.iloc[:, 0:4].astype(float).values
-        y = df.iloc[:, 4].values
-        y = binarize(y, pos_class=1)
-
-    if identifier == 'wine':
-        df = pd.read_csv(data_path, header=None, sep=',')
-        X = df.iloc[:, 1:14].astype(float).values
-        y = df[0].values
-        if dataset_name == 'wine.1':
-            y = binarize(y, pos_class=1)
-        elif dataset_name == 'wine.2':
-            y = binarize(y, pos_class=2)
-        elif dataset_name == 'wine.3':
-            y = binarize(y, pos_class=3)
-
-    if identifier == 'wine-quality':
-        filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
+        :param url_group: identifier of the dataset group in the URL
+        :param filename: name of the file to be downloaded
+        """
+        data_dir = join(data_home, "uci_datasets", "tmp")
+        os.makedirs(data_dir, exist_ok=True)
         data_path = join(data_dir, filename)
-        download_file_if_not_exists(f'{URL}/{filename}', data_path)
-        df = pd.read_csv(data_path, sep=';')
-        X = df.iloc[:, 0:11].astype(float).values
-        y = df.iloc[:, 11].values > 5
+        url = f"http://archive.ics.uci.edu/ml/machine-learning-databases/{url_group}/{filename}"
+        download_file_if_not_exists(url, data_path)
+        try:
+            yield data_path
+        finally:
+            os.remove(data_path)
 
-    if identifier == 'yeast':
-        df = pd.read_csv(data_path, header=None, delim_whitespace=True)
-        X = df.iloc[:, 1:9].astype(float).values
-        y = df.iloc[:, 9].values
-        y = binarize(y, pos_class='NUC')
+    def download(id: int | None, group: str) -> dict:
+        """
+        Download the data to be pickled for a dataset group. Use the `fetch_ucirepo` API when possible.
+
+        :param id: numeric identifier for the group; can be None
+        :param group: group name
+        :return: a dictionary with X and y as keys and, optionally, extra data.
+        """
+
+        # use the fetch_ucirepo API, when possible, to download data;
+        # fall back to direct download when needed
+        if group == "german":
+            with download_tmp_file("statlog/german", "german.data-numeric") as tmp:
+                df = pd.read_csv(tmp, header=None, delim_whitespace=True)
+                X, y = df.iloc[:, 0:24].astype(float).values, df[24].astype(int).values
+        elif group == "ctg":
+            with download_tmp_file("00193", "CTG.xls") as tmp:
+                df = pd.read_excel(tmp, sheet_name="Data", skipfooter=3)
+                df = df[list(range(1, 24))]  # select columns numbered (number 23 is the target label)
+                # replaces the header with the first row
+                new_header = df.iloc[0]  # grab the first row for the header
+                df = df[1:]  # take the data less the header row
+                df.columns = new_header  # set the header row as the df header
+                X = df.iloc[:, 0:21].astype(float).values  # column 21 is skipped, it is a class column
+                y = df["NSP"].astype(int).values
+        elif group == "semeion":
+            with download_tmp_file("semeion", "semeion.data") as tmp:
+                df = pd.read_csv(tmp, header=None, delim_whitespace=True)
+                X = df.iloc[:, 0:256].astype(float).values
+                y = df[263].values  # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
+        else:
+            df = fetch_ucirepo(id=id)
+            X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze()
+
+        # transform data when needed before returning (returned data will be pickled)
+        if group == "acute":
+            _array_replace(X)
+            data = {"X": X, "y": y}
+        elif group == "balance":
+            # features' order is reversed to match data retrieved via direct download
+            X = X[:, np.arange(X.shape[1])[::-1]]
+            data = {"X": X, "y": y}
+        elif group == "breast-cancer":
+            # remove rows with nan values
+            Xy = np.hstack([X, y[:, np.newaxis]])
+            nan_rows = np.isnan(Xy).sum(axis=-1) > 0
+            Xy = Xy[~nan_rows]
+            data = {"X": Xy[:, :-1], "y": Xy[:, -1]}
+        elif group == "mammographic":
+            # remove rows with nan values
+            Xy = np.hstack([X, y[:, np.newaxis]])
+            nan_rows = np.isnan(Xy).sum(axis=-1) > 0
+            Xy = Xy[~nan_rows]
+            data = {"X": Xy[:, :-1], "y": Xy[:, -1]}
+        elif group == "tictactoe":
+            _array_replace(X, repl={"o": 0, "b": 1, "x": 2})
+            data = {"X": X, "y": y}
+        elif group == "wine-quality":
+            # add color data to split the final datasets
+            color = df.data.original["color"].to_numpy()
+            data = {"X": X, "y": y, "color": color}
+        else:
+            data = {"X": X, "y": y}
+
+        return data
+
+    def binarize_data(name, data: dict) -> LabelledCollection:
+        """
+        Filter and transform data to extract a binary dataset.
+
+        :param name: name of the dataset
+        :param data: dictionary containing X and y fields, plus additional data when needed
+        :return: a :class:`quapy.data.base.LabelledCollection` with the extracted dataset
+        """
+        if name == "acute.a":
+            X, y = data["X"], data["y"][:, 0]
+            # X, y = Xy[:, :-2], Xy[:, -2]
+        elif name == "acute.b":
+            X, y = data["X"], data["y"][:, 1]
+            # X, y = Xy[:, :-2], Xy[:, -1]
+        elif name == "wine-q-red":
+            X, y, color = data["X"], data["y"], data["color"]
+            # X, y, color = Xy[:, :-2], Xy[:, -2], Xy[:, -1]
+            red_idx = color == "red"
+            X, y = X[red_idx, :], y[red_idx]
+            y = (y > 5).astype(int)
+        elif name == "wine-q-white":
+            X, y, color = data["X"], data["y"], data["color"]
+            # X, y, color = Xy[:, :-2], Xy[:, -2], Xy[:, -1]
+            white_idx = color == "white"
+            X, y = X[white_idx, :], y[white_idx]
+            y = (y > 5).astype(int)
+        else:
+            X, y = data["X"], data["y"]
+            # X, y = Xy[:, :-1], Xy[:, -1]
+
+        y = binarize(y, pos_class=pos_class[name])
+
+        return LabelledCollection(X, y)
+
+    data = pickled_resource(file, download, identifier, dataset_group)
+    data = binarize_data(dataset_name, data)
 
-    data = LabelledCollection(X, y)
     if verbose:
         data.stats()
+
     return data
 
@@ -776,6 +752,9 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
 
 def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
     df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
 
+def _array_replace(arr, repl={"yes": 1, "no": 0}):
+    for k, v in repl.items():
+        arr[arr == k] = v
 
 def fetch_lequa2022(task, data_home=None):
     """
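
Editorial note (not part of the patch): the sketch below illustrates how the refactored loader is expected to be called after this change. The call path (qp.datasets, split_stratified, prevalence) is taken from the public QuaPy API and is an assumption on my part, not something defined in this diff; the first call downloads the dataset group, caches it as a pickle under ~/quapy_data/uci_datasets/, and binarizes against the configured positive class.

    # minimal usage sketch, assuming quapy and its new ucimlrepo dependency are installed
    import quapy as qp

    # 'yeast' is cached as yeast.pkl on first use; later calls reuse the pickle
    data = qp.datasets.fetch_UCIBinaryLabelledCollection('yeast', verbose=True)
    print(data.classes_, data.prevalence())

    # the collection can then be split for training/evaluation as usual
    train, test = data.split_stratified(train_prop=0.7)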
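The new code path delegates caching to pickled_resource(file, download, identifier, dataset_group), a QuaPy utility that is not shown in this diff. The sketch below is only an illustration of the cache-or-generate pattern the patch relies on, under the assumption that the helper loads an existing pickle when present and otherwise calls the generator with the remaining arguments and stores its result; the function name carries a _sketch suffix to make clear it is not the real implementation.

    import os
    import pickle

    def pickled_resource_sketch(pickle_path, generation_func, *args):
        # illustrative only: return the cached object if the pickle exists,
        # otherwise generate it, persist it, and return it
        if pickle_path is None:
            return generation_func(*args)
        if os.path.exists(pickle_path):
            with open(pickle_path, 'rb') as fin:
                return pickle.load(fin)
        resource = generation_func(*args)
        os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
        with open(pickle_path, 'wb') as fout:
            pickle.dump(resource, fout, pickle.HIGHEST_PROTOCOL)
        return resource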
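Design note on download_tmp_file: wrapping the direct downloads (german, ctg, semeion) in a contextmanager guarantees the temporary file is deleted even if parsing raises, so nothing accumulates under uci_datasets/tmp. The standalone sketch below shows the same pattern with only standard-library calls; the URL and local path are placeholders, not real endpoints.

    import os
    import urllib.request
    from contextlib import contextmanager

    @contextmanager
    def tmp_download_sketch(url: str, local_path: str):
        # download the file, hand its path to the caller, always clean up
        urllib.request.urlretrieve(url, local_path)
        try:
            yield local_path
        finally:
            os.remove(local_path)

    # usage (placeholder URL):
    # with tmp_download_sketch("https://example.org/data.csv", "/tmp/data.csv") as path:
    #     process(path)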