UCI binary fetch function rewritten using UCI python api

This commit is contained in:
Lorenzo Volpi 2024-07-02 16:08:55 +02:00
parent c408deacae
commit 8237c121de
1 changed files with 272 additions and 293 deletions

View File

@ -3,6 +3,7 @@ def warn(*args, **kwargs):
import warnings import warnings
warnings.warn = warn warnings.warn = warn
import os import os
from contextlib import contextmanager
import zipfile import zipfile
from os.path import join from os.path import join
import pandas as pd import pandas as pd
@ -269,323 +270,298 @@ def fetch_UCIBinaryLabelledCollection(dataset_name, data_home=None, verbose=Fals
:param dataset_name: a dataset name :param dataset_name: a dataset name
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default :param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quapy_data/ directory) ~/quapy_data/ directory)
:param test_split: proportion of documents to be included in the test set. The rest conforms the training set
:param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets :param verbose: set to True (default is False) to get information (from the UCI ML repository) about the datasets
:return: a :class:`quapy.data.base.LabelledCollection` instance :return: a :class:`quapy.data.base.LabelledCollection` instance
""" """
assert dataset_name in UCI_BINARY_DATASETS, (
assert dataset_name in UCI_BINARY_DATASETS, \ f"Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. "
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \ f"Valid ones are {UCI_BINARY_DATASETS}"
f'Valid ones are {UCI_BINARY_DATASETS}' )
if data_home is None: if data_home is None:
data_home = get_quapy_home() data_home = get_quapy_home()
dataset_fullname = { # mapping between dataset names and UCI api ids
'acute.a': 'Acute Inflammations (urinary bladder)', identifiers = {
'acute.b': 'Acute Inflammations (renal pelvis)', "acute.a": 184,
'balance.1': 'Balance Scale Weight & Distance Database (left)', "acute.b": 184,
'balance.2': 'Balance Scale Weight & Distance Database (balanced)', "balance.1": 12,
'balance.3': 'Balance Scale Weight & Distance Database (right)', "balance.2": 12,
'breast-cancer': 'Breast Cancer Wisconsin (Original)', "balance.3": 12,
'cmc.1': 'Contraceptive Method Choice (no use)', "breast-cancer": 15,
'cmc.2': 'Contraceptive Method Choice (long term)', "cmc.1": 30,
'cmc.3': 'Contraceptive Method Choice (short term)', "cmc.2": 30,
'ctg.1': 'Cardiotocography Data Set (normal)', "cmc.3": 30,
'ctg.2': 'Cardiotocography Data Set (suspect)', # "ctg.1": , # not python importable
'ctg.3': 'Cardiotocography Data Set (pathologic)', # "ctg.2": , # not python importable
'german': 'Statlog German Credit Data', # "ctg.3": , # not python importable
'haberman': "Haberman's Survival Data", # "german": , # not python importable
'ionosphere': 'Johns Hopkins University Ionosphere DB', "haberman": 43,
'iris.1': 'Iris Plants Database(x)', "ionosphere": 52,
'iris.2': 'Iris Plants Database(versicolour)', "iris.1": 53,
'iris.3': 'Iris Plants Database(virginica)', "iris.2": 53,
'mammographic': 'Mammographic Mass', "iris.3": 53,
'pageblocks.5': 'Page Blocks Classification (5)', "mammographic": 161,
'semeion': 'Semeion Handwritten Digit (8)', "pageblocks.5": 78,
'sonar': 'Sonar, Mines vs. Rocks', # "semeion": , # not python importable
'spambase': 'Spambase Data Set', "sonar": 151,
'spectf': 'SPECTF Heart Data', "spambase": 94,
'tictactoe': 'Tic-Tac-Toe Endgame Database', "spectf": 96,
'transfusion': 'Blood Transfusion Service Center Data Set', "tictactoe": 101,
'wdbc': 'Wisconsin Diagnostic Breast Cancer', "transfusion": 176,
'wine.1': 'Wine Recognition Data (1)', "wdbc": 17,
'wine.2': 'Wine Recognition Data (2)', "wine.1": 109,
'wine.3': 'Wine Recognition Data (3)', "wine.2": 109,
'wine-q-red': 'Wine Quality Red (6-10)', "wine.3": 109,
'wine-q-white': 'Wine Quality White (6-10)', "wine-q-red": 186,
'yeast': 'Yeast', "wine-q-white": 186,
"yeast": 110,
} }
# the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use # mapping between dataset names and dataset groups
# to download the raw dataset groups = {
identifier_map = { "acute.a": "acute",
'acute.a': 'acute', "acute.b": "acute",
'acute.b': 'acute', "balance.1": "balance",
'balance.1': 'balance-scale', "balance.2": "balance",
'balance.2': 'balance-scale', "balance.3": "balance",
'balance.3': 'balance-scale', "breast-cancer": "breast-cancer",
'breast-cancer': 'breast-cancer-wisconsin', "cmc.1": "cmc",
'cmc.1': 'cmc', "cmc.2": "cmc",
'cmc.2': 'cmc', "cmc.3": "cmc",
'cmc.3': 'cmc', "ctg.1": "ctg",
'ctg.1': '00193', "ctg.2": "ctg",
'ctg.2': '00193', "ctg.3": "ctg",
'ctg.3': '00193', "german": "german",
'german': 'statlog/german', "haberman": "haberman",
'haberman': 'haberman', "ionosphere": "ionosphere",
'ionosphere': 'ionosphere', "iris.1": "iris",
'iris.1': 'iris', "iris.2": "iris",
'iris.2': 'iris', "iris.3": "iris",
'iris.3': 'iris', "mammographic": "mammographic",
'mammographic': 'mammographic-masses', "pageblocks.5": "pageblocks",
'pageblocks.5': 'page-blocks', "semeion": "semeion",
'semeion': 'semeion', "sonar": "sonar",
'sonar': 'undocumented/connectionist-bench/sonar', "spambase": "spambase",
'spambase': 'spambase', "spectf": "spectf",
'spectf': 'spect', "tictactoe": "tictactoe",
'tictactoe': 'tic-tac-toe', "transfusion": "transfusion",
'transfusion': 'blood-transfusion', "wdbc": "wdbc",
'wdbc': 'breast-cancer-wisconsin', "wine-q-red": "wine-quality",
'wine-q-red': 'wine-quality', "wine-q-white": "wine-quality",
'wine-q-white': 'wine-quality', "wine.1": "wine",
'wine.1': 'wine', "wine.2": "wine",
'wine.2': 'wine', "wine.3": "wine",
'wine.3': 'wine', "yeast": "yeast",
'yeast': 'yeast',
} }
# the filename is the name of the file within the data_folder indexed by the identifier # mapping between dataset short names and full names
file_name = { full_names = {
'acute': 'diagnosis.data', "acute.a": "Acute Inflammations (urinary bladder)",
'00193': 'CTG.xls', "acute.b": "Acute Inflammations (renal pelvis)",
'statlog/german': 'german.data-numeric', "balance.1": "Balance Scale Weight & Distance Database (left)",
'mammographic-masses': 'mammographic_masses.data', "balance.2": "Balance Scale Weight & Distance Database (balanced)",
'page-blocks': 'page-blocks.data.Z', "balance.3": "Balance Scale Weight & Distance Database (right)",
'undocumented/connectionist-bench/sonar': 'sonar.all-data', "breast-cancer": "Breast Cancer Wisconsin (Original)",
'spect': ['SPECTF.train', 'SPECTF.test'], "cmc.1": "Contraceptive Method Choice (no use)",
'blood-transfusion': 'transfusion.data', "cmc.2": "Contraceptive Method Choice (long term)",
'wine-quality': ['winequality-red.csv', 'winequality-white.csv'], "cmc.3": "Contraceptive Method Choice (short term)",
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.data' if dataset_name=='breast-cancer' else 'wdbc.data' "ctg.1": "Cardiotocography Data Set (normal)",
"ctg.2": "Cardiotocography Data Set (suspect)",
"ctg.3": "Cardiotocography Data Set (pathologic)",
"german": "Statlog German Credit Data",
"haberman": "Haberman's Survival Data",
"ionosphere": "Johns Hopkins University Ionosphere DB",
"iris.1": "Iris Plants Database(x)",
"iris.2": "Iris Plants Database(versicolour)",
"iris.3": "Iris Plants Database(virginica)",
"mammographic": "Mammographic Mass",
"pageblocks.5": "Page Blocks Classification (5)",
"semeion": "Semeion Handwritten Digit (8)",
"sonar": "Sonar, Mines vs. Rocks",
"spambase": "Spambase Data Set",
"spectf": "SPECTF Heart Data",
"tictactoe": "Tic-Tac-Toe Endgame Database",
"transfusion": "Blood Transfusion Service Center Data Set",
"wdbc": "Wisconsin Diagnostic Breast Cancer",
"wine.1": "Wine Recognition Data (1)",
"wine.2": "Wine Recognition Data (2)",
"wine.3": "Wine Recognition Data (3)",
"wine-q-red": "Wine Quality Red (6-10)",
"wine-q-white": "Wine Quality White (6-10)",
"yeast": "Yeast",
} }
# the filename containing the dataset description (if any) # mapping between dataset names and values of positive class
desc_name = { pos_class = {
'acute': 'diagnosis.names', "acute.a": "yes",
'00193': None, "acute.b": "yes",
'statlog/german': 'german.doc', "balance.1": "L",
'mammographic-masses': 'mammographic_masses.names', "balance.2": "B",
'undocumented/connectionist-bench/sonar': 'sonar.names', "balance.3": "R",
'spect': 'SPECTF.names', "breast-cancer": 2,
'blood-transfusion': 'transfusion.names', "cmc.1": 1,
'wine-quality': 'winequality.names', "cmc.2": 2,
'breast-cancer-wisconsin': 'breast-cancer-wisconsin.names' if dataset_name == 'breast-cancer' else 'wdbc.names' "cmc.3": 3,
"ctg.1": 1, # 1==Normal
"ctg.2": 2, # 2==Suspect
"ctg.3": 3, # 3==Pathologic
"german": 1,
"haberman": 2,
"ionosphere": "b",
"iris.1": "Iris-setosa", # 1==Setosa
"iris.2": "Iris-versicolor", # 2==Versicolor
"iris.3": "Iris-virginica", # 3==Virginica
"mammographic": 1,
"pageblocks.5": 5, # 5==block "graphic"
"semeion": 1,
"sonar": "R",
"spambase": 1,
"spectf": 0,
"tictactoe": "negative",
"transfusion": 1,
"wdbc": "M",
"wine.1": 1,
"wine.2": 2,
"wine.3": 3,
"wine-q-red": 1,
"wine-q-white": 1,
"yeast": "NUC",
} }
identifier = identifier_map[dataset_name] identifier = identifiers.get(dataset_name, None)
filename = file_name.get(identifier, f'{identifier}.data') dataset_group = groups[dataset_name]
descfile = desc_name.get(identifier, f'{identifier}.names') fullname = full_names[dataset_name]
fullname = dataset_fullname[dataset_name]
URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
data_dir = join(data_home, 'uci_datasets', identifier)
if isinstance(filename, str): # filename could be a list of files, in which case it will be processed later
data_path = join(data_dir, filename)
download_file_if_not_exists(f'{URL}/{filename}', data_path)
if descfile:
try:
download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
if verbose:
print(open(f'{data_dir}/{descfile}', 'rt').read())
except Exception:
print('could not read the description file')
elif verbose:
print('no file description available')
if verbose: if verbose:
print(f'Loading {dataset_name} ({fullname})') print(f"Loading UCI Binary {dataset_name} ({fullname})")
if identifier == 'acute':
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
df[0] = df[0].apply(lambda x: float(x.replace(',', '.'))).astype(float, copy=False) file = join(data_home, "uci_datasets", dataset_group + ".pkl")
[_df_replace(df, col) for col in range(1, 6)]
X = df.loc[:, 0:5].values
if dataset_name == 'acute.a':
y = binarize(df[6], pos_class='yes')
elif dataset_name == 'acute.b':
y = binarize(df[7], pos_class='yes')
if identifier == 'balance-scale': @contextmanager
df = pd.read_csv(data_path, header=None, sep=',') def download_tmp_file(url_group: str, filename: str):
if dataset_name == 'balance.1': """
y = binarize(df[0], pos_class='L') Download a data file for a group of datasets temporarily.
elif dataset_name == 'balance.2': When used as a context, the file is removed once the context exits.
y = binarize(df[0], pos_class='B')
elif dataset_name == 'balance.3':
y = binarize(df[0], pos_class='R')
X = df.loc[:, 1:].astype(float).values
if identifier == 'breast-cancer-wisconsin' and dataset_name=='breast-cancer': :param url_group: identifier of the dataset group in the URL
df = pd.read_csv(data_path, header=None, sep=',') :param filename: name of the file to be downloaded
Xy = df.loc[:, 1:10] """
Xy[Xy=='?']=np.nan data_dir = join(data_home, "uci_datasets", "tmp")
Xy = Xy.dropna(axis=0) os.makedirs(data_dir, exist_ok=True)
X = Xy.loc[:, 1:9]
X = X.astype(float).values
y = binarize(Xy[10], pos_class=2)
if identifier == 'breast-cancer-wisconsin' and dataset_name=='wdbc':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.loc[:, 2:32].astype(float).values
y = df[1].values
y = binarize(y, pos_class='M')
if identifier == 'cmc':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.loc[:, 0:8].astype(float).values
y = df[9].astype(int).values
if dataset_name == 'cmc.1':
y = binarize(y, pos_class=1)
elif dataset_name == 'cmc.2':
y = binarize(y, pos_class=2)
elif dataset_name == 'cmc.3':
y = binarize(y, pos_class=3)
if identifier == '00193':
df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
df = df[list(range(1,24))] # select columns numbered (number 23 is the target label)
# replaces the header with the first row
new_header = df.iloc[0] # grab the first row for the header
df = df[1:] # take the data less the header row
df.columns = new_header # set the header row as the df header
X = df.iloc[:, 0:22].astype(float).values
y = df['NSP'].astype(int).values
if dataset_name == 'ctg.1':
y = binarize(y, pos_class=1) # 1==Normal
elif dataset_name == 'ctg.2':
y = binarize(y, pos_class=2) # 2==Suspect
elif dataset_name == 'ctg.3':
y = binarize(y, pos_class=3) # 3==Pathologic
if identifier == 'statlog/german':
df = pd.read_csv(data_path, header=None, delim_whitespace=True)
X = df.iloc[:, 0:24].astype(float).values
y = df[24].astype(int).values
y = binarize(y, pos_class=1)
if identifier == 'haberman':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:3].astype(float).values
y = df[3].astype(int).values
y = binarize(y, pos_class=2)
if identifier == 'ionosphere':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:34].astype(float).values
y = df[34].values
y = binarize(y, pos_class='b')
if identifier == 'iris':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:4].astype(float).values
y = df[4].values
if dataset_name == 'iris.1':
y = binarize(y, pos_class='Iris-setosa') # 1==Setosa
elif dataset_name == 'iris.2':
y = binarize(y, pos_class='Iris-versicolor') # 2==Versicolor
elif dataset_name == 'iris.3':
y = binarize(y, pos_class='Iris-virginica') # 3==Virginica
if identifier == 'mammographic-masses':
df = pd.read_csv(data_path, header=None, sep=',')
df[df == '?'] = np.nan
Xy = df.dropna(axis=0)
X = Xy.iloc[:, 0:5]
X = X.astype(float).values
y = binarize(Xy.iloc[:,5], pos_class=1)
if identifier == 'page-blocks':
data_path_ = data_path.replace('.Z', '')
if not os.path.exists(data_path_):
raise FileNotFoundError(f'Warning: file {data_path_} does not exist. If this is the first time you '
f'attempt to load this dataset, then you have to manually unzip the {data_path} '
f'and name the extracted file {data_path_} (unfortunately, neither zipfile, nor '
f'gzip can handle unix compressed files automatically -- there is a repo in GitHub '
f'https://github.com/umeat/unlzw where the problem seems to be solved anyway).')
df = pd.read_csv(data_path_, header=None, delim_whitespace=True)
X = df.iloc[:, 0:10].astype(float).values
y = df[10].values
y = binarize(y, pos_class=5) # 5==block "graphic"
if identifier == 'semeion':
df = pd.read_csv(data_path, header=None, delim_whitespace=True )
X = df.iloc[:, 0:256].astype(float).values
y = df[263].values # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
y = binarize(y, pos_class=1)
if identifier == 'undocumented/connectionist-bench/sonar':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:60].astype(float).values
y = df[60].values
y = binarize(y, pos_class='R')
if identifier == 'spambase':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:57].astype(float).values
y = df[57].values
y = binarize(y, pos_class=1)
if identifier == 'spect':
dfs = []
for file in filename:
data_path = join(data_dir, file)
download_file_if_not_exists(f'{URL}/{file}', data_path)
dfs.append(pd.read_csv(data_path, header=None, sep=','))
df = pd.concat(dfs)
X = df.iloc[:, 1:45].astype(float).values
y = df[0].values
y = binarize(y, pos_class=0)
if identifier == 'tic-tac-toe':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
y = df[9].values
y = binarize(y, pos_class='negative')
if identifier == 'blood-transfusion':
df = pd.read_csv(data_path, sep=',')
X = df.iloc[:, 0:4].astype(float).values
y = df.iloc[:, 4].values
y = binarize(y, pos_class=1)
if identifier == 'wine':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 1:14].astype(float).values
y = df[0].values
if dataset_name == 'wine.1':
y = binarize(y, pos_class=1)
elif dataset_name == 'wine.2':
y = binarize(y, pos_class=2)
elif dataset_name == 'wine.3':
y = binarize(y, pos_class=3)
if identifier == 'wine-quality':
filename = filename[0] if dataset_name=='wine-q-red' else filename[1]
data_path = join(data_dir, filename) data_path = join(data_dir, filename)
download_file_if_not_exists(f'{URL}/{filename}', data_path) url = f"http://archive.ics.uci.edu/ml/machine-learning-databases/{url_group}/{filename}"
df = pd.read_csv(data_path, sep=';') download_file_if_not_exists(url, data_path)
X = df.iloc[:, 0:11].astype(float).values try:
y = df.iloc[:, 11].values > 5 yield data_path
finally:
os.remove(data_path)
if identifier == 'yeast': def download(id: int | None, group: str) -> dict:
df = pd.read_csv(data_path, header=None, delim_whitespace=True) """
X = df.iloc[:, 1:9].astype(float).values Download the data to be pickled for a dataset group. Use the `fetch_ucirepo` api when possible.
y = df.iloc[:, 9].values
y = binarize(y, pos_class='NUC') :param id: numeric identifier for the group; can be None
:param group: group name
:return: a dictionary with X and y as keys and, optionally, extra data.
"""
# use the fetch_ucirepo api, when possible, to download data
# fall back to direct download when needed
if group == "german":
with download_tmp_file("statlog/german", "german.data-numeric") as tmp:
df = pd.read_csv(tmp, header=None, delim_whitespace=True)
X, y = df.iloc[:, 0:24].astype(float).values, df[24].astype(int).values
elif group == "ctg":
with download_tmp_file("00193", "CTG.xls") as tmp:
df = pd.read_excel(tmp, sheet_name="Data", skipfooter=3)
df = df[list(range(1, 24))] # select columns numbered (number 23 is the target label)
# replaces the header with the first row
new_header = df.iloc[0] # grab the first row for the header
df = df[1:] # take the data less the header row
df.columns = new_header # set the header row as the df header
X = df.iloc[:, 0:21].astype(float).values # column 21 is skipped, it is a class column
y = df["NSP"].astype(int).values
elif group == "semeion":
with download_tmp_file("semeion", "semeion.data") as tmp:
df = pd.read_csv(tmp, header=None, delim_whitespace=True)
X = df.iloc[:, 0:256].astype(float).values
y = df[263].values # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
else:
df = fetch_ucirepo(id=id)
X, y = df.data.features.to_numpy(), df.data.targets.to_numpy().squeeze()
# transform data when needed before returning (returned data will be pickled)
if group == "acute":
_array_replace(X)
data = {"X": X, "y": y}
elif group == "balance":
# features' order is reversed to match data retrieved via direct download
X = X[:, np.arange(X.shape[1])[::-1]]
data = {"X": X, "y": y}
elif group == "breast-cancer":
# remove rows with nan values
Xy = np.hstack([X, y[:, np.newaxis]])
nan_rows = np.isnan(Xy).sum(axis=-1) > 0
Xy = Xy[~nan_rows]
data = {"X": Xy[:, :-1], "y": Xy[:, -1]}
elif group == "mammographic":
# remove rows with nan values
Xy = np.hstack([X, y[:, np.newaxis]])
nan_rows = np.isnan(Xy).sum(axis=-1) > 0
Xy = Xy[~nan_rows]
data = {"X": Xy[:, :-1], "y": Xy[:, -1]}
elif group == "tictactoe":
_array_replace(X, repl={"o": 0, "b": 1, "x": 2})
data = {"X": X, "y": y}
elif group == "wine-quality":
# add color data to split the final datasets
color = df.data.original["color"].to_numpy()
data = {"X": X, "y": y, "color": color}
else:
data = {"X": X, "y": y}
return data
def binarize_data(name, data: dict) -> LabelledCollection:
    """
    Build the binary dataset called `name` out of the data stored for its dataset group.

    :param name: name of the binary dataset to extract
    :param data: dictionary holding the "X" and "y" entries of the group and, for the
        wine-quality datasets, an additional "color" entry
    :return: a :class:`quapy.data.base.LabelledCollection` with the selected instances and
        their binarized labels
    """
    X, y = data["X"], data["y"]

    if name in ("acute.a", "acute.b"):
        # y has two columns here: column 0 is used for acute.a and column 1 for acute.b
        y = y[:, 0 if name == "acute.a" else 1]
    elif name in ("wine-q-red", "wine-q-white"):
        # keep only the rows of the requested color, then flag qualities greater than 5
        wanted_color = "red" if name == "wine-q-red" else "white"
        selected = data["color"] == wanted_color
        X, y = X[selected, :], y[selected]
        y = (y > 5).astype(int)

    # the positive class of each dataset is looked up by its name
    y = binarize(y, pos_class=pos_class[name])

    return LabelledCollection(X, y)
data = pickled_resource(file, download, identifier, dataset_group)
data = binarize_data(dataset_name, data)
data = LabelledCollection(X, y)
if verbose: if verbose:
data.stats() data.stats()
return data return data
@ -776,6 +752,9 @@ def fetch_UCIMulticlassLabelledCollection(dataset_name, data_home=None, min_clas
def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float): def _df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False) df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)
def _array_replace(arr, repl=None):
    """
    Replace values of a numpy array in place according to a mapping.

    All the matches are located on the original contents of `arr` before any value is written,
    so a replacement value that coincides with another key of `repl` is not replaced again.

    :param arr: numpy array to be modified in place
    :param repl: dictionary mapping the values to be replaced to their replacements; if None
        (default), "yes" is mapped to 1 and "no" is mapped to 0
    :return: None (`arr` is modified in place)
    """
    if repl is None:
        repl = {"yes": 1, "no": 0}

    # compute every mask before writing anything, so that replacements cannot cascade
    pending = [(arr == old, new) for old, new in repl.items()]
    for mask, new in pending:
        arr[mask] = new
def fetch_lequa2022(task, data_home=None): def fetch_lequa2022(task, data_home=None):
""" """