1
0
Fork 0
QuaPy/quapy/data/datasets.py

425 lines
18 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import zipfile
from os.path import join
from urllib.error import HTTPError
import pandas as pd
from data.base import Dataset, LabelledCollection
from quapy.data.preprocessing import text2tfidf, reduce_columns
from quapy.data.reader import *
from quapy.util import download_file_if_not_exists, download_file, get_quapy_home, pickled_resource
REVIEWS_SENTIMENT_DATASETS = ['hp', 'kindle', 'imdb']
TWITTER_SENTIMENT_DATASETS_TEST = ['gasp', 'hcr', 'omd', 'sanders',
'semeval13', 'semeval14', 'semeval15', 'semeval16',
'sst', 'wa', 'wb']
TWITTER_SENTIMENT_DATASETS_TRAIN = ['gasp', 'hcr', 'omd', 'sanders',
'semeval', 'semeval16',
'sst', 'wa', 'wb']
def fetch_reviews(dataset_name, tfidf=False, min_df=None, data_home=None, pickle=False):
"""
Load a Reviews dataset as a Dataset instance, as used in:
Esuli, A., Moreo, A., and Sebastiani, F. "A recurrent neural network for sentiment quantification."
Proceedings of the 27th ACM International Conference on Information and Knowledge Management. 2018.
:param dataset_name: the name of the dataset: valid ones are 'hp', 'kindle', 'imdb'
:param tfidf: set to True to transform the raw documents into tfidf weighted matrices
:param min_df: minimun number of documents that should contain a term in order for the term to be
kept (ignored if tfidf==False)
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations
:return: a Dataset instance
"""
assert dataset_name in REVIEWS_SENTIMENT_DATASETS, \
f'Name {dataset_name} does not match any known dataset for sentiment reviews. ' \
f'Valid ones are {REVIEWS_SENTIMENT_DATASETS}'
if data_home is None:
data_home = get_quapy_home()
URL_TRAIN = f'https://zenodo.org/record/4117827/files/{dataset_name}_train.txt'
URL_TEST = f'https://zenodo.org/record/4117827/files/{dataset_name}_test.txt'
os.makedirs(join(data_home, 'reviews'), exist_ok=True)
train_path = join(data_home, 'reviews', dataset_name, 'train.txt')
test_path = join(data_home, 'reviews', dataset_name, 'test.txt')
download_file_if_not_exists(URL_TRAIN, train_path)
download_file_if_not_exists(URL_TEST, test_path)
pickle_path = None
if pickle:
pickle_path = join(data_home, 'reviews', 'pickle', f'{dataset_name}.pkl')
data = pickled_resource(pickle_path, Dataset.load, train_path, test_path, from_text)
if tfidf:
text2tfidf(data, inplace=True)
if min_df is not None:
reduce_columns(data, min_df=min_df, inplace=True)
data.name = dataset_name
return data
def fetch_twitter(dataset_name, for_model_selection=False, min_df=None, data_home=None, pickle=False):
"""
Load a Twitter dataset as a Dataset instance, as used in:
Gao, W., Sebastiani, F.: From classification to quantification in tweet sentiment analysis.
Social Network Analysis and Mining6(19), 122 (2016)
The datasets 'semeval13', 'semeval14', 'semeval15' share the same training set.
:param dataset_name: the name of the dataset: valid ones are 'gasp', 'hcr', 'omd', 'sanders', 'semeval13',
'semeval14', 'semeval15', 'semeval16', 'sst', 'wa', 'wb'
:param for_model_selection: if True, then returns the train split as the training set and the devel split
as the test set; if False, then returns the train+devel split as the training set and the test set as the
test set
:param min_df: minimun number of documents that should contain a term in order for the term to be kept
:param data_home: specify the quapy home directory where collections will be dumped (leave empty to use the default
~/quay_data/ directory)
:param pickle: set to True to pickle the Dataset object the first time it is generated, in order to allow for
faster subsequent invokations
:return: a Dataset instance
"""
assert dataset_name in TWITTER_SENTIMENT_DATASETS_TRAIN + TWITTER_SENTIMENT_DATASETS_TEST, \
f'Name {dataset_name} does not match any known dataset for sentiment twitter. ' \
f'Valid ones are {TWITTER_SENTIMENT_DATASETS_TRAIN} for model selection and ' \
f'{TWITTER_SENTIMENT_DATASETS_TEST} for test (datasets "semeval14", "semeval15", "semeval16" share ' \
f'a common training set "semeval")'
if data_home is None:
data_home = get_quapy_home()
URL = 'https://zenodo.org/record/4255764/files/tweet_sentiment_quantification_snam.zip'
unzipped_path = join(data_home, 'tweet_sentiment_quantification_snam')
if not os.path.exists(unzipped_path):
downloaded_path = join(data_home, 'tweet_sentiment_quantification_snam.zip')
download_file(URL, downloaded_path)
with zipfile.ZipFile(downloaded_path) as file:
file.extractall(data_home)
os.remove(downloaded_path)
if dataset_name in {'semeval13', 'semeval14', 'semeval15'}:
trainset_name = 'semeval'
testset_name = 'semeval' if for_model_selection else dataset_name
print(f"the training and development sets for datasets 'semeval13', 'semeval14', 'semeval15' are common "
f"(called 'semeval'); returning trainin-set='{trainset_name}' and test-set={testset_name}")
else:
if dataset_name == 'semeval' and for_model_selection==False:
raise ValueError('dataset "semeval" can only be used for model selection. '
'Use "semeval13", "semeval14", or "semeval15" for model evaluation.')
trainset_name = testset_name = dataset_name
if for_model_selection:
train = join(unzipped_path, 'train', f'{trainset_name}.train.feature.txt')
test = join(unzipped_path, 'test', f'{testset_name}.dev.feature.txt')
else:
train = join(unzipped_path, 'train', f'{trainset_name}.train+dev.feature.txt')
if dataset_name == 'semeval16': # there is a different test name in the case of semeval16 only
test = join(unzipped_path, 'test', f'{testset_name}.dev-test.feature.txt')
else:
test = join(unzipped_path, 'test', f'{testset_name}.test.feature.txt')
pickle_path = None
if pickle:
mode = "train-dev" if for_model_selection else "train+dev-test"
pickle_path = join(unzipped_path, 'pickle', f'{testset_name}.{mode}.pkl')
data = pickled_resource(pickle_path, Dataset.load, train, test, from_sparse)
if min_df is not None:
reduce_columns(data, min_df=min_df, inplace=True)
data.name = dataset_name
return data
UCI_DATASETS = ['acute.a', 'acute.b',
'balance.1', 'balance.2', 'balance.3',
'breast-cancer',
'cmc.1', 'cmc.2', 'cmc.3',
'ctg.1', 'ctg.2', 'ctg.3',
#'diabetes', # <-- I haven't found this one...
'german',
'haberman',
'ionosphere',
'iris.1', 'iris.2', 'iris.3',
'mammographic',
'pageblocks.5',
#'phoneme', # <-- I haven't found this one...
'semeion',
'sonar',
'spambase',
'spectf',
'tictactoe',
'transfusion'] # ongoing...
def fetch_UCIDataset(dataset_name, data_home=None, verbose=False, test_split=0.3):
assert dataset_name in UCI_DATASETS, \
f'Name {dataset_name} does not match any known dataset from the UCI Machine Learning datasets repository. ' \
f'Valid ones are {UCI_DATASETS}'
if data_home is None:
data_home = get_quapy_home()
dataset_fullname = {
'acute.a': 'Acute Inflammations (urinary bladder)',
'acute.b': 'Acute Inflammations (renal pelvis)',
'balance.1': 'Balance Scale Weight & Distance Database (left)',
'balance.2': 'Balance Scale Weight & Distance Database (balanced)',
'balance.3': 'Balance Scale Weight & Distance Database (right)',
'breast-cancer': 'Breast Cancer Wisconsin (Original)',
'cmc.1': 'Contraceptive Method Choice (no use)',
'cmc.2': 'Contraceptive Method Choice (long term)',
'cmc.3': 'Contraceptive Method Choice (short term)',
'ctg.1': 'Cardiotocography Data Set (normal)',
'ctg.2': 'Cardiotocography Data Set (suspect)',
'ctg.3': 'Cardiotocography Data Set (pathologic)',
'german': 'Statlog German Credit Data',
'haberman': "Haberman's Survival Data",
'ionosphere': 'Johns Hopkins University Ionosphere DB',
'iris.1': 'Iris Plants Database(x)',
'iris.2': 'Iris Plants Database(versicolour)',
'iris.3': 'Iris Plants Database(virginica)',
'mammographic': 'Mammographic Mass',
'pageblocks.5': 'Page Blocks Classification (5)',
'semeion': 'Semeion Handwritten Digit (8)',
'sonar': 'Sonar, Mines vs. Rocks',
'spambase': 'Spambase Data Set',
'spectf': 'SPECTF Heart Data',
'tictactoe': 'Tic-Tac-Toe Endgame Database',
'transfusion': 'Blood Transfusion Service Center Data Set '
}
# the identifier is an alias for the dataset group, it's part of the url data-folder, and is the name we use
# to download the raw dataset
identifier_map = {
'acute.a': 'acute',
'acute.b': 'acute',
'balance.1': 'balance-scale',
'balance.2': 'balance-scale',
'balance.3': 'balance-scale',
'breast-cancer': 'breast-cancer-wisconsin',
'cmc.1': 'cmc',
'cmc.2': 'cmc',
'cmc.3': 'cmc',
'ctg.1': '00193',
'ctg.2': '00193',
'ctg.3': '00193',
'german': 'statlog/german',
'haberman': 'haberman',
'ionosphere': 'ionosphere',
'iris.1': 'iris',
'iris.2': 'iris',
'iris.3': 'iris',
'mammographic': 'mammographic-masses',
'pageblocks.5': 'page-blocks',
'semeion': 'semeion',
'sonar': 'undocumented/connectionist-bench/sonar',
'spambase': 'spambase',
'spectf': 'spect',
'tictactoe': 'tic-tac-toe',
'transfusion': 'blood-transfusion'
}
# the filename is the name of the file within the data_folder indexed by the identifier
file_name = {
'acute': 'diagnosis.data',
'00193': 'CTG.xls',
'statlog/german': 'german.data-numeric',
'mammographic-masses': 'mammographic_masses.data',
'page-blocks': 'page-blocks.data.Z',
'undocumented/connectionist-bench/sonar': 'sonar.all-data',
'spect': ['SPECTF.train', 'SPECTF.test'],
'blood-transfusion': 'transfusion.data'
}
# the filename containing the dataset description (if any)
desc_name = {
'acute': 'diagnosis.names',
'00193': None,
'statlog/german': 'german.doc',
'mammographic-masses': 'mammographic_masses.names',
'undocumented/connectionist-bench/sonar': 'sonar.names',
'spect': 'SPECTF.names',
'blood-transfusion': 'transfusion.names'
}
identifier = identifier_map[dataset_name]
filename = file_name.get(identifier, f'{identifier}.data')
descfile = desc_name.get(identifier, f'{identifier}.names')
fullname = dataset_fullname[dataset_name]
URL = f'http://archive.ics.uci.edu/ml/machine-learning-databases/{identifier}'
data_dir = join(data_home, 'uci_datasets', identifier)
if isinstance(filename, str): # filename could be a list of files, in which case it will be processed later
data_path = join(data_dir, filename)
download_file_if_not_exists(f'{URL}/{filename}', data_path)
if descfile:
try:
download_file_if_not_exists(f'{URL}/{descfile}', f'{data_dir}/{descfile}')
if verbose:
print(open(f'{data_dir}/{descfile}', 'rt').read())
except Exception:
print('could not read the description file')
elif verbose:
print('no file description available')
print(f'Loading {dataset_name} ({fullname})')
if identifier == 'acute':
df = pd.read_csv(data_path, header=None, encoding='utf-16', sep='\t')
if dataset_name == 'acute.a':
y = binarize(df[6], pos_class='yes')
elif dataset_name == 'acute.b':
y = binarize(df[7], pos_class='yes')
mintemp, maxtemp = 35, 42
df[0] = df[0].apply(lambda x:(float(x.replace(',','.'))-mintemp)/(maxtemp-mintemp)).astype(float, copy=False)
[df_replace(df, col) for col in range(1, 6)]
X = df.loc[:, 0:5].values
if identifier == 'balance-scale':
df = pd.read_csv(data_path, header=None, sep=',')
if dataset_name == 'balance.1':
y = binarize(df[0], pos_class='L')
elif dataset_name == 'balance.2':
y = binarize(df[0], pos_class='B')
elif dataset_name == 'balance.3':
y = binarize(df[0], pos_class='R')
X = df.loc[:, 1:].astype(float).values
if identifier == 'breast-cancer-wisconsin':
df = pd.read_csv(data_path, header=None, sep=',')
Xy = df.loc[:, 1:10]
Xy[Xy=='?']=np.nan
Xy = Xy.dropna(axis=0)
X = Xy.loc[:, 1:9]
X = X.astype(float).values
y = binarize(Xy[10], pos_class=4)
if identifier == 'cmc':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.loc[:, 0:8].astype(float).values
y = df[9].astype(int).values
if dataset_name == 'cmc.1':
y = binarize(y, pos_class=1)
elif dataset_name == 'cmc.2':
y = binarize(y, pos_class=2)
elif dataset_name == 'cmc.3':
y = binarize(y, pos_class=3)
if identifier == '00193':
df = pd.read_excel(data_path, sheet_name='Data', skipfooter=3)
df = df[list(range(1,24))] # select columns numbered (number 23 is the target label)
# replaces the header with the first row
new_header = df.iloc[0] # grab the first row for the header
df = df[1:] # take the data less the header row
df.columns = new_header # set the header row as the df header
X = df.iloc[:, 0:22].astype(float).values
y = df['NSP'].astype(int).values
if dataset_name == 'ctg.1':
y = binarize(y, pos_class=1) # 1==Normal
elif dataset_name == 'ctg.2':
y = binarize(y, pos_class=2) # 2==Suspect
elif dataset_name == 'ctg.3':
y = binarize(y, pos_class=3) # 3==Pathologic
if identifier == 'statlog/german':
df = pd.read_csv(data_path, header=None, delim_whitespace=True)
X = df.iloc[:, 0:24].astype(float).values
y = df[24].astype(int).values
y = binarize(y, pos_class=1)
if identifier == 'haberman':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:3].astype(float).values
y = df[3].astype(int).values
y = binarize(y, pos_class=2)
if identifier == 'ionosphere':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:34].astype(float).values
y = df[34].values
y = binarize(y, pos_class='b')
if identifier == 'iris':
df = pd.read_csv(data_path, header=None)
X = df.iloc[:, 0:4].astype(float).values
y = df[4].values
if dataset_name == 'iris.1':
y = binarize(y, pos_class='Iris-setosa') # 1==Setosa
elif dataset_name == 'iris.2':
y = binarize(y, pos_class='Iris-versicolor') # 2==Versicolor
elif dataset_name == 'iris.3':
y = binarize(y, pos_class='Iris-virginica') # 3==Virginica
if identifier == 'mammographic-masses':
df = pd.read_csv(data_path, header=None, sep=',')
Xy[df == '?'] = np.nan
Xy = Xy.dropna(axis=0)
X = Xy.iloc[:, 0:5]
X = X.astype(float).values
y = binarize(Xy.iloc[:,5], pos_class=1)
if identifier == 'page-blocks':
data_path_ = data_path.replace('.Z', '')
if not os.path.exists(data_path_):
raise FileNotFoundError(f'Warning: file {data_path_} does not exist. If this is the first time you '
f'attempt to load this dataset, then you have to manually unzip the {data_path} '
f'and name the extracted file {data_path_} (unfortunately, neither zipfile, nor '
f'gzip can handle unix compressed files automatically -- there is a repo in GitHub '
f'https://github.com/umeat/unlzw where the problem seems to be solved anyway).')
df = pd.read_csv(data_path_, header=None, delim_whitespace=True)
X = df.iloc[:, 0:10].astype(float).values
y = df[10].values
y = binarize(y, pos_class=5) # 5==block "graphic"
if identifier == 'semeion':
df = pd.read_csv(data_path, header=None, delim_whitespace=True )
X = df.iloc[:, 0:256].astype(float).values
y = df[263].values # 263 stands for digit 8 (labels are one-hot vectors from col 256-266)
y = binarize(y, pos_class=1)
if identifier == 'undocumented/connectionist-bench/sonar':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:60].astype(float).values
y = df[60].values
y = binarize(y, pos_class='R')
if identifier == 'spambase':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:57].astype(float).values
y = df[57].values
y = binarize(y, pos_class=1)
if identifier == 'spect':
dfs = []
for file in filename:
data_path = join(data_dir, file)
download_file_if_not_exists(f'{URL}/{filename}', data_path)
dfs.append(pd.read_csv(data_path, header=None, sep=','))
df = pd.concat(dfs)
X = df.iloc[:, 1:45].astype(float).values
y = df[0].values
y = binarize(y, pos_class=0)
if identifier == 'tic-tac-toe':
df = pd.read_csv(data_path, header=None, sep=',')
X = df.iloc[:, 0:9].replace('o',0).replace('b',1).replace('x',2).values
y = df[9].values
y = binarize(y, pos_class='negative')
if identifier == 'blood-transfusion':
df = pd.read_csv(data_path, sep=',')
X = df.iloc[:, 0:4].astype(float).values
y = df.iloc[:, 4].values
y = binarize(y, pos_class=1)
data = LabelledCollection(X, y)
data.stats()
return Dataset(*data.split_stratified(1-test_split, random_state=0))
def df_replace(df, col, repl={'yes': 1, 'no':0}, astype=float):
df[col] = df[col].apply(lambda x:repl[x]).astype(astype, copy=False)