QuaPy/Ordinal/preprocess_dataset_raw2tfid...

import quapy as qp
from Ordinal.utils import load_simple_sample_raw, load_samples_raw
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from tqdm import tqdm
import shutil


"""
This script preprocesses the raw Amazon-OQ-BK dataset, converting its raw documents into tfidf vectors.
"""


# Root data folder, name of the raw dataset subfolder to read from, and name
# of the subfolder where the tfidf version of that dataset is written.
datapath = './data'
domain = 'Books'
outname = f'{domain}-tfidf'


def save_preprocessing_info(transformer):
    """Record the textual description of the preprocessing step.

    Writes `str(transformer)` followed by a newline to the file
    'prep-info.txt' inside the output folder (`datapath`/`outname`),
    overwriting any previous content.

    :param transformer: the object whose string representation is stored
    """
    info_path = join(datapath, outname, 'prep-info.txt')
    with open(info_path, 'wt') as info_file:
        info_file.write(f'{str(transformer)}\n')


# Create the output folder tree (one subfolder per protocol, each holding a
# dev and a test samples folder) and copy the prevalence files of every
# protocol verbatim from the raw dataset folder.
os.makedirs(join(datapath, outname), exist_ok=True)
for protocol in ('app', 'real'):
    os.makedirs(join(datapath, outname, protocol), exist_ok=True)
    for split in ('dev', 'test'):
        os.makedirs(join(datapath, outname, protocol, f'{split}_samples'), exist_ok=True)
    for split in ('dev', 'test'):
        prevalence_file = f'{split}_prevalences.txt'
        shutil.copyfile(
            join(datapath, domain, protocol, prevalence_file),
            join(datapath, outname, protocol, prevalence_file),
        )


# Vectorizer shared by the whole script: it is fitted on the training
# documents only and later reused to transform every sample.
tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)

# Load the raw training set, replace its documents by their tfidf vectors,
# and store both the vectorizer description and the vectorized collection.
train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
# Use a context manager so the pickle file is flushed and closed
# deterministically instead of relying on garbage collection.
with open(join(datapath, outname, 'training_data.pkl'), 'wb') as fout:
    pickle.dump(train, fout, pickle.HIGHEST_PROTOCOL)


def transform_folder_samples(protocol, splitname):
    """Vectorize all raw samples of one protocol/split and pickle them.

    Samples are loaded from `datapath`/`domain`/`protocol`/`splitname` with
    `load_samples_raw`, their instances are replaced by the output of the
    module-level `tfidf` vectorizer, and each sample is written to its own
    file `<i>.pkl` (where `i` is its position in the iteration order) in
    `datapath`/`outname`/`protocol`/`splitname`.

    :param protocol: name of the protocol subfolder (the script uses 'app' and 'real')
    :param splitname: name of the samples subfolder (the script uses 'dev_samples'
        and 'test_samples')
    """
    # Both folders are loop-invariant, so they are computed only once.
    source_dir = join(datapath, domain, protocol, splitname)
    target_dir = join(datapath, outname, protocol, splitname)
    for i, sample in tqdm(enumerate(load_samples_raw(source_dir, classes=train.classes_))):
        sample.instances = tfidf.transform(sample.instances)
        # Close each file as soon as it is written; otherwise one open
        # handle per sample would be left for the garbage collector.
        with open(join(target_dir, f'{i}.pkl'), 'wb') as fout:
            pickle.dump(sample, fout, pickle.HIGHEST_PROTOCOL)


# Vectorize the samples of every protocol, processing the dev split before
# the test split within each protocol.
for protocol in ('app', 'real'):
    for splitname in ('dev_samples', 'test_samples'):
        transform_folder_samples(protocol, splitname)
generating features from RoBERTa, testing them on Amazons data 2022-03-16 19:12:45 +01:00			`import quapy as qp`
regenerating tfidf vectors 2024-03-15 14:30:51 +01:00			`from Ordinal.utils import load_simple_sample_raw, load_samples_raw`
generating features from RoBERTa, testing them on Amazons data 2022-03-16 19:12:45 +01:00			`from quapy.data import LabelledCollection`
			`from sklearn.feature_extraction.text import TfidfVectorizer`
			`from os.path import join`
			`import os`
			`import pickle`
			`from tqdm import tqdm`
			`import shutil`


scripts using QuaPy 2022-03-31 18:46:56 +02:00
			`"""`
			`This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into tfidf vectors.`
			`"""`


generating features from RoBERTa, testing them on Amazons data 2022-03-16 19:12:45 +01:00			`datapath = './data'`
			`domain = 'Books'`
			`outname = domain + '-tfidf'`

regenerating tfidf vectors 2024-03-15 14:30:51 +01:00
generating features from RoBERTa, testing them on Amazons data 2022-03-16 19:12:45 +01:00			`def save_preprocessing_info(transformer):`
			`with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo:`
			`foo.write(f'{str(transformer)}\n')`


			`os.makedirs(join(datapath, outname), exist_ok=True)`
			`os.makedirs(join(datapath, outname, 'app'), exist_ok=True)`
			`os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)`
			`os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)`
			`shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))`
			`shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))`
regenerating tfidf vectors 2024-03-15 14:30:51 +01:00			`os.makedirs(join(datapath, outname, 'real'), exist_ok=True)`
			`os.makedirs(join(datapath, outname, 'real', 'dev_samples'), exist_ok=True)`
			`os.makedirs(join(datapath, outname, 'real', 'test_samples'), exist_ok=True)`
			`shutil.copyfile(join(datapath, domain, 'real', 'dev_prevalences.txt'), join(datapath, outname, 'real', 'dev_prevalences.txt'))`
			`shutil.copyfile(join(datapath, domain, 'real', 'test_prevalences.txt'), join(datapath, outname, 'real', 'test_prevalences.txt'))`
generating features from RoBERTa, testing them on Amazons data 2022-03-16 19:12:45 +01:00

			`tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)`

			`train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)`
			`train.instances = tfidf.fit_transform(train.instances)`
			`save_preprocessing_info(tfidf)`
			`pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)`


regenerating tfidf vectors 2024-03-15 14:30:51 +01:00
generating features from RoBERTa, testing them on Amazons data 2022-03-16 19:12:45 +01:00			`def transform_folder_samples(protocol, splitname):`
regenerating tfidf vectors 2024-03-15 14:30:51 +01:00			`for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):`
generating features from RoBERTa, testing them on Amazons data 2022-03-16 19:12:45 +01:00			`sample.instances = tfidf.transform(sample.instances)`
			`pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)`


			`transform_folder_samples('app', 'dev_samples')`
			`transform_folder_samples('app', 'test_samples')`
regenerating tfidf vectors 2024-03-15 14:30:51 +01:00			`transform_folder_samples('real', 'dev_samples')`
			`transform_folder_samples('real', 'test_samples')`
generating features from RoBERTa, testing them on Amazons data 2022-03-16 19:12:45 +01:00