# Source file: QuaPy/Ordinal/partition_dataset_by_smoothness.py
# (scraped page chrome removed: "Fork 0", "42 lines", "1.5 KiB", "Python")
import numpy as np
from Ordinal.evaluation import smoothness
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
from os.path import join
from tqdm import tqdm
"""
This scripts generates a partition of a dataset in terms of "smoothness".
The partition is only carried out by generating index vectors.
"""
def partition_by_smoothness(split):
    """
    Partition the samples of a given split into 5 equal-sized blocks ordered by
    the smoothness of their prevalence vectors, and persist the partition as
    index vectors (`smooth{i}.{split}.id.npy`) plus the trivial all-indices
    vector (`all.{split}.id.npy`) under `<datapath>/<domain>/app/`.

    :param split: either 'dev' or 'test' (selects the sample folder and the
        expected number of samples: 1000 for 'dev', 5000 for 'test')
    :raises ValueError: if `split` is not one of the two known split names
    """
    if split not in ('dev', 'test'):
        # raise instead of assert: asserts are stripped under `python -O`
        raise ValueError('invalid split name')
    total = 1000 if split == 'dev' else 5000

    # smoothness score of each sample's class-prevalence vector, in folder order
    folderpath = join(datapath, domain, 'app', f'{split}_samples')
    smooths = np.asarray([
        smoothness(sample.prevalence())
        for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total)
    ])

    # sample indices sorted from lowest to highest smoothness, cut into 5 blocks
    order = np.argsort(smooths)
    nD = len(order)
    low2high_smooth = np.array_split(order, 5)
    for i, smooth_idx in enumerate(low2high_smooth):
        block = smooths[smooth_idx]
        print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}')
        np.save(join(datapath, domain, 'app', f'smooth{i}.{split}.id.npy'), smooth_idx)

    # trivial partition: every sample index (renamed from `all_drift`; this
    # script partitions by smoothness, not drift)
    all_indices = np.arange(nD)
    np.save(join(datapath, domain, 'app', f'all.{split}.id.npy'), all_indices)
# --- configuration (module-level: read as globals by partition_by_smoothness) ---
#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
datapath = './data'
#training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))

if __name__ == '__main__':
    # guard so importing this module for its function does not trigger the
    # (slow, disk-writing) partitioning runs
    partition_by_smoothness('dev')
    partition_by_smoothness('test')