forked from moreo/QuaPy
Compare commits (16 commits)
@ -0,0 +1,52 @@
# plots the average class prevalence (with standard-deviation bars) of the Amazon Books
# product samples and of the FACT samples; panel titles report the mean jaggedness

import gzip
import os
import sys
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np

amazon = np.genfromtxt('prevalence_votes1_reviews100.csv', delimiter='\t')
telescope = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:]

nclasses_amazon = amazon.shape[1]
nclasses_telescope = telescope.shape[1]

jags_amazon = np.asarray([jaggedness(p) for p in amazon])
jags_telescope = np.asarray([jaggedness(p) for p in telescope])

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.7)

# figure, axis = plt.subplots(1, 2, figsize=(8, 7))
ymax = 0.75

figure(figsize=(8, 4), dpi=300)

ax = plt.subplot(1, 2, 1)
classes = np.arange(1, nclasses_amazon+1)
plt.bar(classes, np.mean(amazon, axis=0), yerr=np.std(amazon, axis=0), width=1)
ax.set_ylim(0, ymax)
ax.set_xlabel("stars")
ax.set_xticks(classes)
ax.set_title(f'Amazon Books ({jags_amazon.mean():.4f})')

ax = plt.subplot(1, 2, 2)
# ax = plt.subplot(1, 1, 1)
classes = np.arange(1, nclasses_telescope+1)
plt.bar(classes, np.mean(telescope, axis=0), yerr=np.std(telescope, axis=0), width=1)
ax.set_ylim(0, ymax)
ax.set_xlabel("energy bin")
ax.set_xticks(classes)
ax.set_title(f'FACT Samples ({jags_telescope.mean():.4f})')

plt.subplots_adjust(wspace=0.1, hspace=0)
plt.savefig('prevalence_averages.pdf', bbox_inches='tight')
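Note: jaggedness is imported from Ordinal.utils, which is not part of this diff. Judging from the sharpness/not_smoothness functions defined in the other scripts of these commits, it presumably scores a prevalence vector by its squared discrete second differences; this is an assumption, and the following stand-in is only a hedged sketch in case Ordinal.utils is unavailable.

import numpy as np

# hypothetical stand-in for Ordinal.utils.jaggedness, assuming it matches the
# sharpness/not_smoothness measure defined in the companion scripts of these commits
def jaggedness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2
                     for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))

print(jaggedness(np.array([0.2, 0.2, 0.2, 0.2, 0.2])))  # 0.0 for a flat distribution
print(jaggedness(np.array([0.0, 0.0, 1.0, 0.0, 0.0])))  # large for a spiked one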
@ -0,0 +1,43 @@
# plots the class prevalence of the FACT expectation data, annotated with its jaggedness

import gzip
import os
import sys
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np

telescope = np.genfromtxt('fact_expectation.txt')
nclasses_telescope = len(telescope)

jag = jaggedness(telescope)
print(jag)

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.7)

# figure, axis = plt.subplots(1, 2, figsize=(8, 7))
ymax = 0.4

figure(figsize=(8, 4), dpi=300)

ax = plt.subplot(1, 1, 1)
classes = np.arange(1, nclasses_telescope+1)
plt.bar(classes, telescope, width=1)
# ax.bar_label(telescope)
ax.set_ylim(0, ymax)
ax.set_xlabel("energy bin")
ax.set_xticks(classes)
ax.set_title(f'FACT data ({jag:.4f})')
for index, data in enumerate(telescope):
    plt.text(x=index+0.56, y=data+0.005, s=f"{data:.4f}")

plt.subplots_adjust(wspace=0.1, hspace=0)
plt.savefig('telescope_prevalence.pdf', bbox_inches='tight')
@ -0,0 +1,136 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict


# this script computes the distribution of smoothness/sharpness of the books,
# considering either all books at once or different groups of book reviews (grouped by product)

# filein='/media/moreo/Volume/Datasets/Amazon/raw/Gift_Cards.json.gz'
# df = pd.read_json(filein, lines=True, compression='gzip')

read_meta = True


def prepare_vote_field(df):
    df['vote'] = df['vote'].fillna('0')
    df['vote'] = df['vote'].apply(lambda x: x.replace(',', ''))
    df['vote'] = pd.to_numeric(df['vote'])
    return df


def read_from_huge_json(filein):
    df = pd.read_json(filein, lines=True)
    df.drop(columns=[
        'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime',
        'image'
    ], inplace=True)

    df = prepare_vote_field(df)
    return df


def read_from_metadata(filein):
    df = pd.read_csv(filein)
    df['vote'] = pd.to_numeric(df['vote'])
    return df


def filter_by_vote(df, vote_threshold=1):
    df = df[df['vote'] >= vote_threshold]
    df.drop(columns=['vote'], inplace=True)
    return df


if read_meta:
    filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv'
    readfn = read_from_metadata
else:
    filein = '/media/moreo/Volume/Datasets/Amazon/raw/Books.json'
    readfn = read_from_huge_json

votes_support = 9

df = readfn(filein)

num_entries = len(df)
# df = prepare_vote_field(df)
df = filter_by_vote(df, vote_threshold=votes_support)
num_entries_with_vote = len(df)

unique_product_ids = df['asin'].unique()
num_products = len(unique_product_ids)

print(df.columns)
print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})')
print(f'num unique products {num_products}')


# df = df.groupby(df['asin'])

def not_smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))


# pass to dictionaries
df = df.reset_index()  # make sure indexes pair with number of rows

ids = df['asin'].values
overalls = df['overall'].values

allbooks_prev = np.histogram(overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
allbooks_sharpness = not_smoothness(allbooks_prev)
print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}')

import sys
sys.exit(0)

# Defining a dict
d = defaultdict(list)
for i, id in tqdm(enumerate(ids), total=len(ids), desc='passing to dictionary'):
    d[id].append(overalls[i])


by_review_support = []
by_review_support_label = []
for reviews_support in [50, 100, 1]:
    sharpness_all = []
    num_products_with_reviews = 0
    for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'):
        # ratings = df[df["asin"] == product_id]["overall"].values
        n_ratings = len(ratings)
        if n_ratings >= reviews_support:
            # print(product_id, ratings)
            prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
            sharpness = not_smoothness(prev)
            # print(prev, sharpness)
            sharpness_all.append(sharpness)
            num_products_with_reviews += 1
    by_review_support.append(sharpness_all)
    by_review_support_label.append(f'>{reviews_support}')

    print(f'#votes-support (min number of votes): {votes_support}')
    print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%')

    print(f'#reviews-support (min number of reviews): {reviews_support}')
    print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%')

    q05 = np.percentile(sharpness_all, 5)
    q25 = np.percentile(sharpness_all, 25)
    q50 = np.percentile(sharpness_all, 50)
    q75 = np.percentile(sharpness_all, 75)
    q95 = np.percentile(sharpness_all, 95)
    print(f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}')
    print(f'ave={np.mean(sharpness_all):.5f}')
    print(f'min={np.min(sharpness_all):.5f}')
    print(f'max={np.max(sharpness_all):.5f}')

#fig, ax = plt.subplots()
#ax.boxplot(by_review_support)
#ax.set_xticklabels(by_review_support_label)
#ax.set_ylabel("Sharpness")
#ax.set_xlabel("Distributions by number of reviews")
#plt.show()
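The np.histogram call above, with bin edges [0,1,2,3,4,5]+0.5 and density=True, turns a list of 1-5 star ratings into a 5-class prevalence vector: the half-integer edges centre each bin on a star value, and the unit bin width makes the densities sum to 1. A small illustration with made-up ratings (the numbers are hypothetical):

import numpy as np

ratings = np.array([5, 5, 4, 5, 3, 1, 5, 4])  # hypothetical star ratings
prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
print(prev)        # [0.125 0.    0.125 0.25  0.5  ]
print(prev.sum())  # 1.0: a proper prevalence vector over the 5 star classes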
@ -0,0 +1,209 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict

from quapy.data import LabelledCollection
from quapy.protocol import UPP

# this script computes the distribution of smoothness/sharpness of the books,
# considering either all books at once or different groups of book reviews (grouped by product).
# Mirko asked for some exploration of values (votes, num reviews), and percentiles of dataset shift as measured in
# terms of NMD between training set prevalences and sample prevalences; this script does this.
# It also generates a csv containing all the prevalence values by product.


read_meta = True


def not_smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))


def _check_arrays(prevs):
    prevs = np.asarray(prevs)
    if prevs.ndim == 1:
        prevs = prevs.reshape(1, -1)
    return prevs


# mean normalized match distance
def mnmd(prevs, prevs_hat):
    prevs = _check_arrays(prevs)
    prevs_hat = _check_arrays(prevs_hat)
    assert prevs.shape == prevs_hat.shape, f'wrong shape; found {prevs.shape} and {prevs_hat.shape}'

    nmds = [nmd(p, p_hat) for p, p_hat in zip(prevs, prevs_hat)]
    return np.mean(nmds)


# normalized match distance
def nmd(prev, prev_hat):
    n = len(prev)
    return (1./(n-1))*mdpa(prev, prev_hat)


"""
Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`.
The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be
computed efficiently.
[Mirko Bunse's code from Julia adapted]
"""
def mdpa(a, b):
    assert len(a) == len(b), "histograms have to have the same length"
    assert np.isclose(sum(a), sum(b)), f"histograms have to have the same mass (difference is {sum(a)-sum(b)})"

    # algorithm 1 in [cha2002measuring]
    prefixsum = 0.0
    distance = 0.0
    for i in range(len(a)):
        prefixsum += a[i] - b[i]
        distance += abs(prefixsum)

    return distance / sum(a)  # the normalization is a fix to the original MDPA


def prepare_vote_field(df):
    df['vote'] = df['vote'].fillna('0')
    df['vote'] = df['vote'].apply(lambda x: x.replace(',', ''))
    df['vote'] = pd.to_numeric(df['vote'])
    return df


def read_from_huge_json(filein):
    df = pd.read_json(filein, lines=True)
    df.drop(columns=[
        'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime',
        'image'
    ], inplace=True)

    df = prepare_vote_field(df)
    return df


def read_from_metadata(filein):
    df = pd.read_csv(filein)
    df['vote'] = pd.to_numeric(df['vote'])
    return df


def filter_by_vote(df, vote_threshold=1):
    df = df[df['vote'] >= vote_threshold]
    df.drop(columns=['vote'], inplace=True)
    return df


if read_meta:
    filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv'
    readfn = read_from_metadata
else:
    filein = '/media/moreo/Volume/Datasets/Amazon/raw/Books.json'
    readfn = read_from_huge_json


def create_dictionary_bookid_ratings(df):
    # pass to dictionaries
    df = df.reset_index()  # make sure indexes pair with number of rows

    ids = df['asin'].values
    overalls = df['overall'].values

    # Defining a dict
    d = defaultdict(list)
    for i, id in tqdm(enumerate(ids), total=len(ids), desc='passing to dictionary'):
        d[id].append(overalls[i])

    return d


def get_stats(distribution, msg=''):
    # computes the mean, max, min, perc5, perc25, perc50, perc75, perc95 of the distribution
    vmean = np.mean(distribution)
    vmax = np.max(distribution)
    vmin = np.min(distribution)
    q05 = np.percentile(distribution, 5)
    q25 = np.percentile(distribution, 25)
    q50 = np.percentile(distribution, 50)
    q75 = np.percentile(distribution, 75)
    q95 = np.percentile(distribution, 95)
    print(f'{msg}: percentiles {q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}')
    print(f'{msg}: ave={np.mean(distribution):.5f}')
    print(f'{msg}: max={np.max(distribution):.5f}')
    print(f'{msg}: min={np.min(distribution):.5f}')
    return vmean, vmax, vmin, q05, q25, q50, q75, q95


with open('book_stats.csv', 'wt') as foo:
    foo.write(f'minvotes\tminreviews\t#products\t#reviews'
              f'\tsharp-ave\tsharp-max\tsharp-min\t'
              f'sharp-P5\tsharp-P25\tsharp-P50\tsharp-P75\tsharp-P95'
              f'\tshift-ave\tshift-max\tshift-min\t'
              f'shift-P5\tshift-P25\tshift-P50\tshift-P75\tshift-P95'
              f'\n')

    for votes_support in [1]:

        df = readfn(filein)
        df = df[df['overall'] > 0]  # there are a couple of reviews with 0 stars (the min should be 1)

        num_entries = len(df)
        df = filter_by_vote(df, vote_threshold=votes_support)
        num_entries_with_vote = len(df)

        unique_product_ids = df['asin'].unique()
        num_products = len(unique_product_ids)

        print(df.columns)
        print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})')
        print(f'num unique products {num_products}')

        d = create_dictionary_bookid_ratings(df)

        for reviews_support in [100]:
            with open(f'./prevalence_votes{votes_support}_reviews{reviews_support}.csv', 'wt') as fprev:
                sharpness_all = []
                num_products_with_reviews = 0
                sel_ids, sel_overalls = [], []
                for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'):
                    n_ratings = len(ratings)
                    if n_ratings >= reviews_support:
                        sel_ids.extend([product_id] * n_ratings)
                        sel_overalls.extend(ratings)

                        prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
                        for i, prev_i in enumerate(prev):
                            fprev.write(f'{prev_i:.5f}')
                            if i < len(prev)-1:
                                fprev.write('\t')
                            else:
                                fprev.write('\n')
                        sharpness = not_smoothness(prev)
                        sharpness_all.append(sharpness)
                        num_products_with_reviews += 1

                print(f'#votes-support (min number of votes): {votes_support}')
                print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%')

                print(f'#reviews-support (min number of reviews): {reviews_support}')
                print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%')

                vmean, vmax, vmin, q05, q25, q50, q75, q95 = get_stats(sharpness_all, 'sharpness')

                allbooks_prev = np.histogram(sel_overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
                allbooks_sharpness = not_smoothness(allbooks_prev)
                print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}')

                sel_collection = LabelledCollection(instances=sel_ids, labels=sel_overalls, classes=[1, 2, 3, 4, 5])
                prot = UPP(sel_collection, sample_size=1000, repeats=5000)
                prot_iterator = prot()
                shifts = []
                for _, test_prev in tqdm(prot_iterator, total=prot.total()):
                    shifts.append(nmd(allbooks_prev, prev_hat=test_prev))
                s_mean, s_max, s_min, s_q05, s_q25, s_q50, s_q75, s_q95 = get_stats(shifts, 'shift')

                foo.write(f'{votes_support}\t{reviews_support}\t{num_products_with_reviews}\t{len(sel_ids)}'
                          f'\t{vmean:.5f}\t{vmax:.5f}\t{vmin:.5f}\t'
                          f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}'
                          f'\t{s_mean:.5f}\t{s_max:.5f}\t{s_min:.5f}\t'
                          f'{s_q05:.5f}\t{s_q25:.5f}\t{s_q50:.5f}\t{s_q75:.5f}\t{s_q95:.5f}\n')
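To make the reported shift values concrete, here is a small, hedged usage example of the nmd/mdpa functions defined in the script above (it assumes those definitions are in scope; the prevalence numbers are purely illustrative):

import numpy as np

p_true = np.array([0.1, 0.2, 0.4, 0.2, 0.1])  # hypothetical "training" prevalence
p_hat = np.array([0.2, 0.2, 0.3, 0.2, 0.1])   # hypothetical sample prevalence

# mdpa accumulates |prefix sums of the differences|: 0.1 + 0.1 + 0.0 + 0.0 + 0.0 = 0.2;
# nmd divides by (n-1)=4, giving 0.05. Identical vectors give 0, and the value is at most 1.
print(nmd(p_true, p_hat))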
@ -0,0 +1,27 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import optimize


# this script checks for the prevalence values that yield the maximum or minimum values of smoothness;
# the result indicates that any linear distribution (not only the uniform one) attains the minimum sharpness
# (i.e., the maximum smoothness)

def sharpness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))

def smoothness(p):
    return 1-sharpness(p)

nclasses = 5
uniform_distribution = np.random.rand(nclasses)  # np.full(fill_value=1/nclasses, shape=nclasses)
uniform_distribution /= uniform_distribution.sum()

bounds = tuple((0, 1) for x in range(nclasses))  # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
r = optimize.minimize(sharpness, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)

print(f'minimum of sharpness function {r.x}')

r = optimize.minimize(smoothness, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'maximum of sharpness function {r.x}')
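As a quick check of that claim: every arithmetic (linear) prevalence vector has all discrete second differences equal to zero, so its sharpness is exactly 0, the minimum attainable by the formula above. A minimal verification, reusing the same sharpness definition (the specific numbers are illustrative):

import numpy as np

def sharpness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))

# a linear but non-uniform distribution over 5 classes
linear = np.array([0.1, 0.15, 0.2, 0.25, 0.3])
print(linear.sum())       # 1.0, a valid prevalence vector
print(sharpness(linear))  # 0.0: every second difference vanishes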
@ -0,0 +1,105 @@
import gzip
import os
from collections import Counter
from Ordinal.utils import jaggedness
import quapy as qp
import pickle
import numpy as np
import pandas as pd

base_path = '/media/moreo/Volume/Datasets/Amazon/reviews'
categories_path = '/media/moreo/Volume/Datasets/Amazon/raw/amazon_categories.txt'


def get_prevalence_merchandise(category):
    input_file = os.path.join(base_path, category+'.txt.gz')
    labels = []
    print(f'{category} starts')
    with gzip.open(input_file, 'rt') as f:
        for line in f:
            try:
                stars, doc = line.split('\t')
                labels.append(stars)
            except:
                print('error in line: ', line)
    counts = Counter(labels)
    print(f'\t{category} done')
    return counts


target_file = './counters_Amazon_merchandise.pkl'

if not os.path.exists(target_file):
    categories = [c.strip().replace(' ', '_') for c in open(categories_path, 'rt').readlines()]

    # categories = ['Gift_Cards', 'Magazine_Subscriptions']
    counters = qp.util.parallel(get_prevalence_merchandise, categories, n_jobs=-1)

    print('saving pickle')
    pickle.dump((categories, counters), open(target_file, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
    (categories, counters) = pickle.load(open(target_file, 'rb'))

index_gift_cards = categories.index('Gift_Cards')
del categories[index_gift_cards]
del counters[index_gift_cards]

class_smooth = []
for cat, counter in zip(categories, counters):
    total = sum(count for label, count in counter.items())
    counts = [counter[i] for i in map(str, [1, 2, 3, 4, 5])]
    p = np.asarray(counts)/total
    smooth = jaggedness(p)
    class_smooth.append([smooth, cat, p])

class_smooth = sorted(class_smooth)

# df = pd.DataFrame(class_smooth, columns=['smoothness', 'category', 'prevalence'])

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.5)

nrows = 7
ncols = 4
figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
with open('categories.txt', 'wt') as foo:
    foo.write(f'Category\tSmooth\tPrevalence\n')
    for i, (smooth, category, prevalence) in enumerate(class_smooth):
        row = i // 4
        col = i % 4
        # print(i, row, col)
        axis[row, col].bar([1, 2, 3, 4, 5], prevalence, width=1)
        axis[row, col].set_ylim(0, 0.75)
        axis[row, col].set_facecolor('white')
        for spine in axis[row, col].spines.values():
            spine.set_edgecolor('black')
            spine.set_linewidth(0.3)
        # axis[row, col].set_xticks(loc=0)
        if row == 6:
            axis[row, col].set_xlabel("stars")
            # axis[row, col].set_xticks([1,2,3,4,5])
        # else:
        #     axis[row, col].set_xticks([])
        if col == 0:
            axis[row, col].set_ylabel("")
            axis[row, col].set_yticks([])
        else:
            axis[row, col].set_ylabel("")
            axis[row, col].set_yticks([])

        category = category.replace('_', ' ').title()
        category = category.replace(' And ', ' & ')
        axis[row, col].set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)
        # axis[row, col].set_title

        foo.write(f'{category}\t{smooth}\t{prevalence}\n')

# plt.show()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Amazon_categories_plotgrid.pdf', bbox_inches='tight')
@ -0,0 +1,147 @@
import gzip
import quapy as qp
from Ordinal.utils import load_simple_sample_raw
from quapy.data import LabelledCollection
import quapy.functional as F
import os
from os.path import join
from pathlib import Path
import numpy as np


datadir = '/media/moreo/Volume/Datasets/Amazon/reviews'
outdir = './data/'
real_prev_path = './data/Books-real-prevalence-by-product_votes1_reviews100.csv'
domain = 'Books'
seed = 7

tr_size = 20000
val_size = 1000
te_size = 1000
nval = 1000
nte = 5000


def from_text(path, encoding='utf-8', class2int=True):
    """
    Reads a labelled collection of documents.
    File format <0-4>\t<document>\n

    :param path: path to the labelled collection
    :param encoding: the text encoding used to open the file
    :return: a list of sentences, and a list of labels
    """
    all_sentences, all_labels = [], []
    file = open(path, 'rt', encoding=encoding).readlines()
    for line in file:
        line = line.strip()
        if line:
            try:
                label, sentence = line.split('\t')
                sentence = sentence.strip()
                if class2int:
                    label = int(label)
                if label >= 0:
                    if sentence:
                        all_sentences.append(sentence)
                        all_labels.append(label)
            except ValueError:
                print(f'format error in {line}')
    return all_sentences, all_labels


def write_txt_sample(sample: LabelledCollection, path):
    os.makedirs(Path(path).parent, exist_ok=True)
    with open(path, 'wt') as foo:
        for document, label in zip(*sample.Xy):
            foo.write(f'{label}\t{document}\n')


def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, prev in enumerate(F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)):
            sample = pool.sampling(sample_size, *prev)
            write_txt_sample(sample, join(outdir, f'{i}.txt'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')


def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, sample in enumerate(pool.natural_sampling_generator(sample_size, repeats=nsamples)):
            write_txt_sample(sample, join(outdir, f'{i}.txt'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')


def gen_samples_real_prevalences(real_prevalences, pool: LabelledCollection, sample_size, outdir, prevpath_out):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath_out, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, prev in enumerate(real_prevalences):
            sample = pool.sampling(sample_size, *prev[:-1])
            write_txt_sample(sample, join(outdir, f'{i}.txt'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')


# fullpath = join(datadir,domain)+'.txt.gz' <- deprecated; there were duplicates
# data = LabelledCollection.load(fullpath, from_gz_text)

fullpath = './data/Books/Books.txt'
data = LabelledCollection.load(fullpath, from_text)

print(len(data))
print(data.classes_)
print(data.prevalence())

with qp.util.temp_seed(seed):
    train, rest = data.split_stratified(train_prop=tr_size)

    devel, test = rest.split_stratified(train_prop=0.5)
    print(len(train))
    print(len(devel))
    print(len(test))

    domaindir = join(outdir, domain)

    write_txt_sample(train, join(domaindir, 'training_data.txt'))
    write_txt_sample(devel, join(domaindir, 'development_data.txt'))
    write_txt_sample(test, join(domaindir, 'test_data.txt'))

    # this part is to be used when the partitions have already been created, in order to avoid re-generating them
    # train = load_simple_sample_raw(domaindir, 'training_data')
    # devel = load_simple_sample_raw(domaindir, 'development_data')
    # test = load_simple_sample_raw(domaindir, 'test_data')

    gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
                    prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
    gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
                    prevpath=join(domaindir, 'app', 'test_prevalences.txt'))

    # gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
    #                 prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
    # gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
    #                 prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))

    # this part generates samples based on real prevalences (in this case, prevalences of sets of book reviews
    # grouped by product). It loads the real prevalences (computed elsewhere), and randomly extracts 5000 for test
    # and 1000 for val (disjoint). Then it realizes the samplings.

    assert os.path.exists(real_prev_path), f'real prevalence file does not seem to exist...'
    real_prevalences = np.genfromtxt(real_prev_path, delimiter='\t')

    nrows = real_prevalences.shape[0]
    rand_sel = np.random.permutation(nrows)
    real_prevalences_val = real_prevalences[rand_sel[:nval]]
    real_prevalences_te = real_prevalences[rand_sel[nval:nval+nte]]

    gen_samples_real_prevalences(real_prevalences_val, devel, sample_size=val_size, outdir=join(domaindir, 'real', 'dev_samples'),
                                 prevpath_out=join(domaindir, 'real', 'dev_prevalences.txt'))
    gen_samples_real_prevalences(real_prevalences_te, test, sample_size=te_size, outdir=join(domaindir, 'real', 'test_samples'),
                                 prevpath_out=join(domaindir, 'real', 'test_prevalences.txt'))
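gen_samples_APP draws the target prevalence vectors uniformly at random from the probability simplex via F.uniform_simplex_sampling. For reference, here is a minimal sketch of one standard way to do this (the sorted-uniform, or Kraemer, method); whether QuaPy's implementation follows exactly this variant is an assumption here, not something stated in this diff.

import numpy as np

def uniform_simplex_sampling_sketch(n_classes, size):
    # draw `size` points uniformly from the (n_classes-1)-simplex: sort n_classes-1
    # uniform variates and take consecutive differences against 0 and 1
    u = np.sort(np.random.rand(size, n_classes - 1), axis=-1)
    padded = np.concatenate([np.zeros((size, 1)), u, np.ones((size, 1))], axis=-1)
    return np.diff(padded, axis=-1)

prevs = uniform_simplex_sampling_sketch(n_classes=5, size=3)
print(prevs)
print(prevs.sum(axis=1))  # each row sums to 1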
@ -0,0 +1,116 @@
import gzip
import quapy as qp
import numpy as np
import pandas as pd
from quapy.data import LabelledCollection
import quapy.functional as F
import os
from os.path import join
from pathlib import Path
import pickle


datadir = '../OrdinalQuantification'
outdir = './data/'
domain = 'fact'
seed = 7

tr_size = 20000
val_size = 1000
te_size = 1000
nval = 1000
nte = 5000


def from_csv(path):
    df = pd.read_csv(path)

    # divide the continuous labels into ordered classes
    energy_boundaries = np.arange(start=2.4, stop=4.2, step=0.15)[1:-1]
    y = np.digitize(np.array(df['log10_energy'], dtype=np.float32), energy_boundaries)

    # note: omitting the dtype will result in a single instance having a different class

    # obtain a matrix of shape (n_samples, n_features)
    X = df.iloc[:, 1:].to_numpy().astype(np.float32)
    return X, y


def write_pkl(sample: LabelledCollection, path):
    os.makedirs(Path(path).parent, exist_ok=True)
    pickle.dump(sample, open(path, 'wb'), pickle.HIGHEST_PROTOCOL)


def gen_samples_APP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, prev in enumerate(F.uniform_simplex_sampling(n_classes=pool.n_classes, size=nsamples)):
            sample = pool.sampling(sample_size, *prev)
            write_pkl(sample, join(outdir, f'{i}.pkl'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')


def gen_samples_NPP(pool: LabelledCollection, nsamples, sample_size, outdir, prevpath):
    os.makedirs(outdir, exist_ok=True)
    with open(prevpath, 'wt') as prevfile:
        prevfile.write('id,' + ','.join(f'{c}' for c in pool.classes_) + '\n')
        for i, sample in enumerate(pool.natural_sampling_generator(sample_size, repeats=nsamples)):
            write_pkl(sample, join(outdir, f'{i}.pkl'))
            prevfile.write(f'{i},' + ','.join(f'{p:.3f}' for p in sample.prevalence()) + '\n')


fullpath = join(datadir, domain, 'fact_wobble.csv')

data = LabelledCollection.load(fullpath, from_csv)

if np.isnan(data.instances).any():
    rows, cols = np.where(np.isnan(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted nan rows')

if np.isnan(data.instances).any():
    rows, cols = np.where(np.isnan(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted nan rows')

if np.isinf(data.instances).any():
    rows, cols = np.where(np.isinf(data.instances))
    data.instances = np.delete(data.instances, rows, axis=0)
    data.labels = np.delete(data.labels, rows, axis=0)
    print('deleted inf rows')


print(len(data))
print(data.classes_)
print(data.prevalence())

with qp.util.temp_seed(seed):
    train, rest = data.split_stratified(train_prop=tr_size)

    devel, test = rest.split_stratified(train_prop=0.5)
    print(len(train))
    print(len(devel))
    print(len(test))

    domaindir = join(outdir, domain)

    write_pkl(train, join(domaindir, 'training_data.pkl'))
    write_pkl(devel, join(domaindir, 'development_data.pkl'))
    write_pkl(test, join(domaindir, 'test_data.pkl'))

    gen_samples_APP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'app', 'dev_samples'),
                    prevpath=join(domaindir, 'app', 'dev_prevalences.txt'))
    gen_samples_APP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'app', 'test_samples'),
                    prevpath=join(domaindir, 'app', 'test_prevalences.txt'))

    gen_samples_NPP(devel, nsamples=nval, sample_size=val_size, outdir=join(domaindir, 'npp', 'dev_samples'),
                    prevpath=join(domaindir, 'npp', 'dev_prevalences.txt'))
    gen_samples_NPP(test, nsamples=nte, sample_size=te_size, outdir=join(domaindir, 'npp', 'test_samples'),
                    prevpath=join(domaindir, 'npp', 'test_prevalences.txt'))
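The from_csv reader above turns the continuous log10-energy values into ordered classes with np.digitize: values below the first inner boundary fall in class 0, values above the last one in the highest class. A small illustration of the binning (the example energy values are made up):

import numpy as np

# inner bin boundaries, as in from_csv above
energy_boundaries = np.arange(start=2.4, stop=4.2, step=0.15)[1:-1]
print(len(energy_boundaries) + 1)  # number of ordered classes induced by the boundaries

# hypothetical log10-energy values mapped to their class indices
examples = np.array([2.3, 2.8, 3.3, 4.1], dtype=np.float32)
print(np.digitize(examples, energy_boundaries))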
@ -0,0 +1,50 @@
import numpy as np


# smoothing approximation
def smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))


def _check_arrays(prevs):
    prevs = np.asarray(prevs)
    if prevs.ndim == 1:
        prevs = prevs.reshape(1, -1)
    return prevs


# mean normalized match distance
def mnmd(prevs, prevs_hat):
    prevs = _check_arrays(prevs)
    prevs_hat = _check_arrays(prevs_hat)
    assert prevs.shape == prevs_hat.shape, f'wrong shape; found {prevs.shape} and {prevs_hat.shape}'

    nmds = [nmd(p, p_hat) for p, p_hat in zip(prevs, prevs_hat)]
    return np.mean(nmds)


# normalized match distance
def nmd(prev, prev_hat):
    n = len(prev)
    return (1./(n-1))*mdpa(prev, prev_hat)


"""
Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`.
The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be
computed efficiently.
[Mirko Bunse's code from Julia adapted]
"""
def mdpa(a, b):
    assert len(a) == len(b), "histograms have to have the same length"
    assert np.isclose(sum(a), sum(b)), f"histograms have to have the same mass (difference is {sum(a)-sum(b)})"

    # algorithm 1 in [cha2002measuring]
    prefixsum = 0.0
    distance = 0.0
    for i in range(len(a)):
        prefixsum += a[i] - b[i]
        distance += abs(prefixsum)

    return distance / sum(a)  # the normalization is a fix to the original MDPA
@ -0,0 +1,151 @@
import numpy as np
import quapy as qp
import os
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from Ordinal.model import LogisticAT, LogisticSE, LogisticIT, LAD, OrdinalRidge  # , RegressionQuantification
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC
from os.path import join
from utils import load_samples_folder, load_single_sample_pkl
from Ordinal.evaluation import nmd, mnmd
from tqdm import tqdm


"""
This script generates all results from Table 1 in the paper, i.e., all results comparing quantifiers equipped with
standard logistic regression against quantifiers equipped with order-aware classifiers
"""

def quantifiers():
    params_LR = {'C': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_Ridge = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced'], 'normalize': [True, False]}

    # baselines
    yield 'CC(LR)', CC(LogisticRegression()), params_LR
    yield 'PCC(LR)', PCC(LogisticRegression()), params_LR
    yield 'ACC(LR)', ACC(LogisticRegression()), params_LR
    yield 'PACC(LR)', PACC(LogisticRegression()), params_LR
    yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
    yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
    yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
    yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
    yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
    yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR

    # yield 'CC(OLR-SE)', CC(LogisticSE()), params_OLR
    # yield 'PCC(OLR-SE)', PCC(LogisticSE()), params_OLR
    # yield 'ACC(OLR-SE)', ACC(LogisticSE()), params_OLR
    # yield 'PACC(OLR-SE)', PACC(LogisticSE()), params_OLR
    # yield 'SLD(OLR-SE)', EMQ(LogisticSE()), params_OLR

    yield 'CC(OLR-IT)', CC(LogisticIT()), params_OLR
    yield 'PCC(OLR-IT)', PCC(LogisticIT()), params_OLR
    yield 'ACC(OLR-IT)', ACC(LogisticIT()), params_OLR
    yield 'PACC(OLR-IT)', PACC(LogisticIT()), params_OLR
    yield 'SLD(OLR-IT)', EMQ(LogisticIT()), params_OLR
    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/)
    yield 'CC(LAD)', CC(LAD()), params_SVR
    yield 'ACC(LAD)', ACC(LAD()), params_SVR

    yield 'CC(ORidge)', CC(OrdinalRidge()), params_Ridge
    yield 'ACC(ORidge)', ACC(OrdinalRidge()), params_Ridge


def run_experiment(params):
    qname, q, param_grid = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.all.APP-OQ.csv')
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for all-drift')


    def load_test_samples():
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=5000):
            if posfix == '-std':
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()


    def load_dev_samples():
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=None, load_fn=load_sample_fn), total=1000):
            if posfix == '-std':
                sample.instances = zscore.transform(sample.instances)
            yield sample.instances, sample.prevalence()

    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        timeout=60*60*2,
        verbose=True).fit(train)

    hyperparams = f'{qname}\tall\t{q.best_params_}\t{q.best_score_}'

    print('[done]')

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    # print('[learning regressor-based adjustment]')
    # q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    # q.fit(None)

    # report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    # mean_nmd = report['nmd'].mean()
    # std_nmd = report['nmd'].std()
    # print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    # resultfile = join(resultpath, f'{qname}.all.reg.csv')
    # report.to_csv(resultfile, index=False)

    return hyperparams


if __name__ == '__main__':
    domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
    # domain = 'Books-tfidf'
    posfix = ''

    # domain = 'fact'
    # posfix = '-std'  # set to '' to avoid standardization
    # posfix = ''

    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = load_sample_fn(join(datapath, domain), 'training_data')

    if posfix == '-std':
        zscore = StandardScaler()
        train.instances = zscore.fit_transform(train.instances)

    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
        hypers = qp.util.parallel(run_experiment, quantifiers(), n_jobs=-3)
        for h in hypers:
            if h is not None:
                foo.write(h)
                foo.write('\n')
@ -0,0 +1,137 @@
import numpy as np
from scipy.stats import wilcoxon

import quapy as qp
import os
from os.path import join

from Ordinal.tabular import Table
from utils import load_samples_folder, load_single_sample_pkl, jaggedness
from Ordinal.evaluation import nmd, mnmd
from tqdm import tqdm
import pandas as pd
from glob import glob
from pathlib import Path


"""
This script takes all results from the book domain that correspond to the APP protocol, filters them by
smoothness so that only the 50% smoothest examples are considered, and recomputes the averages of the nmd,
thus effectively reporting the results for the APP-OQ protocol
"""

def parse_str_prev(df_col):
    values = df_col.values
    array_list = [np.fromstring(array[1:-1], sep=' ') for array in values]
    return np.asarray(array_list)

def parse_result_file(path):
    df = pd.read_csv(path)
    true_prev = parse_str_prev(df['true-prev'])
    estim_prev = parse_str_prev(df['estim-prev'])
    nmd = df['nmd'].values
    return true_prev, estim_prev, nmd

def ave_jaggedness(prevs, less_percentile=1):
    jag = np.sort([jaggedness(p) for p in prevs])
    up_to = int(less_percentile * len(jag))
    return np.mean(jag[:up_to])


def retain_half_smoothest(true_prev, estim_prev, nmd):
    jag = [jaggedness(p) for p in true_prev]
    order = np.argsort(jag)
    up_to = len(order)//2
    order = order[:up_to]
    return true_prev[order], estim_prev[order], nmd[order]


def compute_half_smoothest_nmd(true_prev, estim_prev, nmd):
    _, _, nmd_smooth = retain_half_smoothest(true_prev, estim_prev, nmd)
    return nmd_smooth


if __name__ == '__main__':
    domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
    datapath = './data'
    in_protocol = 'app'
    out_protocol = 'app-oq'
    in_result_path = join('./results', domain, in_protocol)
    out_result_path = join('./results', domain, out_protocol)
    os.makedirs(out_result_path, exist_ok=True)

    # recompute the results in terms of APP-OQ
    result_dict = {}
    for filepath in glob(f'{in_result_path}/*).all.csv'):
        name = Path(filepath).name
        quantifier = name[:name.index('(')]
        classifier = name[name.index('(')+1:name.index(')')]

        true_prev, estim_prev, nmds = parse_result_file(filepath)
        nmds = compute_half_smoothest_nmd(true_prev, estim_prev, nmds)

        result_dict[classifier + '-' + quantifier] = nmds

    # convert to numbers and search for the best in each quantifier
    best_keys = {}
    best_nmds = {}
    for quantifier in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
        best_ave, best_key, best_nmd = None, None, None
        for classifier in ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']:
            key = classifier + '-' + quantifier
            if key in result_dict:
                nmds = result_dict[key]
                mean_val = np.mean(nmds)
                if best_ave is None or mean_val < best_ave:
                    best_ave = mean_val
                    best_key = key
                    best_nmd = nmds
        best_keys[quantifier] = best_key
        best_nmds[quantifier] = best_nmd

    # print(best_keys)

    # write a latex table
    for q in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
        print('& \multicolumn{2}{c}{'+q+'} ', end='')
    print('\\\\')
    print('\\midrule')
    for classifier in ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']:
        print(classifier + '\t', end='')
        for quantifier in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
            key = classifier + '-' + quantifier
            the_best_nmds = best_nmds[quantifier]

            if key in result_dict:
                nmds = result_dict[key]
                mean_val = np.mean(nmds)

                bold = False
                if best_keys[quantifier] == key:
                    bold = True
                else:
                    _, pval = wilcoxon(nmds, the_best_nmds)
                    if pval > 0.01:
                        bold = True

                str_mean = f'{mean_val:.4f}'
                if bold:
                    str_mean = '\\textbf{' + str_mean + '}'

                if classifier == 'LR':
                    std_val = np.std(nmds)
                    str_val = f'{str_mean} & $\pm {std_val:.4f}$'
                else:
                    rel_increment = 100 * (mean_val-np.mean(the_best_nmds)) / np.mean(the_best_nmds)
                    sign = '+' if rel_increment > 0 else ''
                    str_val = f'{str_mean} & ({sign}{rel_increment:.1f}\\%)'
            else:
                str_val = '\multicolumn{2}{c}{---}'

            str_val = ' & ' + str_val

            print(str_val, end='')
        print('\\\\')
@ -0,0 +1,105 @@
import csv
import sys
import datasets
import numpy as np
import pandas as pd
import torch.cuda
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer
from transformers import TrainingArguments


"""
This script fine-tunes a pre-trained language model on a given textual training set.
The training goes for a maximum of 5 epochs, but stores the model parameters of the best performing epoch according
to the validation loss in a hold-out val split of 1000 documents (stratified).

We used it with RoBERTa in the training set of the Amazon-OQ-BK domain, i.e.:
$> python3 ./data/Books/training_data.txt roberta-base
"""


def tokenize_function(example):
    tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else 256)
    return tokens


def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    return {
        'macro-f1': f1_score(labels, preds, average='macro'),
        'micro-f1': f1_score(labels, preds, average='micro'),
    }


if __name__ == '__main__':
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    # datapath = './data/Books/training_data.txt'
    # checkpoint = 'roberta-base'
    n_args = len(sys.argv)
    assert n_args == 3, 'wrong arguments, expected: <training-path> <transformer-name>'

    datapath = sys.argv[1]  # './data/Books/training_data.txt'
    checkpoint = sys.argv[2]  # e.g., 'bert-base-uncased' or 'distilbert-base-uncased' or 'roberta-base'

    modelout = checkpoint+'-finetuned-new'

    # load the training set, and extract a held-out validation split of 1000 documents (stratified)
    df = pd.read_csv(datapath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
    labels = df['labels'].to_frame()
    X_train, X_val = train_test_split(df, stratify=labels, test_size=.25, random_state=1)
    num_labels = len(pd.unique(labels['labels']))

    features = datasets.Features({'labels': datasets.Value('int32'), 'review': datasets.Value('string')})
    train = Dataset.from_pandas(df=X_train, split='train', features=features)
    validation = Dataset.from_pandas(df=X_val, split='validation', features=features)

    dataset = DatasetDict({
        'train': train.select(range(500)) if debug else train,
        'validation': validation.select(range(500)) if debug else validation
    })

    # tokenize the dataset
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

    # fine-tuning
    training_args = TrainingArguments(
        modelout,
        learning_rate=2e-5,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        save_strategy='epoch',
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # eval_steps=10,
        save_total_limit=1,
        load_best_model_at_end=True
    )
    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=tokenized_datasets['train'],
        eval_dataset=tokenized_datasets['validation'],
        data_collator=DataCollatorWithPadding(tokenizer),
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    trainer.train()
@ -0,0 +1,70 @@
import pandas as pd
from os.path import join
import os
from glob import glob
from pathlib import Path

from Ordinal.main import quantifiers
from Ordinal.tabular import Table

"""
This script generates some tables for Amazon-OQ-BK (for internal use only)
"""

domain = 'Books-tfidf'
domain_bert_last = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
domain_bert_ave = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
domain_bert_post = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'

resultpath = join('./results', domain, prot)
resultpath_bertlast = join('./results', domain_bert_last, prot)
resultpath_bertave = join('./results', domain_bert_ave, prot)
resultpath_bertpost = join('./results', domain_bert_post, prot)

methods = [qname for qname, *_ in quantifiers()]
methods += ['SLD(LR)-agg']
methods_Rlast = [m+'-RoBERTa-last' for m in methods]
methods_Rave = [m+'-RoBERTa-average' for m in methods]
methods_Rpost = [m+'-RoBERTa-posteriors' for m in methods]
methods = methods + methods_Rlast + methods_Rave + methods_Rpost
# methods += [m+'-r' for m in methods]

table = Table(benchmarks=['low', 'mid', 'high', 'all'], methods=methods, prec_mean=4, show_std=True, prec_std=4)

resultfiles = list(glob(f'{resultpath}/*.csv')) \
              + list(glob(f'{resultpath_bertlast}/*.csv')) \
              + list(glob(f'{resultpath_bertave}/*.csv')) \
              + list(glob(f'{resultpath_bertpost}/*.csv'))

for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    resultname = Path(resultfile).name
    method, drift, *other = resultname.replace('.csv', '').split('.')
    if other:
        method += '-r'
    if method not in methods:
        continue

    table.add(drift, method, nmd)

os.makedirs(Path(outpath).parent, exist_ok=True)

tabular = """
\\resizebox{\\textwidth}{!}{%
\\begin{tabular}{|c||""" + ('c|' * (table.nbenchmarks)) + """} \hline
"""
tabular += table.latexTabularT(average=False)
tabular += """
\end{tabular}%
}"""

print('saving table in', outpath)
with open(outpath, 'wt') as foo:
    foo.write(tabular)
    foo.write('\n')

print('[done]')
@ -0,0 +1,82 @@
import pandas as pd
from os.path import join
import os
from glob import glob
from pathlib import Path

from Ordinal.experiments_lr_vs_ordlr import quantifiers
from Ordinal.tabular import Table

"""
This script generates some tables for Fact-OQ (for internal use only)
"""

#domain = 'fact'
#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
prot = 'app'
outpath = f'./tables/{domain}/{prot}/results.tex'

resultpath = join('./results', domain, prot)

withstd = False

methods = [qname for qname, *_ in quantifiers()]
if withstd:
    methods = [m+'-std' for m in methods]
#methods = methods + methods_variant
# methods += [m+'-r' for m in methods]

quantifiers_families = ['CC', 'PCC', 'ACC', 'PACC', 'SLD']
# method_variants = ['LR', 'OLR-AT', 'OLR-SE', 'OLR-IT', 'ORidge', 'LAD']
method_variants = ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']
if withstd:
    method_variants = [m+'-std' for m in method_variants]

print('families:', quantifiers_families)
print('variants', method_variants)
table = Table(benchmarks=quantifiers_families, methods=method_variants, prec_mean=4, show_std=True, prec_std=4,
              color=False, show_rel_to=0, missing_str='\\multicolumn{1}{c}{---}', clean_zero=True)

resultfiles = list(glob(f'{resultpath}/*).all.csv'))

for resultfile in resultfiles:
    df = pd.read_csv(resultfile)
    nmd = df['nmd'].values
    resultname = Path(resultfile).name

    method, drift, *other = resultname.replace('.csv', '').replace('-RoBERTa-average', '').split('.')
    if drift != 'all':
        continue
    if other:
        method += '-r'
    if method not in methods:
        continue

    family, variant = method.split('(')
    variant = variant.replace(')', '')
    if variant not in method_variants:
        continue
    table.add(family, variant, nmd)

os.makedirs(Path(outpath).parent, exist_ok=True)

tabular = """
\\resizebox{\\textwidth}{!}{%

\\begin{tabular}{c""" + ('l' * (table.nbenchmarks)) + """}
\\toprule
"""

tabular += table.latexTabularT(average=False)
tabular += """
\\end{tabular}%
}"""

print('saving table in', outpath)
with open(outpath, 'wt') as foo:
    foo.write(tabular)
    foo.write('\n')

print('[done]')
@ -0,0 +1,152 @@
import sys
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from os.path import join
import os
import shutil
from tqdm import tqdm

from Ordinal.utils import load_samples_folder, load_single_sample_as_csv


"""
This script takes a pre-trained model (a fine-tuned one) and generates numerical representations for all
samples in the dataset. The representations are saved in npy-txt plain format.
"""


def tokenize_function(example):
    tokens = tokenizer(example['review'], padding='max_length', truncation=True, max_length=64 if debug else None, return_tensors='pt')
    return {
        'input_ids': tokens.input_ids.cuda(),
        'attention_mask': tokens.attention_mask.cuda()
    }


def save_samples_as_txt(tensors, labels, path):
    vectors = tensors
    labels = labels.values
    vec_lab = np.hstack([labels, vectors])
    n_cols = vectors.shape[1]
    np.savetxt(path, vec_lab, fmt=['%d']+['%f']*n_cols)


def transform_sample(instances, labels, outpath, batch_size=50):
    ndocs = len(labels)
    batches = ndocs // batch_size
    assert ndocs % batch_size == 0, 'fragmented last batch not supported'

    transformations = []
    for batch_id in range(0, ndocs, batch_size):

        batch_instances = instances[batch_id:batch_id + batch_size]

        tokenized_dataset = tokenize_function(batch_instances)
        out = model(**tokenized_dataset, output_hidden_states=True)

        if generation_mode == 'posteriors':
            logits = out.logits
            posteriors = torch.softmax(logits, dim=-1)
            transformed = posteriors
        elif generation_mode == 'last':
            hidden_states = out.hidden_states
            last_layer_cls = hidden_states[-1][:, 0, :]
            transformed = last_layer_cls
        elif generation_mode == 'average':
            hidden_states = out.hidden_states
            hidden_states = torch.stack(hidden_states)
            all_layer_cls = hidden_states[:, :, 0, :]
            average_cls = torch.mean(all_layer_cls, dim=0)
            transformed = average_cls
        else:
            raise NotImplementedError()

        transformations.append(transformed.cpu().numpy())

    transformations = np.vstack(transformations)
    save_samples_as_txt(transformations, labels, outpath)


def transform_folder_samples(protocol, splitname, skip=0):
    in_folder = join(datapath, domain, protocol, splitname)
    out_folder = join(datapath, outname, protocol, splitname)
    total = 1000 if splitname.startswith('dev') else 5000

    for i, (instances, labels) in tqdm(enumerate(
            load_samples_folder(in_folder, load_fn=load_single_sample_as_csv)), desc=f'{protocol} {splitname}', total=total):
        if i >= skip:
            transform_sample(instances, labels, outpath=join(out_folder, f'{i}.txt'))


def get_best_checkpoint(checkpointdir):
    from glob import glob
    steps = []
    for folder in glob(f'{checkpointdir}/checkpoint-*'):
        step = int(folder.split('checkpoint-')[1])
        steps.append(step)
    assert len(steps) <= 2, 'unexpected number of steps, only two were expected (the best one and the last one)'
    chosen = f'{checkpointdir}/checkpoint-{min(steps)}'
    print(f'chosen checkpoint is {chosen}')
    return chosen


if __name__ == '__main__':
    debug = False
    assert torch.cuda.is_available(), 'cuda is not available'

    #checkpoint='roberta-base-val-finetuned'
    #generation_mode = 'average'  # ave seemed to work slightly better

    n_args = len(sys.argv)
    assert n_args==3, 'wrong arguments, expected: <checkpoint> <generation-mode>\n' \
                      '\tgeneration-mode: last (last layer), ave (average pooling), or posteriors (posterior probabilities)'

    checkpoint = sys.argv[1]  # e.g., 'bert-base-uncased'
    generation_mode = sys.argv[2]  # e.g., 'average' # ave seemed to work slightly better

    assert 'finetuned' in checkpoint, 'looks like this model is not finetuned'

    checkpoint = get_best_checkpoint(checkpoint)

    num_labels = 5

    datapath = './data'
    domain = 'Books'
    protocols = ['real', 'app']  # ['app', 'npp']

    assert generation_mode in ['last', 'average', 'posteriors'], 'unknown generation_mode'
    outname = domain + f'-{checkpoint}-{generation_mode}'

    with torch.no_grad():
        print('loading', checkpoint)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels).cuda()

        os.makedirs(join(datapath, outname), exist_ok=True)

        print('transforming the training set')
        instances, labels = load_single_sample_as_csv(join(datapath, domain), 'training_data')
        transform_sample(instances, labels, join(datapath, outname, 'training_data.txt'))
        print('[done]')

        for protocol in protocols:
            in_path = join(datapath, domain, protocol)
            out_path = join(datapath, outname, protocol)
            os.makedirs(out_path, exist_ok=True)
            os.makedirs(join(out_path, 'dev_samples'), exist_ok=True)
            os.makedirs(join(out_path, 'test_samples'), exist_ok=True)
            shutil.copyfile(join(in_path, 'dev_prevalences.txt'), join(out_path, 'dev_prevalences.txt'))
            shutil.copyfile(join(in_path, 'test_prevalences.txt'), join(out_path, 'test_prevalences.txt'))

            print('processing', protocol)
            transform_folder_samples(protocol, 'dev_samples')
            transform_folder_samples(protocol, 'test_samples')
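Editor's note: the three generation_mode options above reduce the model output to one vector per document: class posteriors from the logits, the [CLS] embedding of the last layer, or the [CLS] embedding averaged over all layers. The following is a tiny self-contained sketch of the same pooling logic on dummy tensors (no tokenizer or model required); shapes and values are illustrative only.

# Standalone illustration of the three pooling modes on dummy hidden states
# (a tuple of layer tensors shaped [batch, seq_len, dim], as returned with output_hidden_states=True).
import torch

batch, seq_len, dim, n_layers, n_classes = 4, 16, 8, 3, 5
hidden_states = tuple(torch.randn(batch, seq_len, dim) for _ in range(n_layers))
logits = torch.randn(batch, n_classes)

posteriors = torch.softmax(logits, dim=-1)                          # 'posteriors': [batch, n_classes]
last_cls = hidden_states[-1][:, 0, :]                               # 'last': CLS token of the last layer
average_cls = torch.stack(hidden_states)[:, :, 0, :].mean(dim=0)    # 'average': CLS averaged over layers

print(posteriors.shape, last_cls.shape, average_cls.shape)          # (4, 5) (4, 8) (4, 8)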
@ -0,0 +1,156 @@
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import quapy as qp
import numpy as np

from Ordinal.model import OrderedLogisticRegression, LogisticAT
from quapy.method.aggregative import PACC, CC, EMQ, PCC, ACC, SLD, HDy
from quapy.data import LabelledCollection
from os.path import join
import os
from utils import load_samples_folder, load_simple_sample_npytxt, load_single_sample_pkl
from evaluation import nmd, mnmd
from time import time
import pickle
from tqdm import tqdm
import mord
# note: run_experiment below also relies on RegressionQuantification, defined in the model module of this
# package; it should be imported from there as well


def quantifiers():
    params_LR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    # params_OLR = {'alpha':np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_OLR = {'alpha': np.logspace(-3, 3, 7), 'class_weight': [None, 'balanced']}
    params_SVR = {'C': np.logspace(-3,3,7), 'class_weight': [None, 'balanced']}
    # params_SVR = {'C': np.logspace(0, 1, 2)}

    # baselines
    yield 'CC(LR)', CC(LogisticRegression()), params_LR
    yield 'PCC(LR)', PCC(LogisticRegression()), params_LR
    yield 'ACC(LR)', ACC(LogisticRegression()), params_LR
    yield 'PACC(LR)', PACC(LogisticRegression()), params_LR
    #yield 'HDy(LR)', HDy(LogisticRegression()), params_LR
    yield 'SLD(LR)', EMQ(LogisticRegression()), params_LR

    # with order-aware classifiers
    # threshold-based ordinal regression (see https://pythonhosted.org/mord/)
    #yield 'CC(OLR-AT)', CC(LogisticAT()), params_OLR
    #yield 'PCC(OLR-AT)', PCC(LogisticAT()), params_OLR
    #yield 'ACC(OLR-AT)', ACC(LogisticAT()), params_OLR
    #yield 'PACC(OLR-AT)', PACC(LogisticAT()), params_OLR
    #yield 'HDy(OLR-AT)', HDy(mord.LogisticAT()), params_OLR
    #yield 'SLD(OLR-AT)', EMQ(LogisticAT()), params_OLR
    # other options include mord.LogisticIT(alpha=1.), mord.LogisticSE(alpha=1.)

    # regression-based ordinal regression (see https://pythonhosted.org/mord/)
    # I am using my implementation, which caters for predict_proba (linear distance to the two closest classes, 0 in the rest)
    # the other implementation has OrdinalRidge(alpha=1.0) and LAD(C=1.0) with my wrapper classes for having the nclasses_; those do
    # not implement predict_proba nor decision_score
    #yield 'CC(SVR)', CC(RegressorClassifier()), params_SVR
    #yield 'PCC(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'PCC-cal(SVR)', PCC(RegressorClassifier()), params_SVR
    # yield 'ACC(SVR)', ACC(RegressorClassifier()), params_SVR
    # yield 'PACC(SVR)', PACC(RegressorClassifier()), params_SVR
    #yield 'HDy(SVR)', HDy(RegressorClassifier()), params_SVR
    # yield 'SLD(SVR)', EMQ(RegressorClassifier()), params_SVR


def run_experiment(params):
    qname, q, param_grid, drift = params
    qname += posfix
    resultfile = join(resultpath, f'{qname}.{drift}.csv')
    if os.path.exists(resultfile):
        print(f'result file {resultfile} already exists: continue')
        return None

    print(f'fitting {qname} for {drift}-drift')


    def load_test_samples():
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.test.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'test_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()


    def load_dev_samples():
        ids = np.load(join(datapath, domain, protocol, f'{drift}drift.dev.id.npy'))
        ids = set(ids)
        folderpath = join(datapath, domain, protocol, 'dev_samples')
        for sample in tqdm(load_samples_folder(folderpath, filter=ids, load_fn=load_sample_fn), total=len(ids)):
            yield sample.instances, sample.prevalence()

    q = qp.model_selection.GridSearchQ(
        q,
        param_grid,
        sample_size=1000,
        protocol='gen',
        error=mnmd,
        val_split=load_dev_samples,
        n_jobs=-1,
        refit=False,
        verbose=True).fit(train)

    hyperparams = f'{qname}\t{drift}\t{q.best_params_}'

    print('[done]')

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'{qname}: {mean_nmd:.4f} +-{std_nmd:.4f}')
    report.to_csv(resultfile, index=False)

    print('[learning regressor-based adjustment]')
    q = RegressionQuantification(q.best_model(), val_samples_generator=load_dev_samples)
    q.fit(None)

    report = qp.evaluation.gen_prevalence_report(q, gen_fn=load_test_samples, error_metrics=[nmd])
    mean_nmd = report['nmd'].mean()
    std_nmd = report['nmd'].std()
    print(f'[{qname} regression-correction] {mean_nmd:.4f} +-{std_nmd:.4f}')
    resultfile = join(resultpath, f'{qname}.{drift}.reg.csv')
    report.to_csv(resultfile, index=False)

    return hyperparams


if __name__ == '__main__':
    #preprocessing = 'roberta.last'
    preprocessing = 'roberta.average'
    # preprocessing = 'roberta.posteriors'
    #preprocessing = 'tfidf'
    if preprocessing=='tfidf':
        domain = 'Books-tfidf'
        posfix = ''
    elif preprocessing=='roberta.last':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-last'
        posfix = '-RoBERTa-last'
    elif preprocessing=='roberta.average':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
        posfix = '-RoBERTa-average'
    elif preprocessing=='roberta.posteriors':
        domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
        posfix = '-RoBERTa-posteriors'
    load_sample_fn = load_single_sample_pkl
    datapath = './data'
    protocol = 'app'
    resultpath = join('./results', domain, protocol)
    os.makedirs(resultpath, exist_ok=True)

    train = load_sample_fn(join(datapath, domain), 'training_data')

    with open(join(resultpath, 'hyper.txt'), 'at') as foo:
        #for drift in [f'smooth{i}' for i in range(5)] + ['all']:
        params = [(*qs, drift) for qs in quantifiers() for drift in ['low', 'mid', 'high', 'all']]
        hypers = qp.util.parallel(run_experiment, params, n_jobs=-2)
        for h in hypers:
            if h is not None:
                foo.write(h)
                foo.write('\n')
@ -0,0 +1,195 @@
import mord
import numpy as np
from scipy.sparse import issparse
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR
from sklearn.utils.class_weight import compute_class_weight
from statsmodels.miscmodels.ordinal_model import OrderedModel


class OrderedLogisticRegression:
    def __init__(self, model='logit'):
        assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit'
        self.model = model

    def fit(self, X, y):
        if issparse(X):
            self.svd = TruncatedSVD(500)
            X = self.svd.fit_transform(X)
        self.learner = OrderedModel(y, X, distr=self.model)
        self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True)

    def predict(self, X):
        prob = self.predict_proba(X)
        return np.argmax(prob, axis=1)

    def predict_proba(self, X):
        if issparse(X):
            assert hasattr(self, 'svd'), \
                'X matrix in predict is sparse, but the method has not been fit with sparse type'
            X = self.svd.transform(X)
        return self.res_prob.model.predict(self.res_prob.params, exog=X)


class LAD(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, class_weight=None):
        self.C = C
        self.class_weight = class_weight

    def fit(self, X, y, sample_weight=None):
        self.regressor = LinearSVR(C=self.C)
        # self.regressor = SVR()
        # self.regressor = Ridge(normalize=True)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        r = self.regressor.predict(X)
        c = np.round(r)
        c[c<0]=0
        c[c>(self.nclasses-1)]=self.nclasses-1
        return c.astype(int)

    # def predict_proba(self, X):
    #     r = self.regressor.predict(X)
    #     nC = len(self.classes_)
    #     r = np.clip(r, 0, nC - 1)
    #     dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
    #     invdist = 1 - dists
    #     invdist[invdist < 0] = 0
    #     return invdist

    def decision_function(self, X):
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'C': self.C, 'class_weight': self.class_weight}

    def set_params(self, **params):
        self.C = params['C']
        self.class_weight = params['class_weight']


class OrdinalRidge(BaseEstimator, ClassifierMixin):
    def __init__(self, alpha=1.0, class_weight=None, normalize=False):
        self.alpha = alpha
        self.class_weight = class_weight
        self.normalize = normalize

    def fit(self, X, y, sample_weight=None):
        self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        r = self.regressor.predict(X)
        c = np.round(r)
        c[c<0]=0
        c[c>(self.nclasses-1)]=self.nclasses-1
        return c.astype(int)

    # def predict_proba(self, X):
    #     r = self.regressor.predict(X)
    #     nC = len(self.classes_)
    #     r = np.clip(r, 0, nC - 1)
    #     dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
    #     invdist = 1 - dists
    #     invdist[invdist < 0] = 0
    #     return invdist

    def decision_function(self, X):
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'alpha': self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}

    def set_params(self, **params):
        self.alpha = params['alpha']
        self.class_weight = params['class_weight']
        self.normalize = params['normalize']


# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
class LogisticAT(mord.LogisticAT):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticAT, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight)


class LogisticSE(mord.LogisticSE):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticSE, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticSE, self).fit(X, y, sample_weight=sample_weight)


class LogisticIT(mord.LogisticIT):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticIT, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticIT, self).fit(X, y, sample_weight=sample_weight)


# regression-based ordinal regression (see https://pythonhosted.org/mord/)
# class LAD(mord.LAD):
#     def fit(self, X, y):
#         self.classes_ = sorted(np.unique(y))
#         return super().fit(X, y)


# class OrdinalRidge(mord.OrdinalRidge):
#     def fit(self, X, y):
#         self.classes_ = sorted(np.unique(y))
#         return super().fit(X, y)
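Editor's note: the LAD and OrdinalRidge wrappers above turn a real-valued regressor into an ordinal classifier by rounding its output and clipping it to the class range, and expose an inverse-distance decision_function. A minimal self-contained illustration of that decision rule on toy predictions (independent of the classes above, purely illustrative numbers):

# Toy illustration of the round-and-clip decision rule used by LAD/OrdinalRidge above.
import numpy as np

nclasses = 5
r = np.array([-0.4, 1.2, 3.7, 6.1])          # raw regressor outputs
c = np.clip(np.round(r), 0, nclasses - 1).astype(int)
print(c)                                      # [0 1 4 4]

# inverse-distance scores, one column per class (higher = closer), as in decision_function
dists = np.abs(np.tile(np.arange(nclasses), (len(r), 1)) - r.reshape(-1, 1))
scores = 1 - dists
print(scores.argmax(axis=1))                  # agrees with the rounded predictions: [0 1 4 4]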
@ -0,0 +1,296 @@
from copy import deepcopy
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR
from statsmodels.miscmodels.ordinal_model import OrderedModel
import mord
from sklearn.utils.class_weight import compute_class_weight


class OrderedLogisticRegression:
    def __init__(self, model='logit'):
        assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit'
        self.model = model

    def fit(self, X, y):
        if issparse(X):
            self.svd = TruncatedSVD(500)
            X = self.svd.fit_transform(X)
        self.learner = OrderedModel(y, X, distr=self.model)
        self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True)

    def predict(self, X):
        prob = self.predict_proba(X)
        return np.argmax(prob, axis=1)

    def predict_proba(self, X):
        if issparse(X):
            assert hasattr(self, 'svd'), \
                'X matrix in predict is sparse, but the method has not been fit with sparse type'
            X = self.svd.transform(X)
        return self.res_prob.model.predict(self.res_prob.params, exog=X)


class StackedClassifier:  # aka Funnelling Monolingual
    def __init__(self, base_estimator=LogisticRegression()):
        if not hasattr(base_estimator, 'predict_proba'):
            print('the estimator does not seem to be probabilistic: calibrating')
            base_estimator = CalibratedClassifierCV(base_estimator)
        # self.base = deepcopy(OneVsRestClassifier(base_estimator))
        # self.meta = deepcopy(OneVsRestClassifier(base_estimator))
        self.base = deepcopy(base_estimator)
        self.meta = deepcopy(base_estimator)
        self.norm = StandardScaler()

    def fit(self, X, y):
        self.base.fit(X, y)
        P = self.base.predict_proba(X)
        P = self.norm.fit_transform(P)
        self.meta.fit(P, y)
        return self

    def predict(self, X):
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict(P)

    def predict_proba(self, X):
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict_proba(P)


class RegressionQuantification:
    def __init__(self,
                 base_quantifier,
                 regression='svr',
                 val_samples_generator=None,
                 norm=True):

        self.base_quantifier = base_quantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            self.reg = regression
        # self.reg = MultiTaskLassoCV(normalize=norm)
        # self.reg = KernelRidge(kernel='rbf')
        # self.reg = LassoLarsCV(normalize=norm)
        # self.reg = MultiTaskElasticNetCV(normalize=norm)  # <- good
        # self.reg = LinearRegression(normalize=norm)  # <- good
        # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm))  # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False))  # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(SGDRegressor())  # slow, does not work
        self.regression = regression
        self.val_samples_generator = val_samples_generator
        # self.norm = StandardScaler()
        # self.covs = covs

    def generate_validation_samples(self):
        Xs, ys = [], []
        for instances, prevalence in self.val_samples_generator():
            ys.append(prevalence)
            Xs.append(self.base_quantifier.quantify(instances))
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)
        return Xs, ys

    def fit(self, data):
        print('fitting quantifier')
        if data is not None:
            self.base_quantifier.fit(data)
        print('generating val samples')
        Xs, ys = self.generate_validation_samples()
        # Xs = self.norm.fit_transform(Xs)
        print('fitting regressor')
        self.reg.fit(Xs, ys)
        print('[done]')
        return self

    def quantify(self, instances):
        Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
        # Xs = self.norm.transform(Xs)
        Xs = self.reg.predict(Xs).flatten()
        # Xs = self.norm.inverse_transform(Xs)
        Xs = np.clip(Xs, 0, 1)
        adjusted = Xs / Xs.sum()
        # adjusted = np.clip(Xs, 0, 1)
        return adjusted

    def get_params(self, deep=True):
        return self.base_quantifier.get_params()

    def set_params(self, **params):
        self.base_quantifier.set_params(**params)


class LAD(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, class_weight=None):
        self.C = C
        self.class_weight = class_weight

    def fit(self, X, y, sample_weight=None):
        self.regressor = LinearSVR(C=self.C)
        # self.regressor = SVR()
        # self.regressor = Ridge(normalize=True)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        r = self.regressor.predict(X)
        c = np.round(r)
        c[c<0]=0
        c[c>(self.nclasses-1)]=self.nclasses-1
        return c.astype(int)

    # def predict_proba(self, X):
    #     r = self.regressor.predict(X)
    #     nC = len(self.classes_)
    #     r = np.clip(r, 0, nC - 1)
    #     dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
    #     invdist = 1 - dists
    #     invdist[invdist < 0] = 0
    #     return invdist

    def decision_function(self, X):
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'C': self.C, 'class_weight': self.class_weight}

    def set_params(self, **params):
        self.C = params['C']
        self.class_weight = params['class_weight']


class OrdinalRidge(BaseEstimator, ClassifierMixin):
    def __init__(self, alpha=1.0, class_weight=None, normalize=False):
        self.alpha = alpha
        self.class_weight = class_weight
        self.normalize = normalize

    def fit(self, X, y, sample_weight=None):
        self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        r = self.regressor.predict(X)
        c = np.round(r)
        c[c<0]=0
        c[c>(self.nclasses-1)]=self.nclasses-1
        return c.astype(int)

    # def predict_proba(self, X):
    #     r = self.regressor.predict(X)
    #     nC = len(self.classes_)
    #     r = np.clip(r, 0, nC - 1)
    #     dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
    #     invdist = 1 - dists
    #     invdist[invdist < 0] = 0
    #     return invdist

    def decision_function(self, X):
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'alpha': self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}

    def set_params(self, **params):
        self.alpha = params['alpha']
        self.class_weight = params['class_weight']
        self.normalize = params['normalize']


# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
class LogisticAT(mord.LogisticAT):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticAT, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight)


class LogisticSE(mord.LogisticSE):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticSE, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticSE, self).fit(X, y, sample_weight=sample_weight)


class LogisticIT(mord.LogisticIT):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticIT, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticIT, self).fit(X, y, sample_weight=sample_weight)


# regression-based ordinal regression (see https://pythonhosted.org/mord/)
# class LAD(mord.LAD):
#     def fit(self, X, y):
#         self.classes_ = sorted(np.unique(y))
#         return super().fit(X, y)


# class OrdinalRidge(mord.OrdinalRidge):
#     def fit(self, X, y):
#         self.classes_ = sorted(np.unique(y))
#         return super().fit(X, y)
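Editor's note: the following is a self-contained sketch of the correction idea implemented by RegressionQuantification above (a regressor maps possibly biased prevalence estimates to the true prevalences observed on validation samples, and new estimates are corrected, clipped, and re-normalized). It does not use the class itself; the synthetic data and the choice of LinearSVR mirror the default 'svr' setting and are illustrative only.

# Self-contained illustration of the regressor-based prevalence correction.
import numpy as np
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR

rng = np.random.default_rng(0)
true_prevs = rng.dirichlet(np.ones(5), size=200)                                # validation ground truth
estimated = np.clip(true_prevs + rng.normal(0, 0.05, true_prevs.shape), 0, 1)   # noisy base-quantifier outputs
estimated /= estimated.sum(axis=1, keepdims=True)

reg = MultiOutputRegressor(LinearSVR()).fit(estimated, true_prevs)

new_estimate = estimated[:1]                                  # one new (estimated) prevalence vector
corrected = np.clip(reg.predict(new_estimate).flatten(), 0, 1)
corrected /= corrected.sum()                                  # re-normalize, as in quantify() above
print(corrected)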
@ -0,0 +1,51 @@
import numpy as np
import quapy as qp
from evaluation import nmd
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
from quapy.data import LabelledCollection
import pickle
import os
from os.path import join
from tqdm import tqdm


"""
This script generates a partition of a dataset in terms of "shift".
The partition is only carried out by generating index vectors.
"""


def partition_by_drift(split, training_prevalence):
    assert split in ['dev', 'test'], 'invalid split name'
    total = 1000 if split=='dev' else 5000
    drifts = []
    folderpath = join(datapath, domain, 'app', f'{split}_samples')
    for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
        drifts.append(nmd(training_prevalence, sample.prevalence()))
    drifts = np.asarray(drifts)
    order = np.argsort(drifts)
    nD = len(order)
    low_drift, mid_drift, high_drift = order[:nD // 3], order[nD // 3:2 * nD // 3], order[2 * nD // 3:]
    all_drift = np.arange(nD)
    np.save(join(datapath, domain, 'app', f'lowdrift.{split}.id.npy'), low_drift)
    np.save(join(datapath, domain, 'app', f'middrift.{split}.id.npy'), mid_drift)
    np.save(join(datapath, domain, 'app', f'highdrift.{split}.id.npy'), high_drift)
    np.save(join(datapath, domain, 'app', f'alldrift.{split}.id.npy'), all_drift)
    lows = drifts[low_drift]
    mids = drifts[mid_drift]
    highs = drifts[high_drift]
    all = drifts[all_drift]
    print(f'low drift: interval [{lows.min():.4f}, {lows.max():.4f}] mean: {lows.mean():.4f}')
    print(f'mid drift: interval [{mids.min():.4f}, {mids.max():.4f}] mean: {mids.mean():.4f}')
    print(f'high drift: interval [{highs.min():.4f}, {highs.max():.4f}] mean: {highs.mean():.4f}')
    print(f'all drift: interval [{all.min():.4f}, {all.max():.4f}] mean: {all.mean():.4f}')


domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-posteriors'
datapath = './data'

training = pickle.load(open(join(datapath, domain, 'training_data.pkl'), 'rb'))

partition_by_drift('dev', training.prevalence())
partition_by_drift('test', training.prevalence())
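Editor's note: the nmd function imported from the local evaluation module is not shown in this diff. As a point of reference only, normalized match distance between ordinal prevalence vectors is commonly computed as the L1 distance between their cumulative distributions, scaled by 1/(n-1); the sketch below assumes that definition and may differ from the repository's own implementation.

# Hedged sketch of a normalized-match-distance computation between two prevalence vectors;
# assumes NMD = (1/(n-1)) * sum_i |CDF_p(i) - CDF_q(i)|, which may not match evaluation.nmd exactly.
import numpy as np

def nmd_sketch(p, q):
    p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
    n = len(p)
    return np.abs(np.cumsum(p) - np.cumsum(q)).sum() / (n - 1)

train_prev  = np.array([0.1, 0.2, 0.4, 0.2, 0.1])
sample_prev = np.array([0.3, 0.3, 0.2, 0.1, 0.1])
print(f'{nmd_sketch(train_prev, sample_prev):.4f}')  # larger values indicate stronger prior-probability shift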
@ -0,0 +1,41 @@
import numpy as np
from Ordinal.evaluation import smoothness
from Ordinal.utils import load_samples_folder, load_single_sample_pkl
from os.path import join
from tqdm import tqdm


"""
This script generates a partition of a dataset in terms of "smoothness".
The partition is only carried out by generating index vectors.
"""


def partition_by_smoothness(split):
    assert split in ['dev', 'test'], 'invalid split name'
    total = 1000 if split=='dev' else 5000
    smooths = []
    folderpath = join(datapath, domain, 'app', f'{split}_samples')
    for sample in tqdm(load_samples_folder(folderpath, load_fn=load_single_sample_pkl), total=total):
        smooths.append(smoothness(sample.prevalence()))
    smooths = np.asarray(smooths)
    order = np.argsort(smooths)
    nD = len(order)
    low2high_smooth = np.array_split(order, 5)
    all_drift = np.arange(nD)
    for i, smooth_idx in enumerate(low2high_smooth):
        block = smooths[smooth_idx]
        print(f'smooth block {i}: shape={smooth_idx.shape}, interval=[{block.min()}, {block.max()}] mean={block.mean()}')
        np.save(join(datapath, domain, 'app', f'smooth{i}.{split}.id.npy'), smooth_idx)
    np.save(join(datapath, domain, 'app', f'all.{split}.id.npy'), all_drift)


#domain = 'Books-tfidf'
domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
datapath = './data'

#training = pickle.load(open(join(datapath,domain,'training_data.pkl'), 'rb'))

partition_by_smoothness('dev')
partition_by_smoothness('test')
@ -0,0 +1,51 @@
import quapy as qp
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
import numpy as np
from utils import *
from tqdm import tqdm
import shutil

"""
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into dense vectors
extracted from a pretrained model (here we use the RoBERTa fine-tuned on the training set).
Three vector generation modes are available: posteriors, last, average
"""

vector_generation = 'posteriors'

datapath = './data'
domain = f'Books-roberta-base-finetuned/checkpoint-1188-{vector_generation}'
outname = domain.replace('-finetuned', '-finetuned-pkl')

protocol = 'app'

print('pickling npy txt files')
print('from:', join(datapath, domain))
print('to', join(datapath, outname))
print('for protocol:', protocol)

os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, protocol), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, protocol, 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, protocol, 'dev_prevalences.txt'), join(datapath, outname, protocol, 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, protocol, 'test_prevalences.txt'), join(datapath, outname, protocol, 'test_prevalences.txt'))

train = load_simple_sample_npytxt(join(datapath, domain), 'training_data', classes=np.arange(5))
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


def transform_folder_samples(protocol, splitname):
    folder_dir = join(datapath, domain, protocol, splitname)
    for i, sample in tqdm(enumerate(load_samples_folder(folder_dir, filter=None, load_fn=load_simple_sample_npytxt, classes=train.classes_))):
        pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


transform_folder_samples(protocol, 'dev_samples')
transform_folder_samples(protocol, 'test_samples')
@ -0,0 +1,62 @@
import quapy as qp
from Ordinal.utils import load_simple_sample_raw, load_samples_raw
from quapy.data import LabelledCollection
from sklearn.feature_extraction.text import TfidfVectorizer
from os.path import join
import os
import pickle
from tqdm import tqdm
import shutil


"""
This script generates a preprocessing of the raw Amazon-OQ-BK dataset and converts it into tfidf vectors.
"""


datapath = './data'
domain = 'Books'
outname = domain + '-tfidf'


def save_preprocessing_info(transformer):
    with open(join(datapath, outname, 'prep-info.txt'), 'wt') as foo:
        foo.write(f'{str(transformer)}\n')


os.makedirs(join(datapath, outname), exist_ok=True)
os.makedirs(join(datapath, outname, 'app'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'app', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'app', 'dev_prevalences.txt'), join(datapath, outname, 'app', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'app', 'test_prevalences.txt'), join(datapath, outname, 'app', 'test_prevalences.txt'))
os.makedirs(join(datapath, outname, 'real'), exist_ok=True)
os.makedirs(join(datapath, outname, 'real', 'dev_samples'), exist_ok=True)
os.makedirs(join(datapath, outname, 'real', 'test_samples'), exist_ok=True)
shutil.copyfile(join(datapath, domain, 'real', 'dev_prevalences.txt'), join(datapath, outname, 'real', 'dev_prevalences.txt'))
shutil.copyfile(join(datapath, domain, 'real', 'test_prevalences.txt'), join(datapath, outname, 'real', 'test_prevalences.txt'))


tfidf = TfidfVectorizer(sublinear_tf=True, ngram_range=(1,2), min_df=5)

train = LabelledCollection.load(join(datapath, domain, 'training_data.txt'), loader_func=qp.data.reader.from_text)
train.instances = tfidf.fit_transform(train.instances)
save_preprocessing_info(tfidf)
pickle.dump(train, open(join(datapath, outname, 'training_data.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


def transform_folder_samples(protocol, splitname):
    for i, sample in tqdm(enumerate(load_samples_raw(join(datapath, domain, protocol, splitname), classes=train.classes_))):
        sample.instances = tfidf.transform(sample.instances)
        pickle.dump(sample, open(join(datapath, outname, protocol, splitname, f'{i}.pkl'), 'wb'), pickle.HIGHEST_PROTOCOL)


transform_folder_samples('app', 'dev_samples')
transform_folder_samples('app', 'test_samples')
transform_folder_samples('real', 'dev_samples')
transform_folder_samples('real', 'test_samples')
@ -0,0 +1,374 @@
|
||||||
|
import numpy as np
|
||||||
|
import itertools
|
||||||
|
from scipy.stats import ttest_ind_from_stats, wilcoxon
|
||||||
|
|
||||||
|
|
||||||
|
class Table:
|
||||||
|
VALID_TESTS = [None, "wilcoxon", "ttest"]
|
||||||
|
|
||||||
|
def __init__(self, benchmarks, methods, lower_is_better=True, significance_test='wilcoxon', prec_mean=3,
|
||||||
|
clean_zero=False, show_std=False, prec_std=3, average=True, missing=None, missing_str='--',
|
||||||
|
color=True, show_rel_to=-1):
|
||||||
|
assert significance_test in self.VALID_TESTS, f'unknown test, valid are {self.VALID_TESTS}'
|
||||||
|
|
||||||
|
self.benchmarks = np.asarray(benchmarks)
|
||||||
|
self.benchmark_index = {row: i for i, row in enumerate(benchmarks)}
|
||||||
|
|
||||||
|
self.methods = np.asarray(methods)
|
||||||
|
self.method_index = {col: j for j, col in enumerate(methods)}
|
||||||
|
|
||||||
|
self.map = {}
|
||||||
|
# keyed (#rows,#cols)-ndarrays holding computations from self.map['values']
|
||||||
|
self._addmap('values', dtype=object)
|
||||||
|
self.lower_is_better = lower_is_better
|
||||||
|
self.ttest = significance_test
|
||||||
|
self.prec_mean = prec_mean
|
||||||
|
self.clean_zero = clean_zero
|
||||||
|
self.show_std = show_std
|
||||||
|
self.prec_std = prec_std
|
||||||
|
self.add_average = average
|
||||||
|
self.missing = missing
|
||||||
|
self.missing_str = missing_str
|
||||||
|
self.color = color
|
||||||
|
self.show_rel_to = show_rel_to
|
||||||
|
|
||||||
|
self.touch()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def nbenchmarks(self):
|
||||||
|
return len(self.benchmarks)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def nmethods(self):
|
||||||
|
return len(self.methods)
|
||||||
|
|
||||||
|
def touch(self):
|
||||||
|
self._modif = True
|
||||||
|
|
||||||
|
def update(self):
|
||||||
|
if self._modif:
|
||||||
|
self.compute()
|
||||||
|
|
||||||
|
def _getfilled(self):
|
||||||
|
return np.argwhere(self.map['fill'])
|
||||||
|
|
||||||
|
@property
|
||||||
|
def values(self):
|
||||||
|
return self.map['values']
|
||||||
|
|
||||||
|
def _indexes(self):
|
||||||
|
return itertools.product(range(self.nbenchmarks), range(self.nmethods))
|
||||||
|
|
||||||
|
def _addmap(self, map, dtype, func=None):
|
||||||
|
self.map[map] = np.empty((self.nbenchmarks, self.nmethods), dtype=dtype)
|
||||||
|
if func is None:
|
||||||
|
return
|
||||||
|
m = self.map[map]
|
||||||
|
f = func
|
||||||
|
indexes = self._indexes() if map == 'fill' else self._getfilled()
|
||||||
|
for i, j in indexes:
|
||||||
|
m[i, j] = f(self.values[i, j])
|
||||||
|
|
||||||
|
def _addrank(self):
|
||||||
|
for i in range(self.nbenchmarks):
|
||||||
|
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
|
||||||
|
col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
|
||||||
|
ranked_cols_idx = filled_cols_idx[np.argsort(col_means)]
|
||||||
|
if not self.lower_is_better:
|
||||||
|
ranked_cols_idx = ranked_cols_idx[::-1]
|
||||||
|
self.map['rank'][i, ranked_cols_idx] = np.arange(1, len(filled_cols_idx) + 1)
|
||||||
|
|
||||||
|
def _addcolor(self):
|
||||||
|
for i in range(self.nbenchmarks):
|
||||||
|
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
|
||||||
|
if filled_cols_idx.size == 0:
|
||||||
|
continue
|
||||||
|
col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
|
||||||
|
minval = min(col_means)
|
||||||
|
maxval = max(col_means)
|
||||||
|
for col_idx in filled_cols_idx:
|
||||||
|
val = self.map['mean'][i, col_idx]
|
||||||
|
norm = (maxval - minval)
|
||||||
|
if norm > 0:
|
||||||
|
normval = (val - minval) / norm
|
||||||
|
else:
|
||||||
|
normval = 0.5
|
||||||
|
if self.lower_is_better:
|
||||||
|
normval = 1 - normval
|
||||||
|
self.map['color'][i, col_idx] = color_red2green_01(normval)
|
||||||
|
|
||||||
|
def _run_ttest(self, row, col1, col2):
|
||||||
|
mean1 = self.map['mean'][row, col1]
|
||||||
|
std1 = self.map['std'][row, col1]
|
||||||
|
nobs1 = self.map['nobs'][row, col1]
|
||||||
|
mean2 = self.map['mean'][row, col2]
|
||||||
|
std2 = self.map['std'][row, col2]
|
||||||
|
nobs2 = self.map['nobs'][row, col2]
|
||||||
|
_, p_val = ttest_ind_from_stats(mean1, std1, nobs1, mean2, std2, nobs2)
|
||||||
|
return p_val
|
||||||
|
|
||||||
|
def _run_wilcoxon(self, row, col1, col2):
|
||||||
|
values1 = self.map['values'][row, col1]
|
||||||
|
values2 = self.map['values'][row, col2]
|
||||||
|
_, p_val = wilcoxon(values1, values2)
|
||||||
|
return p_val
|
||||||
|
|
||||||
|
def _add_statistical_test(self):
|
||||||
|
if self.ttest is None:
|
||||||
|
return
|
||||||
|
self.some_similar = [False] * self.nmethods
|
||||||
|
for i in range(self.nbenchmarks):
|
||||||
|
filled_cols_idx = np.argwhere(self.map['fill'][i]).flatten()
|
||||||
|
if len(filled_cols_idx) <= 1:
|
||||||
|
continue
|
||||||
|
col_means = [self.map['mean'][i, j] for j in filled_cols_idx]
|
||||||
|
best_pos = filled_cols_idx[np.argmin(col_means)]
|
||||||
|
|
||||||
|
for j in filled_cols_idx:
|
||||||
|
if j == best_pos:
|
||||||
|
continue
|
||||||
|
if self.ttest == 'ttest':
|
||||||
|
p_val = self._run_ttest(i, best_pos, j)
|
||||||
|
else:
|
||||||
|
p_val = self._run_wilcoxon(i, best_pos, j)
|
||||||
|
|
||||||
|
pval_outcome = pval_interpretation(p_val)
|
||||||
|
self.map['ttest'][i, j] = pval_outcome
|
||||||
|
if pval_outcome != 'Diff':
|
||||||
|
self.some_similar[j] = True
|
||||||
|
|
||||||
|
def compute(self):
|
||||||
|
self._addmap('fill', dtype=bool, func=lambda x: x is not None)
|
||||||
|
self._addmap('mean', dtype=float, func=np.mean)
|
||||||
|
self._addmap('std', dtype=float, func=np.std)
|
||||||
|
self._addmap('nobs', dtype=float, func=len)
|
||||||
|
self._addmap('rank', dtype=int, func=None)
|
||||||
|
self._addmap('color', dtype=object, func=None)
|
||||||
|
self._addmap('ttest', dtype=object, func=None)
|
||||||
|
self._addmap('latex', dtype=object, func=None)
|
||||||
|
self._addrank()
|
||||||
|
self._addcolor()
|
||||||
|
self._add_statistical_test()
|
||||||
|
if self.add_average:
|
||||||
|
self._addave()
|
||||||
|
self._modif = False
|
||||||
|
|
||||||
|
def _is_column_full(self, col):
|
||||||
|
return all(self.map['fill'][:, self.method_index[col]])
|
||||||
|
|
||||||
|
def _addave(self):
|
||||||
|
ave = Table(['ave'], self.methods, lower_is_better=self.lower_is_better, significance_test=self.ttest, average=False,
|
||||||
|
missing=self.missing, missing_str=self.missing_str, prec_mean=self.prec_mean, prec_std=self.prec_std,
|
||||||
|
show_std=self.show_std)
|
||||||
|
for col in self.methods:
|
||||||
|
values = None
|
||||||
|
if self._is_column_full(col):
|
||||||
|
if self.ttest == 'ttest':
|
||||||
|
values = np.asarray(self.map['mean'][:, self.method_index[col]])
|
||||||
|
else: # wilcoxon
|
||||||
|
values = np.concatenate(self.values[:, self.method_index[col]])
|
||||||
|
ave.add('ave', col, values)
|
||||||
|
self.average = ave
|
||||||
|
|
||||||
|
def add(self, benchmark, method, values):
|
||||||
|
if values is not None:
|
||||||
|
values = np.asarray(values)
|
||||||
|
if values.ndim == 0:
|
||||||
|
values = values.flatten()
|
||||||
|
rid, cid = self._coordinates(benchmark, method)
|
||||||
|
if self.map['values'][rid, cid] is None:
|
||||||
|
self.map['values'][rid, cid] = values
|
||||||
|
elif values is not None:
|
||||||
|
self.map['values'][rid, cid] = np.concatenate([self.map['values'][rid, cid], values])
|
||||||
|
self.touch()
|
||||||
|
|
||||||
|
def get(self, benchmark, method, attr='mean'):
|
||||||
|
self.update()
|
||||||
|
assert attr in self.map, f'unknwon attribute {attr}'
|
||||||
|
rid, cid = self._coordinates(benchmark, method)
|
||||||
|
if self.map['fill'][rid, cid]:
|
||||||
|
v = self.map[attr][rid, cid]
|
||||||
|
if v is None or (isinstance(v, float) and np.isnan(v)):
|
||||||
|
return self.missing
|
||||||
|
return v
|
||||||
|
else:
|
||||||
|
return self.missing
|
||||||
|
|
||||||
|
    def _coordinates(self, benchmark, method):
        assert benchmark in self.benchmark_index, f'benchmark {benchmark} out of range'
        assert method in self.method_index, f'method {method} out of range'
        rid = self.benchmark_index[benchmark]
        cid = self.method_index[method]
        return rid, cid

    def get_average(self, method, attr='mean'):
        self.update()
        if self.add_average:
            return self.average.get('ave', method, attr=attr)
        return None

    def get_color(self, benchmark, method):
        color = self.get(benchmark, method, attr='color')
        if color is None:
            return ''
        return color

    def latexCell(self, benchmark, method):
        self.update()
        i, j = self._coordinates(benchmark, method)
        if self.map['fill'][i, j] == False:
            return self.missing_str

        mean = self.map['mean'][i, j]
        l = f" {mean:.{self.prec_mean}f}"
        if self.clean_zero:
            l = l.replace(' 0.', '.')

        isbest = self.map['rank'][i, j] == 1
        if self.ttest is not None:  # and self.some_similar[j]:
            test_label = self.map['ttest'][i, j]
            if test_label in ['Sim', 'Same']:
                isbest = True

        if isbest:
            l = "\\textbf{" + l.strip() + "}\;"
        else:
            l += '\; '

        stat = ''
        # this is commented because we are putting in textbf all results that are similar to the best one
        # if self.ttest is not None:  # and self.some_similar[j]:
        #     test_label = self.map['ttest'][i, j]
        #     if test_label == 'Sim':
        #         stat = '^{\dag\phantom{\dag}}'
        #     elif test_label == 'Same':
        #         stat = '^{\ddag}'
        #     elif isbest or test_label == 'Diff':
        #         stat = '^{\phantom{\ddag}}'

        std = ''
        if self.show_std:
            std = self.map['std'][i, j]
            std = f" {std:.{self.prec_std}f}"
            if self.clean_zero:
                std = std.replace(' 0.', '.')
            std = f" \pm {std:{self.prec_std}}"

        relto = ''
        if self.show_rel_to != -1:
            if j != self.show_rel_to:
                ref_ave = self.map['mean'][i, self.show_rel_to]
                rel = 100*(mean-ref_ave)/ref_ave
                if abs(rel) < 0.1:
                    relto = f'(\\approx)'
                else:
                    plussign = '+' if rel > 0 else ''  # already plugs the '-' sign
                    relto = f'({plussign}{rel:.1f}\%)'
                std = ''

        if stat != '' or std != '' or relto != '':
            l = f'{l}${stat}{std}{relto}$'

        if self.color:
            l += ' ' + self.map['color'][i, j]

        return l

    def latexTabular(self, benchmark_replace={}, method_replace={}, average=True):
        tab = ' & '
        tab += ' & '.join([method_replace.get(col, col) for col in self.methods])
        tab += ' \\\\\hline\n'
        for row in self.benchmarks:
            rowname = benchmark_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRow(row)

        if average:
            tab += '\hline\n'
            tab += 'Average & '
            tab += self.latexAverage()
        return tab

    def latexTabularT(self, benchmark_replace={}, method_replace={}, average=True, side=False):
        def withside(label):
            return '\side{'+label+'}' if side else label

        def center(label):
            return '\multicolumn{1}{c}{'+label+'}'

        tab = ' & '
        tab += ' & '.join([center(withside(benchmark_replace.get(col, col))) for col in self.benchmarks])
        if average:
            tab += ' & ' + withside('Ave')
        # tab += ' \\\\\hline\n'
        tab += ' \\\\\midrule\n'
        for row in self.methods:
            rowname = method_replace.get(row, row)
            tab += rowname + ' & '
            tab += self.latexRowT(row, endl='')
            if average:
                tab += ' & '
                tab += self.average.latexCell('ave', row)
            # tab += '\\\\\hline\n'
            tab += '\\\\\n'
        tab += '\\bottomrule'
        return tab

    def latexRow(self, benchmark, endl='\\\\\hline\n'):
        s = [self.latexCell(benchmark, col) for col in self.methods]
        s = ' & '.join(s)
        s += ' ' + endl
        return s

    def latexRowT(self, method, endl='\\\\\hline\n'):
        s = [self.latexCell(benchmark, method) for benchmark in self.benchmarks]
        s = ' & '.join(s)
        s += ' ' + endl
        return s

    def latexAverage(self, endl='\\\\\hline\n'):
        if self.add_average:
            return self.average.latexRow('ave', endl=endl)

    def getRankTable(self):
        t = Table(benchmarks=self.benchmarks, methods=self.methods, prec_mean=0, average=True)
        for rid, cid in self._getfilled():
            row = self.benchmarks[rid]
            col = self.methods[cid]
            t.add(row, col, self.get(row, col, 'rank'))
        t.compute()
        return t

    def dropMethods(self, methods):
        drop_index = [self.method_index[m] for m in methods]
        new_methods = np.delete(self.methods, drop_index)
        new_index = {col: j for j, col in enumerate(new_methods)}

        self.map['values'] = self.values[:, np.asarray([self.method_index[m] for m in new_methods], dtype=int)]
        self.methods = new_methods
        self.method_index = new_index
        self.touch()

def pval_interpretation(p_val):
    if 0.005 >= p_val:
        return 'Diff'
    elif 0.05 >= p_val > 0.005:
        return 'Sim'
    elif p_val > 0.05:
        return 'Same'
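
# A quick reading of the three bands above (assumed two-sided p-values from the
# paired significance test): p <= 0.005 is reported as a significant difference,
# 0.005 < p <= 0.05 as similar, and p > 0.05 as statistically indistinguishable.
#   pval_interpretation(0.001)  ->  'Diff'
#   pval_interpretation(0.02)   ->  'Sim'
#   pval_interpretation(0.30)   ->  'Same'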


def color_red2green_01(val, maxtone=50):
    if np.isnan(val): return None
    assert 0 <= val <= 1, f'val {val} out of range [0,1]'

    # rescale to [-1,1]
    val = val * 2 - 1
    if val < 0:
        color = 'red'
        tone = maxtone * (-val)
    else:
        color = 'green'
        tone = maxtone * val
    return '\cellcolor{' + color + f'!{int(tone)}' + '}'
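

# --- usage sketch (added for illustration; not part of the original file) ---
# How the Table helper above appears to be used: the constructor keywords shown
# here are inferred from the internal calls (_addave, getRankTable) and are an
# assumption rather than the documented API.
if __name__ == '__main__':
    table = Table(benchmarks=['dataset1', 'dataset2'], methods=['CC', 'SLD'],
                  lower_is_better=True, significance_test='ttest')
    rng = np.random.default_rng(0)
    for dataset in table.benchmarks:
        for method in table.methods:
            # one array of error scores per (benchmark, method) cell
            table.add(dataset, method, rng.random(10))
    print(table.latexTabular())                # rows=benchmarks, columns=methods
    print(table.get('dataset1', 'CC'))         # mean of the cell (default attr)
    print(table.get('dataset1', 'CC', 'std'))
    print(color_red2green_01(0.75))            # -> '\cellcolor{green!25}'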

@ -0,0 +1,78 @@
import gzip
import os
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np
import pandas as pd


nrows = 3
ncols = 4

prevalences = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:]
# prevalences = prevalences[:nrows*ncols]
print(prevalences)

n = prevalences.shape[1]

class_smooth = []
for i, sample in enumerate(prevalences):
    p = sample
    smooth = jaggedness(p)
    class_smooth.append([smooth, f'Sample {i+1}', p])

# these lines pick the nrows*ncols example samples, going from the least jagged
# to the most jagged in (roughly) equal steps
class_smooth = sorted(class_smooth)
class_smooth = class_smooth[::len(class_smooth)//(nrows*ncols)]
class_smooth = class_smooth[:nrows*ncols]
# print(class_smooth)
# print(len(class_smooth))

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.5)

maxy = np.max(prevalences) + 0.1
class_labels = np.arange(1, n+1)

figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
for i, (smooth, category, prevalence) in enumerate(class_smooth):
    row = i // ncols
    col = i % ncols
    # print(i, row, col)
    # axis[row, col].bar(list(range(1,n+1)), prevalence, width=1)

    axis[row, col].bar(class_labels, prevalence, width=1)
    axis[row, col].set_ylim(0, maxy)
    axis[row, col].set_facecolor('white')
    for spine in axis[row, col].spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(0.3)

    if row == nrows-1:
        axis[row, col].set_xlabel("energy bin")
        axis[row, col].set_xticks(class_labels)
    else:
        axis[row, col].set_xlabel("")
        axis[row, col].set_xticks([])
    axis[row, col].set_ylabel("")
    axis[row, col].set_yticks([])

    category = category.replace('_', ' ').title()
    category = category.replace(' And ', ' & ')
    axis[row, col].set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)
    # axis[row, col].set_title

    print(smooth, category, prevalence)

# plt.show()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Telescope_sample_plotgrid.pdf', bbox_inches='tight')

@ -0,0 +1,13 @@
import pickle

target_file = './counters_Amazon_merchandise.pkl'
(categories, counters) = pickle.load(open(target_file, 'rb'))

print(categories)
print(counters)

with open('categories.txt', 'wt') as foo:
    for counter, category in zip(counters, categories):
        foo.write(f'{category}\t{counter["1"]}\t{counter["2"]}\t{counter["3"]}\t{counter["4"]}\t{counter["5"]}\n')

@ -0,0 +1,67 @@
import numpy as np
from glob import glob
from json import load
import os
from os.path import join
import pickle
import pandas as pd
import csv
import datasets
from datasets import Dataset
import quapy as qp
from quapy.data import LabelledCollection


def jaggedness(p):
    return (1/min(6, len(p)+1)) * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))
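
# Illustration (added for clarity, not part of the original file): the score
# sums the squared discrete second differences of the prevalence vector and
# rescales them by 1/min(6, len(p)+1), so a perfectly flat distribution scores
# 0 while spikier distributions score higher, e.g.
#   jaggedness(np.array([0.2, 0.2, 0.2, 0.2, 0.2]))  ->  0.0
#   jaggedness(np.array([0.3, 0.0, 0.4, 0.0, 0.3]))  ->  (0.49 + 0.64 + 0.49) / 6 ≈ 0.27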


def load_simple_sample_npytxt(parentdir, filename, classes=None):
    samplepath = join(parentdir, filename+'.txt')
    yX = np.loadtxt(samplepath)
    X = yX[:, 1:]
    y = yX[:, 0].astype(np.int32)
    return LabelledCollection(instances=X, labels=y, classes_=classes)


def load_simple_sample_raw(parentdir, filename, classes=None):
    samplepath = join(parentdir, filename+'.txt')
    return LabelledCollection.load(samplepath, loader_func=qp.data.reader.from_text, classes=classes)


def load_single_sample_as_csv(parentdir, filename):
    samplepath = join(parentdir, filename+'.txt')
    df = pd.read_csv(samplepath, sep='\t', names=['labels', 'review'], quoting=csv.QUOTE_NONE)
    labels = df.pop('labels').to_frame()

    features = datasets.Features({'review': datasets.Value('string')})
    sample = Dataset.from_pandas(df=df, features=features)

    return sample, labels


def load_single_sample_pkl(parentdir, filename):
    return pickle.load(open(join(parentdir, filename+'.pkl'), 'rb'))


# def load_samples_npytxt(path_dir, filter=None, classes=None):
#     return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_npytxt)


def load_samples_raw(path_dir, filter=None, classes=None):
    return load_samples_folder(path_dir, filter, load_fn=load_simple_sample_raw, classes=classes)


# def load_samples_as_csv(path_dir, filter=None):
#     return load_samples_folder(path_dir, filter, load_fn=load_single_sample_as_csv)


# def load_samples_pkl(path_dir, filter=None):
#     return load_samples_folder(path_dir, filter, load_fn=load_single_sample_pkl)


def load_samples_folder(path_dir, filter=None, load_fn=None, **load_fn_kwargs):
    nsamples = len(glob(join(path_dir, '*')))
    for id in range(nsamples):
        if (filter is None) or id in filter:
            yield load_fn(path_dir, f'{id}', **load_fn_kwargs)
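

# --- usage sketch (added for illustration; not part of the original file) ---
# The loaders above expect a folder of numbered text samples (0.txt, 1.txt, ...)
# and yield one LabelledCollection per sample. The directory below is a
# hypothetical placeholder, not a path referenced elsewhere in this repository.
if __name__ == '__main__':
    for sample in load_samples_raw('path/to/test_samples', classes=[1, 2, 3, 4, 5]):
        print(sample.prevalence())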

@ -1,9 +1,10 @@
 import numpy as np
 from scipy.sparse import dok_matrix
 from tqdm import tqdm
+from time import time


-def from_text(path, encoding='utf-8', verbose=1, class2int=True):
+def from_text(path, encoding='utf-8', verbose=0, class2int=True):
     """
     Reads a labelled colletion of documents.
     File fomart <0 or 1>\t<document>\n

@ -183,7 +183,7 @@ def _training_helper(learner,
         if not hasattr(learner, 'predict_proba'):
             print(f'The learner {learner.__class__.__name__} does not seem to be probabilistic. '
                   f'The learner will be calibrated.')
-            learner = CalibratedClassifierCV(learner, cv=5)
+            learner = CalibratedClassifierCV(learner, cv=5, ensemble=True)
     if val_split is not None:
         if isinstance(val_split, float):
             if not (0 < val_split < 1):

@ -470,7 +470,7 @@ class EMQ(AggregativeProbabilisticQuantifier):

     def fit(self, data: LabelledCollection, fit_learner=True):
         self.learner, _ = _training_helper(self.learner, data, fit_learner, ensure_probabilistic=True)
-        self.train_prevalence = F.prevalence_from_labels(data.labels, self.classes_)
+        self.train_prevalence = F.prevalence_from_labels(data.labels, data.classes_)
         return self

     def aggregate(self, classif_posteriors, epsilon=EPSILON):