
adding helper files, unorganized

Alejandro Moreo Fernandez 2024-03-12 09:52:58 +01:00
parent b756871f21
commit faa3af587c
11 changed files with 1096 additions and 0 deletions

@@ -0,0 +1,52 @@
import gzip
import os
import sys
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np
amazon = np.genfromtxt('prevalence_votes1_reviews100.csv', delimiter='\t')
telescope = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:]
nclasses_amazon = amazon.shape[1]
nclasses_telescope = telescope.shape[1]
jags_amazon = np.asarray([jaggedness(p) for p in amazon])
jags_telescope = np.asarray([jaggedness(p) for p in telescope])
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.7)
# figure, axis = plt.subplots(1, 2, figsize=(8, 7))
ymax = 0.75
figure(figsize=(8, 4), dpi=300)
ax=plt.subplot(1, 2, 1)
classes = np.arange(1, nclasses_amazon+1)
plt.bar(classes, np.mean(amazon, axis=0), yerr=np.std(amazon, axis=0), width=1)
ax.set_ylim(0, ymax)
ax.set_xlabel("stars")
ax.set_xticks(classes)
ax.set_title(f'Amazon Books ({jags_amazon.mean():.4f})')
ax=plt.subplot(1, 2, 2)
# ax=plt.subplot(1, 1, 1)
classes = np.arange(1, nclasses_telescope+1)
plt.bar(classes, np.mean(telescope, axis=0), yerr=np.std(telescope, axis=0), width=1)
ax.set_ylim(0, ymax)
ax.set_xlabel("energy bin")
ax.set_xticks(classes)
ax.set_title(f'FACT Samples ({jags_telescope.mean():.4f})')
plt.subplots_adjust(wspace=0.1, hspace=0)
plt.savefig('prevalence_averages.pdf', bbox_inches='tight')

@@ -0,0 +1,43 @@
import gzip
import os
import sys
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np
telescope = np.genfromtxt('fact_expectation.txt')
nclasses_telescope = len(telescope)
jag = jaggedness(telescope)
print(jag)
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.7)
# figure, axis = plt.subplots(1, 2, figsize=(8, 7))
ymax = 0.4
figure(figsize=(8, 4), dpi=300)
ax=plt.subplot(1, 1, 1)
classes = np.arange(1, nclasses_telescope+1)
plt.bar(classes, telescope, width=1)
# ax.bar_label(telescope)
ax.set_ylim(0, ymax)
ax.set_xlabel("energy bin")
ax.set_xticks(classes)
ax.set_title(f'FACT data ({jag:.4f})')
for index, data in enumerate(telescope):
    plt.text(x=index+0.56, y=data+0.005, s=f"{data:.4f}")
plt.subplots_adjust(wspace=0.1, hspace=0)
plt.savefig('telescope_prevalence.pdf', bbox_inches='tight')

Ordinal/__init__.py (new, empty file)
@@ -0,0 +1,136 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict
# this script computes the distribution of smoothness/sharpness of the book reviews,
# both considering all books at once and considering different groups of reviews (grouped by product)
# filein='/media/moreo/Volume/Datasets/Amazon/raw/Gift_Cards.json.gz'
# df = pd.read_json(filein, lines=True, compression='gzip')
read_meta = True
def prepare_vote_field(df):
    df['vote'] = df['vote'].fillna('0')
    df['vote'] = df['vote'].apply(lambda x: x.replace(',', ''))
    df['vote'] = pd.to_numeric(df['vote'])
    return df
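
# Quick illustrative check of prepare_vote_field (a minimal sketch; the demo frame below is made up):
# 'vote' values arrive as strings with thousands separators and missing entries, e.g. ['3', '1,024', NaN],
# and become the numbers [3, 1024, 0]
_demo_votes = pd.DataFrame({'vote': ['3', '1,024', None]})
assert prepare_vote_field(_demo_votes)['vote'].tolist() == [3, 1024, 0]
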
def read_from_huge_json(filein):
    df = pd.read_json(filein, lines=True)
    df.drop(columns=[
        'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime',
        'image'
    ], inplace=True)
    df = prepare_vote_field(df)
    return df


def read_from_metadata(filein):
    df = pd.read_csv(filein)
    df['vote'] = pd.to_numeric(df['vote'])
    return df


def filter_by_vote(df, vote_threshold=1):
    df = df[df['vote'] >= vote_threshold]
    df.drop(columns=['vote'], inplace=True)
    return df


if read_meta:
    filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv'
    readfn = read_from_metadata
else:
    filein = '/media/moreo/Volume/Datasets/Amazon/raw/Books.json'
    readfn = read_from_huge_json
votes_support=9
df = readfn(filein)
num_entries = len(df)
# df = prepare_vote_field(df)
df = filter_by_vote(df, vote_threshold=votes_support)
num_entries_with_vote = len(df)
unique_product_ids = df['asin'].unique()
num_products = len(unique_product_ids)
print(df.columns)
print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})')
print(f'num unique products {num_products}')
# df = df.groupby(df['asin'])
def not_smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))
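
# Illustrative sanity check of the sharpness measure above (a minimal sketch, not part of the analysis):
# a uniform prevalence vector scores 0, a spiky one scores higher
_uniform = np.full(5, 1/5)
_spiky = np.asarray([0.05, 0.45, 0.0, 0.45, 0.05])
assert np.isclose(not_smoothness(_uniform), 0.0)
assert not_smoothness(_spiky) > not_smoothness(_uniform)
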
# pass to dictionaries
df = df.reset_index() # make sure indexes pair with number of rows
ids = df['asin'].values
overalls = df['overall'].values
allbooks_prev = np.histogram(overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
allbooks_sharpness = not_smoothness(allbooks_prev)
print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}')
import sys
sys.exit(0)
# Defining a dict
d = defaultdict(list)
for i, id in tqdm(enumerate(ids), total=len(ids), desc='passing to dictionary'):
    d[id].append(overalls[i])
by_review_support = []
by_review_support_label = []
for reviews_support in [50, 100, 1]:
    sharpness_all = []
    num_products_with_reviews = 0
    for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'):
        # ratings = df[df["asin"] == product_id]["overall"].values
        n_ratings = len(ratings)
        if n_ratings >= reviews_support:
            # print(product_id, ratings)
            prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
            sharpness = not_smoothness(prev)
            # print(prev, sharpness)
            sharpness_all.append(sharpness)
            num_products_with_reviews += 1
    by_review_support.append(sharpness_all)
    by_review_support_label.append(f'>{reviews_support}')

    print(f'#votes-support (min number of votes): {votes_support}')
    print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%')
    print(f'#reviews-support (min number of reviews): {reviews_support}')
    print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%')

    q05 = np.percentile(sharpness_all, 5)
    q25 = np.percentile(sharpness_all, 25)
    q50 = np.percentile(sharpness_all, 50)
    q75 = np.percentile(sharpness_all, 75)
    q95 = np.percentile(sharpness_all, 95)
    print(f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}')
    print(f'ave={np.mean(sharpness_all):.5f}')
    print(f'min={np.min(sharpness_all):.5f}')
    print(f'max={np.max(sharpness_all):.5f}')
#fig, ax = plt.subplots()
#ax.boxplot(by_review_support)
#ax.set_xticklabels(by_review_support_label)
#ax.set_ylabel("Sharpness")
#ax.set_xlabel("Distributions by number of reviews")
#plt.show()

@@ -0,0 +1,209 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict
from quapy.data import LabelledCollection
from quapy.protocol import UPP
# This script computes the distribution of smoothness/sharpness of the book reviews,
# both considering all books at once and considering different groups of reviews (grouped by product).
# Mirko asked for some exploration of the values (votes, number of reviews) and for percentiles of the dataset
# shift, measured as NMD between the training-set prevalence and the sample prevalences; this script does that.
# It also generates a csv containing all the prevalence values by product
read_meta = True
def not_smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))


def _check_arrays(prevs):
    prevs = np.asarray(prevs)
    if prevs.ndim==1:
        prevs = prevs.reshape(1,-1)
    return prevs


# mean normalized match distance
def mnmd(prevs, prevs_hat):
    prevs = _check_arrays(prevs)
    prevs_hat = _check_arrays(prevs_hat)
    assert prevs.shape == prevs_hat.shape, f'wrong shape; found {prevs.shape} and {prevs_hat.shape}'
    nmds = [nmd(p, p_hat) for p, p_hat in zip(prevs, prevs_hat)]
    return np.mean(nmds)


# normalized match distance
def nmd(prev, prev_hat):
    n = len(prev)
    return (1./(n-1))*mdpa(prev, prev_hat)
"""
Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`.
The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be
computed efficiently.
[Mirko Bunse's code from Julia adapted]
"""
def mdpa(a, b):
assert len(a) == len(b), "histograms have to have the same length"
assert np.isclose(sum(a), sum(b)), "histograms have to have the same mass (difference is $(sum(a)-sum(b))"
# algorithm 1 in [cha2002measuring]
prefixsum = 0.0
distance = 0.0
for i in range(len(a)):
prefixsum += a[i] - b[i]
distance += abs(prefixsum)
return distance / sum(a) # the normalization is a fix to the original MDPA
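
# Small worked example (illustrative): for a=[0.2, 0.3, 0.5] and b=[0.5, 0.3, 0.2]
# the prefix sums are -0.3, -0.3, 0.0, so MDPA = 0.6 and NMD = 0.6/(3-1) = 0.3
assert np.isclose(mdpa([0.2, 0.3, 0.5], [0.5, 0.3, 0.2]), 0.6)
assert np.isclose(nmd([0.2, 0.3, 0.5], [0.5, 0.3, 0.2]), 0.3)
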
def prepare_vote_field(df):
    df['vote'] = df['vote'].fillna('0')
    df['vote'] = df['vote'].apply(lambda x: x.replace(',', ''))
    df['vote'] = pd.to_numeric(df['vote'])
    return df


def read_from_huge_json(filein):
    df = pd.read_json(filein, lines=True)
    df.drop(columns=[
        'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime',
        'image'
    ], inplace=True)
    df = prepare_vote_field(df)
    return df


def read_from_metadata(filein):
    df = pd.read_csv(filein)
    df['vote'] = pd.to_numeric(df['vote'])
    return df


def filter_by_vote(df, vote_threshold=1):
    df = df[df['vote'] >= vote_threshold]
    df.drop(columns=['vote'], inplace=True)
    return df


if read_meta:
    filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv'
    readfn = read_from_metadata
else:
    filein = '/media/moreo/Volume/Datasets/Amazon/raw/Books.json'
    readfn = read_from_huge_json


def create_dictionary_bookid_ratings(df):
    # pass to dictionaries
    df = df.reset_index()  # make sure indexes pair with number of rows
    ids = df['asin'].values
    overalls = df['overall'].values

    # Defining a dict
    d = defaultdict(list)
    for i, id in tqdm(enumerate(ids), total=len(ids), desc='passing to dictionary'):
        d[id].append(overalls[i])
    return d
def get_stats(distribution, msg=''):
    # computes the mean, max, min, perc5, perc25, perc50, perc75, perc95 of the distribution
    vmean = np.mean(distribution)
    vmax = np.max(distribution)
    vmin = np.min(distribution)
    q05 = np.percentile(distribution, 5)
    q25 = np.percentile(distribution, 25)
    q50 = np.percentile(distribution, 50)
    q75 = np.percentile(distribution, 75)
    q95 = np.percentile(distribution, 95)
    print(f'{msg}: percentiles {q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}')
    print(f'{msg}: ave={np.mean(distribution):.5f}')
    print(f'{msg}: max={np.max(distribution):.5f}')
    print(f'{msg}: min={np.min(distribution):.5f}')
    return vmean, vmax, vmin, q05, q25, q50, q75, q95
with open('book_stats.csv', 'wt') as foo:
    foo.write(f'minvotes\tminreviews\t#products\t#reviews'
              f'\tsharp-ave\tsharp-max\tsharp-min\t'
              f'sharp-P5\tsharp-P25\tsharp-P50\tsharp-P75\tsharp-P95'
              f'\tshift-ave\tshift-max\tshift-min\t'
              f'shift-P5\tshift-P25\tshift-P50\tshift-P75\tshift-P95'
              f'\n')
    for votes_support in [1]:
        df = readfn(filein)
        df = df[df['overall']>0]  # there are a couple of reviews with 0 stars (the min should be 1)
        num_entries = len(df)
        df = filter_by_vote(df, vote_threshold=votes_support)
        num_entries_with_vote = len(df)
        unique_product_ids = df['asin'].unique()
        num_products = len(unique_product_ids)
        print(df.columns)
        print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})')
        print(f'num unique products {num_products}')
        d = create_dictionary_bookid_ratings(df)
        for reviews_support in [100]:
            with open(f'./prevalence_votes{votes_support}_reviews{reviews_support}.csv', 'wt') as fprev:
                sharpness_all = []
                num_products_with_reviews = 0
                sel_ids, sel_overalls = [], []
                for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'):
                    n_ratings = len(ratings)
                    if n_ratings >= reviews_support:
                        sel_ids.extend([product_id] * n_ratings)
                        sel_overalls.extend(ratings)
                        prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
                        for i, prev_i in enumerate(prev):
                            fprev.write(f'{prev_i:.5f}')
                            if i < len(prev)-1:
                                fprev.write('\t')
                            else:
                                fprev.write('\n')
                        sharpness = not_smoothness(prev)
                        sharpness_all.append(sharpness)
                        num_products_with_reviews += 1

                print(f'#votes-support (min number of votes): {votes_support}')
                print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%')
                print(f'#reviews-support (min number of reviews): {reviews_support}')
                print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%')

                vmean, vmax, vmin, q05, q25, q50, q75, q95 = get_stats(sharpness_all, 'sharpness')

                allbooks_prev = np.histogram(sel_overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
                allbooks_sharpness = not_smoothness(allbooks_prev)
                print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}')

                sel_collection = LabelledCollection(instances=sel_ids, labels=sel_overalls, classes=[1,2,3,4,5])
                prot = UPP(sel_collection, sample_size=1000, repeats=5000)
                prot_iterator = prot()
                shifts = []
                for _, test_prev in tqdm(prot_iterator, total=prot.total()):
                    shifts.append(nmd(allbooks_prev, prev_hat=test_prev))
                s_mean, s_max, s_min, s_q05, s_q25, s_q50, s_q75, s_q95 = get_stats(shifts, 'shift')

                foo.write(f'{votes_support}\t{reviews_support}\t{num_products_with_reviews}\t{len(sel_ids)}'
                          f'\t{vmean:.5f}\t{vmax:.5f}\t{vmin:.5f}\t'
                          f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}'
                          f'\t{s_mean:.5f}\t{s_max:.5f}\t{s_min:.5f}\t'
                          f'{s_q05:.5f}\t{s_q25:.5f}\t{s_q50:.5f}\t{s_q75:.5f}\t{s_q95:.5f}\n')

@@ -0,0 +1,27 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import optimize
# this script searches for the prevalence values that yield the maximum or minimum values of smoothness;
# the result indicates that any linear distribution (not only the uniform one) attains the optimum (minimum sharpness)
def sharpness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))


def smoothness(p):
    return 1-sharpness(p)
nclasses = 5
uniform_distribution = np.random.rand(nclasses) #np.full(fill_value=1/nclasses, shape=nclasses)
uniform_distribution /= uniform_distribution.sum()
bounds = tuple((0, 1) for x in range(nclasses)) # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1
r = optimize.minimize(sharpness, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'minimum of sharpness function {r.x}')
r = optimize.minimize(smoothness, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'maximum of sharpness function {r.x}')
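
# Illustrative numeric check of the claim above (a minimal sketch): a linear, non-uniform
# distribution also attains (near-)zero sharpness
linear = np.asarray([0.1, 0.15, 0.2, 0.25, 0.3])  # linearly increasing and summing to 1
print(f'sharpness of a linear distribution: {sharpness(linear):.6f}')  # ~0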

@@ -0,0 +1,105 @@
import gzip
import os
from collections import Counter
from Ordinal.utils import jaggedness
import quapy as qp
import pickle
import numpy as np
import pandas as pd
base_path = '/media/moreo/Volume/Datasets/Amazon/reviews'
categories_path = '/media/moreo/Volume/Datasets/Amazon/raw/amazon_categories.txt'
def get_prevalence_merchandise(category):
    input_file = os.path.join(base_path, category+'.txt.gz')
    labels = []
    print(f'{category} starts')
    with gzip.open(input_file, 'rt') as f:
        for line in f:
            try:
                stars, doc = line.split('\t')
                labels.append(stars)
            except:
                print('error in line: ', line)
    counts = Counter(labels)
    print(f'\t{category} done')
    return counts
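
# Illustrative sketch of the line format expected by the parser above (an assumption
# inferred from the split, not verified against the data): "<stars>\t<review text>"
_example_line = '5\tGreat read, arrived on time\n'
assert _example_line.split('\t')[0] == '5'
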
target_file = './counters_Amazon_merchandise.pkl'
if not os.path.exists(target_file):
    categories = [c.strip().replace(' ', '_') for c in open(categories_path, 'rt').readlines()]
    # categories = ['Gift_Cards', 'Magazine_Subscriptions']
    counters = qp.util.parallel(get_prevalence_merchandise, categories, n_jobs=-1)
    print('saving pickle')
    pickle.dump((categories, counters), open(target_file, 'wb'), pickle.HIGHEST_PROTOCOL)
else:
    (categories, counters) = pickle.load(open(target_file, 'rb'))
index_gift_cards = categories.index('Gift_Cards')
del categories[index_gift_cards]
del counters[index_gift_cards]
class_smooth = []
for cat, counter in zip(categories, counters):
    total = sum(count for label, count in counter.items())
    counts = [counter[i] for i in map(str, [1,2,3,4,5])]
    p = np.asarray(counts)/total
    smooth = jaggedness(p)
    class_smooth.append([smooth, cat, p])
class_smooth = sorted(class_smooth)
# df = pd.DataFrame(class_smooth, columns=['smoothness', 'category', 'prevalence'])
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.5)
nrows = 7
ncols = 4
figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
with open('categories.txt', 'wt') as foo:
    foo.write(f'Category\tSmooth\tPrevalence\n')
    for i, (smooth, category, prevalence) in enumerate(class_smooth):
        row = i // 4
        col = i % 4
        # print(i, row, col)
        axis[row, col].bar([1,2,3,4,5], prevalence, width=1)
        axis[row, col].set_ylim(0, 0.75)
        axis[row, col].set_facecolor('white')
        for spine in axis[row, col].spines.values():
            spine.set_edgecolor('black')
            spine.set_linewidth(0.3)
        # axis[row, col].set_xticks(loc=0)
        if row==6:
            axis[row, col].set_xlabel("stars")
            # axis[row, col].set_xticks([1,2,3,4,5])
        # else:
        #     axis[row, col].set_xticks([])
        if col==0:
            axis[row, col].set_ylabel("")
            axis[row, col].set_yticks([])
        else:
            axis[row, col].set_ylabel("")
            axis[row, col].set_yticks([])
        category = category.replace('_', ' ').title()
        category = category.replace(' And ', ' & ')
        axis[row, col].set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)
        # axis[row, col].set_title
        foo.write(f'{category}\t{smooth}\t{prevalence}\n')
# plt.show()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Amazon_categories_plotgrid.pdf', bbox_inches='tight')

@@ -0,0 +1,137 @@
import numpy as np
from scipy.stats import wilcoxon
import quapy as qp
import os
from os.path import join
from Ordinal.tabular import Table
from utils import load_samples_folder, load_single_sample_pkl, jaggedness
from Ordinal.evaluation import nmd, mnmd
from tqdm import tqdm
import pandas as pd
from glob import glob
from pathlib import Path
"""
This script takes all results from the book domain, that correspond to the APP protocol, and filters by
smoothness so that only the 50% smoothest examples are considered, and recomputes the averages of the nmd
thus effectively reporting the results for the APP-OQ protocol
"""
def parse_str_prev(df_col):
    values = df_col.values
    array_list = [np.fromstring(array[1:-1], sep=' ') for array in values]
    return np.asarray(array_list)


def parse_result_file(path):
    df = pd.read_csv(path)
    true_prev = parse_str_prev(df['true-prev'])
    estim_prev = parse_str_prev(df['estim-prev'])
    nmd = df['nmd'].values
    return true_prev, estim_prev, nmd


def ave_jaggedness(prevs, less_percentile=1):
    jag = np.sort([jaggedness(p) for p in prevs])
    up_to = int(less_percentile * len(jag))
    return np.mean(jag[:up_to])


def retain_half_smoothest(true_prev, estim_prev, nmd):
    jag = [jaggedness(p) for p in true_prev]
    order = np.argsort(jag)
    up_to = len(order)//2
    order = order[:up_to]
    return true_prev[order], estim_prev[order], nmd[order]


def compute_half_smoothest_nmd(true_prev, estim_prev, nmd):
    _, _, nmd_smooth = retain_half_smoothest(true_prev, estim_prev, nmd)
    return nmd_smooth
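

# Minimal illustrative sketch of the filtering above (toy data; assumes jaggedness, imported
# from utils, accepts a 1-D prevalence vector as it does elsewhere in this project): with two
# test samples, only the smoother half (here, a single sample) contributes to the average NMD
_true = np.asarray([[0.2, 0.2, 0.2, 0.2, 0.2], [0.05, 0.45, 0.0, 0.45, 0.05]])
_estim = np.copy(_true)
_nmds_toy = np.asarray([0.10, 0.30])
assert len(compute_half_smoothest_nmd(_true, _estim, _nmds_toy)) == 1
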
if __name__ == '__main__':
    domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
    datapath = './data'
    in_protocol = 'app'
    out_protocol = 'app-oq'
    in_result_path = join('./results', domain, in_protocol)
    out_result_path = join('./results', domain, out_protocol)
    os.makedirs(out_result_path, exist_ok=True)

    # recompute the results in terms of APP-OQ
    result_dict = {}
    for filepath in glob(f'{in_result_path}/*).all.csv'):
        name = Path(filepath).name
        quantifier = name[:name.index('(')]
        classifier = name[name.index('(')+1:name.index(')')]
        true_prev, estim_prev, nmds = parse_result_file(filepath)
        nmds = compute_half_smoothest_nmd(true_prev, estim_prev, nmds)
        result_dict[classifier + '-' + quantifier] = nmds

    # convert to numbers and search for the best in each quantifier
    best_keys = {}
    best_nmds = {}
    for quantifier in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
        best_ave, best_key, best_nmd = None, None, None
        for classifier in ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']:
            key = classifier + '-' + quantifier
            if key in result_dict:
                nmds = result_dict[key]
                mean_val = np.mean(nmds)
                if best_ave is None or mean_val < best_ave:
                    best_ave = mean_val
                    best_key = key
                    best_nmd = nmds
        best_keys[quantifier] = best_key
        best_nmds[quantifier] = best_nmd
    # print(best_keys)

    # write a latex table
    for q in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
        print('& \multicolumn{2}{c}{'+q+'} ', end='')
    print('\\\\')
    print('\\midrule')
    for classifier in ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']:
        print(classifier + '\t', end='')
        for quantifier in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
            key = classifier + '-' + quantifier
            the_best_nmds = best_nmds[quantifier]
            if key in result_dict:
                nmds = result_dict[key]
                mean_val = np.mean(nmds)
                bold = False
                if best_keys[quantifier] == key:
                    bold = True
                else:
                    _, pval = wilcoxon(nmds, the_best_nmds)
                    if pval > 0.01:
                        bold = True
                str_mean = f'{mean_val:.4f}'
                if bold:
                    str_mean = '\\textbf{' + str_mean + '}'
                if classifier == 'LR':
                    std_val = np.std(nmds)
                    str_val = f'{str_mean} & $\pm {std_val:.4f}$'
                else:
                    rel_increment = 100 * (mean_val-np.mean(the_best_nmds)) / np.mean(the_best_nmds)
                    sign = '+' if rel_increment>0 else ''
                    str_val = f'{str_mean} & ({sign}{rel_increment:.1f}\\%)'
            else:
                str_val = '\multicolumn{2}{c}{---}'
            str_val = ' & ' + str_val
            print(str_val, end='')
        print('\\\\')

@@ -0,0 +1,296 @@
from copy import deepcopy
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR
from statsmodels.miscmodels.ordinal_model import OrderedModel
import mord
from sklearn.utils.class_weight import compute_class_weight
class OrderedLogisticRegression:
    def __init__(self, model='logit'):
        assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit'
        self.model = model

    def fit(self, X, y):
        if issparse(X):
            self.svd = TruncatedSVD(500)
            X = self.svd.fit_transform(X)
        self.learner = OrderedModel(y, X, distr=self.model)
        self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True)

    def predict(self, X):
        prob = self.predict_proba(X)
        return np.argmax(prob, axis=1)

    def predict_proba(self, X):
        if issparse(X):
            assert hasattr(self, 'svd'), \
                'X matrix in predict is sparse, but the method has not been fit with sparse type'
            X = self.svd.transform(X)
        # the fitted statsmodels results object exposes the underlying OrderedModel as .model
        return self.res_prob.model.predict(self.res_prob.params, exog=X)
class StackedClassifier:  # aka Funnelling Monolingual
    def __init__(self, base_estimator=LogisticRegression()):
        if not hasattr(base_estimator, 'predict_proba'):
            print('the estimator does not seem to be probabilistic: calibrating')
            base_estimator = CalibratedClassifierCV(base_estimator)
        # self.base = deepcopy(OneVsRestClassifier(base_estimator))
        # self.meta = deepcopy(OneVsRestClassifier(base_estimator))
        self.base = deepcopy(base_estimator)
        self.meta = deepcopy(base_estimator)
        self.norm = StandardScaler()

    def fit(self, X, y):
        self.base.fit(X, y)
        P = self.base.predict_proba(X)
        P = self.norm.fit_transform(P)
        self.meta.fit(P, y)
        return self

    def predict(self, X):
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict(P)

    def predict_proba(self, X):
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict_proba(P)
class RegressionQuantification:
    def __init__(self,
                 base_quantifier,
                 regression='svr',
                 val_samples_generator=None,
                 norm=True):

        self.base_quantifier = base_quantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            self.reg = regression
        # self.reg = MultiTaskLassoCV(normalize=norm)
        # self.reg = KernelRidge(kernel='rbf')
        # self.reg = LassoLarsCV(normalize=norm)
        # self.reg = MultiTaskElasticNetCV(normalize=norm)  # <- good
        # self.reg = LinearRegression(normalize=norm)  # <- good
        # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm))  # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False))  # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(SGDRegressor())  # slow, does not work
        self.regression = regression
        self.val_samples_generator = val_samples_generator
        # self.norm = StandardScaler()
        # self.covs = covs

    def generate_validation_samples(self):
        Xs, ys = [], []
        for instances, prevalence in self.val_samples_generator():
            ys.append(prevalence)
            Xs.append(self.base_quantifier.quantify(instances))
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)
        return Xs, ys

    def fit(self, data):
        print('fitting quantifier')
        if data is not None:
            self.base_quantifier.fit(data)
        print('generating val samples')
        Xs, ys = self.generate_validation_samples()
        # Xs = self.norm.fit_transform(Xs)
        print('fitting regressor')
        self.reg.fit(Xs, ys)
        print('[done]')
        return self

    def quantify(self, instances):
        Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
        # Xs = self.norm.transform(Xs)
        Xs = self.reg.predict(Xs).flatten()
        # Xs = self.norm.inverse_transform(Xs)
        Xs = np.clip(Xs, 0, 1)
        adjusted = Xs / Xs.sum()
        # adjusted = np.clip(Xs, 0, 1)
        return adjusted

    def get_params(self, deep=True):
        return self.base_quantifier.get_params()

    def set_params(self, **params):
        self.base_quantifier.set_params(**params)
class LAD(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, class_weight=None):
        self.C = C
        self.class_weight = class_weight

    def fit(self, X, y, sample_weight=None):
        self.regressor = LinearSVR(C=self.C)
        # self.regressor = SVR()
        # self.regressor = Ridge(normalize=True)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        r = self.regressor.predict(X)
        c = np.round(r)
        c[c<0]=0
        c[c>(self.nclasses-1)]=self.nclasses-1
        return c.astype(int)

    # def predict_proba(self, X):
    #     r = self.regressor.predict(X)
    #     nC = len(self.classes_)
    #     r = np.clip(r, 0, nC - 1)
    #     dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
    #     invdist = 1 - dists
    #     invdist[invdist < 0] = 0
    #     return invdist

    def decision_function(self, X):
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'C':self.C, 'class_weight': self.class_weight}

    def set_params(self, **params):
        self.C = params['C']
        self.class_weight = params['class_weight']
class OrdinalRidge(BaseEstimator, ClassifierMixin):
    def __init__(self, alpha=1.0, class_weight=None, normalize=False):
        self.alpha = alpha
        self.class_weight = class_weight
        self.normalize = normalize

    def fit(self, X, y, sample_weight=None):
        self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        r = self.regressor.predict(X)
        c = np.round(r)
        c[c<0]=0
        c[c>(self.nclasses-1)]=self.nclasses-1
        return c.astype(int)

    # def predict_proba(self, X):
    #     r = self.regressor.predict(X)
    #     nC = len(self.classes_)
    #     r = np.clip(r, 0, nC - 1)
    #     dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
    #     invdist = 1 - dists
    #     invdist[invdist < 0] = 0
    #     return invdist

    def decision_function(self, X):
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'alpha':self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}

    def set_params(self, **params):
        self.alpha = params['alpha']
        self.class_weight = params['class_weight']
        self.normalize = params['normalize']
# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
class LogisticAT(mord.LogisticAT):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticAT, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight)


class LogisticSE(mord.LogisticSE):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticSE, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticSE, self).fit(X, y, sample_weight=sample_weight)


class LogisticIT(mord.LogisticIT):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticIT, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticIT, self).fit(X, y, sample_weight=sample_weight)


# regression-based ordinal regression (see https://pythonhosted.org/mord/)
# class LAD(mord.LAD):
#     def fit(self, X, y):
#         self.classes_ = sorted(np.unique(y))
#         return super().fit(X, y)


# class OrdinalRidge(mord.OrdinalRidge):
#     def fit(self, X, y):
#         self.classes_ = sorted(np.unique(y))
#         return super().fit(X, y)
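

if __name__ == '__main__':
    # Illustrative usage sketch of the class-weighted wrappers above, on synthetic data
    # (the array shapes and the 0..4 label coding are arbitrary choices for this demo)
    rng = np.random.RandomState(0)
    X_demo = rng.randn(200, 10)
    y_demo = rng.randint(0, 5, size=200)  # ordinal labels coded 0..k-1
    clf = LogisticAT(alpha=1.0, class_weight='balanced')
    clf.fit(X_demo, y_demo)
    print('LogisticAT predictions:', clf.predict(X_demo[:5]))
    print('LogisticAT posteriors:', clf.predict_proba(X_demo[:2]))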

@@ -0,0 +1,78 @@
import gzip
import os
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np
import pandas as pd
nrows = 3
ncols = 4
prevalences = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:]
#prevalences = prevalences[:nrows*ncols]
print(prevalences)
n = prevalences.shape[1]
class_smooth = []
for i, sample in enumerate(prevalences):
    p = sample
    smooth = jaggedness(p)
    class_smooth.append([smooth, f'Sample {i+1}', p])
# these lines pick the nrows*ncols examples going from the least jagged to the most jagged
# in (approximately) equal steps
class_smooth = sorted(class_smooth)
class_smooth = class_smooth[::len(class_smooth)//(nrows*ncols)]
class_smooth = class_smooth[:nrows*ncols]
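
# For example (illustrative): with 36 sorted samples and a 3x4 grid, the stride is
# 36 // 12 = 3, so the kept entries are those at positions 0, 3, 6, ..., 33
_demo = list(range(36))
assert _demo[::36 // 12][:12] == list(range(0, 36, 3))
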
# print(class_smooth)
# print(len(class_smooth))
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.5)
maxy = np.max(prevalences) + 0.1
class_labels = np.arange(1,n+1)
figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
for i, (smooth, category, prevalence) in enumerate(class_smooth):
    row = i // ncols
    col = i % ncols
    # print(i, row, col)
    # axis[row, col].bar(list(range(1,n+1)), prevalence, width=1)
    axis[row, col].bar(class_labels, prevalence, width=1)
    axis[row, col].set_ylim(0, maxy)
    axis[row, col].set_facecolor('white')
    for spine in axis[row, col].spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(0.3)
    if row==nrows-1:
        axis[row, col].set_xlabel("energy bin")
        axis[row, col].set_xticks(class_labels)
    else:
        axis[row, col].set_xlabel("")
        axis[row, col].set_xticks([])
    axis[row, col].set_ylabel("")
    axis[row, col].set_yticks([])
    category = category.replace('_', ' ').title()
    category = category.replace(' And ', ' & ')
    axis[row, col].set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)
    # axis[row, col].set_title
    print(smooth, category, prevalence)
# plt.show()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Telescope_sample_plotgrid.pdf', bbox_inches='tight')

@@ -0,0 +1,13 @@
import pickle
target_file = './counters_Amazon_merchandise.pkl'
(categories, counters) = pickle.load(open(target_file, 'rb'))
print(categories)
print(counters)
with open('categories.txt', 'wt') as foo:
    for counter, category in zip(counters, categories):
        foo.write(f'{category}\t{counter["1"]}\t{counter["2"]}\t{counter["3"]}\t{counter["4"]}\t{counter["5"]}\n')