diff --git a/Ordinal/Amazon_Telescople_errorbars_prevalence_plots.py b/Ordinal/Amazon_Telescople_errorbars_prevalence_plots.py new file mode 100644 index 0000000..54a4267 --- /dev/null +++ b/Ordinal/Amazon_Telescople_errorbars_prevalence_plots.py @@ -0,0 +1,52 @@ +import gzip +import os +import sys +from collections import Counter +from Ordinal.utils import jaggedness +import pickle +import numpy as np + +amazon = np.genfromtxt('prevalence_votes1_reviews100.csv', delimiter='\t') +telescope = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:] + +nclasses_amazon = amazon.shape[1] +nclasses_telescope = telescope.shape[1] + +jags_amazon = np.asarray([jaggedness(p) for p in amazon]) +jags_telescope = np.asarray([jaggedness(p) for p in telescope]) + +import matplotlib.pyplot as plt +from matplotlib.pyplot import figure +import seaborn as sns + +sns.set_theme('paper') +sns.set_style('dark') +sns.set(font_scale=0.7) + +# figure, axis = plt.subplots(1, 2, figsize=(8, 7)) +ymax = 0.75 + +figure(figsize=(8, 4), dpi=300) + +ax=plt.subplot(1, 2, 1) +classes = np.arange(1, nclasses_amazon+1) +plt.bar(classes, np.mean(amazon, axis=0), yerr=np.std(amazon, axis=0), width=1) +ax.set_ylim(0, ymax) +ax.set_xlabel("stars") +ax.set_xticks(classes) +ax.set_title(f'Amazon Books ({jags_amazon.mean():.4f})') + +ax=plt.subplot(1, 2, 2) +# ax=plt.subplot(1, 1, 1) +classes = np.arange(1, nclasses_telescope+1) +plt.bar(classes, np.mean(telescope, axis=0), yerr=np.std(telescope, axis=0), width=1) +ax.set_ylim(0, ymax) +ax.set_xlabel("energy bin") +ax.set_xticks(classes) +ax.set_title(f'FACT Samples ({jags_telescope.mean():.4f})') + + +plt.subplots_adjust(wspace=0.1, hspace=0) +plt.savefig('prevalence_averages.pdf', bbox_inches='tight') + + diff --git a/Ordinal/Telescople_prevalence_plot.py b/Ordinal/Telescople_prevalence_plot.py new file mode 100644 index 0000000..048db3e --- /dev/null +++ b/Ordinal/Telescople_prevalence_plot.py @@ -0,0 +1,43 @@ +import gzip +import os +import sys +from collections import Counter +from Ordinal.utils import jaggedness +import pickle +import numpy as np + +telescope = np.genfromtxt('fact_expectation.txt') +nclasses_telescope = len(telescope) + +jag = jaggedness(telescope) +print(jag) + +import matplotlib.pyplot as plt +from matplotlib.pyplot import figure +import seaborn as sns + +sns.set_theme('paper') +sns.set_style('dark') +sns.set(font_scale=0.7) + +# figure, axis = plt.subplots(1, 2, figsize=(8, 7)) +ymax = 0.4 + +figure(figsize=(8, 4), dpi=300) + +ax=plt.subplot(1, 1, 1) +classes = np.arange(1, nclasses_telescope+1) +plt.bar(classes, telescope, width=1) +# ax.bar_label(telescope) +ax.set_ylim(0, ymax) +ax.set_xlabel("energy bin") +ax.set_xticks(classes) +ax.set_title(f'FACT data ({jag:.4f})') +for index, data in enumerate(telescope): + plt.text(x=index+0.56 , y=data+0.005 , s=f"{data:.4f}") + + +plt.subplots_adjust(wspace=0.1, hspace=0) +plt.savefig('telescope_prevalence.pdf', bbox_inches='tight') + + diff --git a/Ordinal/__init__.py b/Ordinal/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Ordinal/amazon_books_by_product_smooth_test.py b/Ordinal/amazon_books_by_product_smooth_test.py new file mode 100644 index 0000000..846ff4d --- /dev/null +++ b/Ordinal/amazon_books_by_product_smooth_test.py @@ -0,0 +1,136 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from tqdm import tqdm +from collections import defaultdict + + +# this script computes the distribution of smoothness/sharpness of the books; +# either considering all books, as well as considering different groups of books' reviews (by product) + +# filein='/media/moreo/Volume/Datasets/Amazon/raw/Gift_Cards.json.gz' +# df = pd.read_json(filein, lines=True, compression='gzip') + +read_meta = True + + +def prepare_vote_field(df): + df['vote'] = df['vote'].fillna('0') + df['vote'] = df['vote'].apply(lambda x: x.replace(',', '')) + df['vote'] = pd.to_numeric(df['vote']) + return df + + +def read_from_huge_json(filein): + df = pd.read_json(filein, lines=True) + df.drop(columns=[ + 'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime', + 'image' + ], inplace=True) + + df = prepare_vote_field(df) + return df + + +def read_from_metadata(filein): + df = pd.read_csv(filein) + df['vote'] = pd.to_numeric(df['vote']) + return df + + +def filter_by_vote(df, vote_threshold=1): + df = df[df['vote'] >= vote_threshold] + df.drop(columns=['vote'], inplace=True) + return df + + +if read_meta: + filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv' + readfn = read_from_metadata +else: + filein='/media/moreo/Volume/Datasets/Amazon/raw/Books.json' + readfn = read_from_huge_json + +votes_support=9 + +df = readfn(filein) + +num_entries = len(df) +# df = prepare_vote_field(df) +df = filter_by_vote(df, vote_threshold=votes_support) +num_entries_with_vote = len(df) + +unique_product_ids = df['asin'].unique() +num_products = len(unique_product_ids) + +print(df.columns) +print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})') +print(f'num unique products {num_products}') + + +# df = df.groupby(df['asin']) + +def not_smoothness(p): + return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:])) + + +# pass to dictionaries +df = df.reset_index() # make sure indexes pair with number of rows + +ids = df['asin'].values +overalls = df['overall'].values + +allbooks_prev = np.histogram(overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0] +allbooks_sharpness = not_smoothness(allbooks_prev) +print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}') + +import sys +sys.exit(0) + +# Defining a dict +d = defaultdict(list) +for i, id in tqdm(enumerate(ids), total=len(ids), desc='passing to dictionary'): + d[id].append(overalls[i]) + + +by_review_support = [] +by_review_support_label = [] +for reviews_support in [50, 100, 1]: + sharpness_all = [] + num_products_with_reviews = 0 + for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'): + # ratings = df[df["asin"] == product_id]["overall"].values + n_ratings = len(ratings) + if n_ratings >= reviews_support: + # print(product_id, ratings) + prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0] + sharpness = not_smoothness(prev) + # print(prev, sharpness) + sharpness_all.append(sharpness) + num_products_with_reviews+=1 + by_review_support.append(sharpness_all) + by_review_support_label.append(f'>{reviews_support}') + + print(f'#votes-support (min number of votes): {votes_support}') + print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%') + + print(f'#reviews-support (min number of reviews): {reviews_support}') + print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%') + + q05 = np.percentile(sharpness_all, 5) + q25 = np.percentile(sharpness_all, 25) + q50 = np.percentile(sharpness_all, 50) + q75 = np.percentile(sharpness_all, 75) + q95 = np.percentile(sharpness_all, 95) + print(f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}') + print(f'ave={np.mean(sharpness_all):.5f}') + print(f'min={np.min(sharpness_all):.5f}') + print(f'max={np.max(sharpness_all):.5f}') + +#fig, ax = plt.subplots() +#ax.boxplot(by_review_support) +#ax.set_xticklabels(by_review_support_label) +#ax.set_ylabel("Sharpness") +#ax.set_xlabel("Distributions by number of reviews") +#plt.show() + diff --git a/Ordinal/amazon_books_by_product_smooth_test_various_values.py b/Ordinal/amazon_books_by_product_smooth_test_various_values.py new file mode 100644 index 0000000..cd1cb25 --- /dev/null +++ b/Ordinal/amazon_books_by_product_smooth_test_various_values.py @@ -0,0 +1,209 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from tqdm import tqdm +from collections import defaultdict + +from quapy.data import LabelledCollection +from quapy.protocol import UPP + +# this script computes the distribution of smoothness/sharpness of the books; +# either considering all books, as well as considering different groups of books' reviews (by product) +# Mirko asked for some exploration of values (votes, num reviews), and percentiles of dataset shift as measured in terms +# of NMD between training set prevalences and sample prevalences; this script does this +# It also generates a csv containing all the prevalence values by product + + +read_meta = True + + +def not_smoothness(p): + return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:])) + + +def _check_arrays(prevs): + prevs = np.asarray(prevs) + if prevs.ndim==1: + prevs = prevs.reshape(1,-1) + return prevs + + +# mean normalized match distance +def mnmd(prevs, prevs_hat): + prevs = _check_arrays(prevs) + prevs_hat = _check_arrays(prevs_hat) + assert prevs.shape == prevs_hat.shape, f'wrong shape; found {prevs.shape} and {prevs_hat.shape}' + + nmds = [nmd(p, p_hat) for p, p_hat in zip(prevs, prevs_hat)] + return np.mean(nmds) + + +# normalized match distance +def nmd(prev, prev_hat): + n = len(prev) + return (1./(n-1))*mdpa(prev, prev_hat) + + +""" +Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`. +The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be +computed efficiently. +[Mirko Bunse's code from Julia adapted] +""" +def mdpa(a, b): + assert len(a) == len(b), "histograms have to have the same length" + assert np.isclose(sum(a), sum(b)), "histograms have to have the same mass (difference is $(sum(a)-sum(b))" + + # algorithm 1 in [cha2002measuring] + prefixsum = 0.0 + distance = 0.0 + for i in range(len(a)): + prefixsum += a[i] - b[i] + distance += abs(prefixsum) + + return distance / sum(a) # the normalization is a fix to the original MDPA + + +def prepare_vote_field(df): + df['vote'] = df['vote'].fillna('0') + df['vote'] = df['vote'].apply(lambda x: x.replace(',', '')) + df['vote'] = pd.to_numeric(df['vote']) + return df + + +def read_from_huge_json(filein): + df = pd.read_json(filein, lines=True) + df.drop(columns=[ + 'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime', + 'image' + ], inplace=True) + + df = prepare_vote_field(df) + return df + + +def read_from_metadata(filein): + df = pd.read_csv(filein) + df['vote'] = pd.to_numeric(df['vote']) + return df + + +def filter_by_vote(df, vote_threshold=1): + df = df[df['vote'] >= vote_threshold] + df.drop(columns=['vote'], inplace=True) + return df + + +if read_meta: + filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv' + readfn = read_from_metadata +else: + filein='/media/moreo/Volume/Datasets/Amazon/raw/Books.json' + readfn = read_from_huge_json + + +def create_dictionary_bookid_ratings(df): + # pass to dictionaries + df = df.reset_index() # make sure indexes pair with number of rows + + ids = df['asin'].values + overalls = df['overall'].values + + # Defining a dict + d = defaultdict(list) + for i, id in tqdm(enumerate(ids), total=len(ids), desc='passing to dictionary'): + d[id].append(overalls[i]) + + return d + +def get_stats(distribution, msg=''): + # computes the mean, max, min, perc5, perc25, perc50, perc75, perc95 of the distribution + vmean = np.mean(distribution) + vmax = np.max(distribution) + vmin = np.min(distribution) + q05 = np.percentile(distribution, 5) + q25 = np.percentile(distribution, 25) + q50 = np.percentile(distribution, 50) + q75 = np.percentile(distribution, 75) + q95 = np.percentile(distribution, 95) + print(f'{msg}: percentiles {q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}') + print(f'{msg}: ave={np.mean(distribution):.5f}') + print(f'{msg}: max={np.max(distribution):.5f}') + print(f'{msg}: min={np.min(distribution):.5f}') + return vmean, vmax, vmin, q05, q25, q50, q75, q95 + +with open('book_stats.csv', 'wt') as foo: + foo.write(f'minvotes\tminreviews\t#products\t#reviews' + f'\tsharp-ave\tsharp-max\tsharp-min\t' + f'sharp-P5\tsharp-P25\tsharp-P50\tsharp-P75\tsharp-P95' + f'\tshift-ave\tshift-max\tshift-min\t' + f'shift-P5\tshift-P25\tshift-P50\tshift-P75\tshift-P95' + f'\n') + + for votes_support in [1]: + + df = readfn(filein) + df = df[df['overall']>0] # there are a couple of reviews with 0 stars (the min should be 1) + + num_entries = len(df) + df = filter_by_vote(df, vote_threshold=votes_support) + num_entries_with_vote = len(df) + + unique_product_ids = df['asin'].unique() + num_products = len(unique_product_ids) + + print(df.columns) + print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})') + print(f'num unique products {num_products}') + + d = create_dictionary_bookid_ratings(df) + + for reviews_support in [100]: + with open(f'./prevalence_votes{votes_support}_reviews{reviews_support}.csv', 'wt') as fprev: + sharpness_all = [] + num_products_with_reviews = 0 + sel_ids, sel_overalls = [], [] + for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'): + n_ratings = len(ratings) + if n_ratings >= reviews_support: + sel_ids.extend([product_id] * n_ratings) + sel_overalls.extend(ratings) + + prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0] + for i, prev_i in enumerate(prev): + fprev.write(f'{prev_i:.5f}') + if i < len(prev)-1: + fprev.write('\t') + else: + fprev.write('\n') + sharpness = not_smoothness(prev) + sharpness_all.append(sharpness) + num_products_with_reviews+=1 + + print(f'#votes-support (min number of votes): {votes_support}') + print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%') + + print(f'#reviews-support (min number of reviews): {reviews_support}') + print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%') + + vmean, vmax, vmin, q05, q25, q50, q75, q95 = get_stats(sharpness_all, 'sharpness') + + allbooks_prev = np.histogram(sel_overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0] + allbooks_sharpness = not_smoothness(allbooks_prev) + print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}') + + sel_collection = LabelledCollection(instances=sel_ids, labels=sel_overalls, classes=[1,2,3,4,5]) + prot = UPP(sel_collection, sample_size=1000, repeats=5000) + prot_iterator = prot() + shifts = [] + for _, test_prev in tqdm(prot_iterator, total=prot.total()): + shifts.append(nmd(allbooks_prev, prev_hat=test_prev)) + s_mean, s_max, s_min, s_q05, s_q25, s_q50, s_q75, s_q95 = get_stats(shifts, 'shift') + + foo.write(f'{votes_support}\t{reviews_support}\t{num_products_with_reviews}\t{len(sel_ids)}' + f'\t{vmean:.5f}\t{vmax:.5f}\t{vmin:.5f}\t' + f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}' + f'\t{s_mean:.5f}\t{s_max:.5f}\t{s_min:.5f}\t' + f'{s_q05:.5f}\t{s_q25:.5f}\t{s_q50:.5f}\t{s_q75:.5f}\t{s_q95:.5f}\n') + + diff --git a/Ordinal/amazon_most_less_smooth_test.py b/Ordinal/amazon_most_less_smooth_test.py new file mode 100644 index 0000000..5c7521c --- /dev/null +++ b/Ordinal/amazon_most_less_smooth_test.py @@ -0,0 +1,27 @@ +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from scipy import optimize + + +# this script checks for the prevalence values that yield the maximum or minimum values of smoothness; +# the result indicates any linear distribution (not only the uniform) satisfies this requirement + +def sharpness(p): + return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:])) + +def smoothness(p): + return 1-sharpness(p) + +nclasses = 5 +uniform_distribution = np.random.rand(nclasses) #np.full(fill_value=1/nclasses, shape=nclasses) +uniform_distribution /= uniform_distribution.sum() + +bounds = tuple((0, 1) for x in range(nclasses)) # values in [0,1] +constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)}) # values summing up to 1 +r = optimize.minimize(sharpness, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) + +print(f'minimum of sharpness function {r.x}') + +r = optimize.minimize(smoothness, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints) +print(f'maximum of sharpness function {r.x}') \ No newline at end of file diff --git a/Ordinal/amazon_prevalence_plotgrid.py b/Ordinal/amazon_prevalence_plotgrid.py new file mode 100644 index 0000000..efddacb --- /dev/null +++ b/Ordinal/amazon_prevalence_plotgrid.py @@ -0,0 +1,105 @@ +import gzip +import os +from collections import Counter +from Ordinal.utils import jaggedness +import quapy as qp +import pickle +import numpy as np +import pandas as pd + +base_path = '/media/moreo/Volume/Datasets/Amazon/reviews' +categories_path = '/media/moreo/Volume/Datasets/Amazon/raw/amazon_categories.txt' + + +def get_prevalence_merchandise(category): + input_file = os.path.join(base_path, category+'.txt.gz') + labels = [] + print(f'{category} starts') + with gzip.open(input_file, 'rt') as f: + for line in f: + try: + stars, doc = line.split('\t') + labels.append(stars) + except: + print('error in line: ', line) + counts = Counter(labels) + print(f'\t{category} done') + return counts + +target_file = './counters_Amazon_merchandise.pkl' + +if not os.path.exists(target_file): + categories = [c.strip().replace(' ', '_') for c in open(categories_path, 'rt').readlines()] + + # categories = ['Gift_Cards', 'Magazine_Subscriptions'] + counters = qp.util.parallel(get_prevalence_merchandise, categories, n_jobs=-1) + + print('saving pickle') + pickle.dump((categories, counters), open(target_file, 'wb'), pickle.HIGHEST_PROTOCOL) + +else: + (categories, counters) = pickle.load(open(target_file, 'rb')) + +index_gift_cards = categories.index('Gift_Cards') +del categories[index_gift_cards] +del counters[index_gift_cards] + +class_smooth = [] +for cat, counter in zip(categories, counters): + total = sum(count for label, count in counter.items()) + counts = [counter[i] for i in map(str, [1,2,3,4,5])] + p = np.asarray(counts)/total + smooth = jaggedness(p) + class_smooth.append([smooth, cat, p]) + +class_smooth = sorted(class_smooth) + +# df = pd.DataFrame(class_smooth, columns=['smoothness', 'category', 'prevalence']) + +import matplotlib.pyplot as plt +import seaborn as sns + +sns.set_theme('paper') +sns.set_style('dark') +sns.set(font_scale=0.5) + +nrows = 7 +ncols = 4 +figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows)) +with open('categories.txt', 'wt') as foo: + foo.write(f'Category\tSmooth\tPrevalence\n') + for i, (smooth, category, prevalence) in enumerate(class_smooth): + row = i // 4 + col = i % 4 + # print(i, row, col) + axis[row, col].bar([1,2,3,4,5], prevalence, width=1) + axis[row, col].set_ylim(0, 0.75) + axis[row, col].set_facecolor('white') + for spine in axis[row, col].spines.values(): + spine.set_edgecolor('black') + spine.set_linewidth(0.3) + # axis[row, col].set_xticks(loc=0) + if row==6: + axis[row, col].set_xlabel("stars") + # axis[row, col].set_xticks([1,2,3,4,5]) + # else: + # axis[row, col].set_xticks([]) + if col==0: + axis[row, col].set_ylabel("") + axis[row, col].set_yticks([]) + else: + axis[row, col].set_ylabel("") + axis[row, col].set_yticks([]) + + category = category.replace('_', ' ').title() + category = category.replace(' And ', ' & ') + axis[row, col].set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75) + # axis[row, col].set_title + + foo.write(f'{category}\t{smooth}\t{prevalence}\n') + +# plt.show() +plt.subplots_adjust(wspace=0, hspace=0) +plt.savefig('Amazon_categories_plotgrid.pdf', bbox_inches='tight') + + diff --git a/Ordinal/experiments_lr_vs_ordlr_APP-OQ.py b/Ordinal/experiments_lr_vs_ordlr_APP-OQ.py new file mode 100644 index 0000000..7d00e48 --- /dev/null +++ b/Ordinal/experiments_lr_vs_ordlr_APP-OQ.py @@ -0,0 +1,137 @@ +import numpy as np +from scipy.stats import wilcoxon + +import quapy as qp +import os +from os.path import join + +from Ordinal.tabular import Table +from utils import load_samples_folder, load_single_sample_pkl, jaggedness +from Ordinal.evaluation import nmd, mnmd +from tqdm import tqdm +import pandas as pd +from glob import glob +from pathlib import Path + + +""" +This script takes all results from the book domain, that correspond to the APP protocol, and filters by +smoothness so that only the 50% smoothest examples are considered, and recomputes the averages of the nmd +thus effectively reporting the results for the APP-OQ protocol +""" + +def parse_str_prev(df_col): + values = df_col.values + array_list = [np.fromstring(array[1:-1], sep=' ') for array in values] + return np.asarray(array_list) + +def parse_result_file(path): + df = pd.read_csv(path) + true_prev = parse_str_prev(df['true-prev']) + estim_prev = parse_str_prev(df['estim-prev']) + nmd = df['nmd'].values + return true_prev, estim_prev, nmd + +def ave_jaggedness(prevs, less_percentile=1): + jag = np.sort([jaggedness(p) for p in prevs]) + up_to = int(less_percentile * len(jag)) + return np.mean(jag[:up_to]) + + +def retain_half_smoothest(true_prev, estim_prev, nmd): + jag = [jaggedness(p) for p in true_prev] + order = np.argsort(jag) + up_to = len(order)//2 + order = order[:up_to] + return true_prev[order], estim_prev[order], nmd[order] + + +def compute_half_smoothest_nmd(true_prev, estim_prev, nmd): + _, _, nmd_smooth = retain_half_smoothest(true_prev, estim_prev, nmd) + return nmd_smooth + + +if __name__ == '__main__': + domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average' + datapath = './data' + in_protocol = 'app' + out_protocol = 'app-oq' + in_result_path = join('./results', domain, in_protocol) + out_result_path = join('./results', domain, out_protocol) + os.makedirs(out_result_path, exist_ok=True) + + # recompute the results in terms of APP-OQ + result_dict = {} + for filepath in glob(f'{in_result_path}/*).all.csv'): + name = Path(filepath).name + quantifier = name[:name.index('(')] + classifier = name[name.index('(')+1:name.index(')')] + + true_prev, estim_prev, nmds = parse_result_file(filepath) + nmds = compute_half_smoothest_nmd(true_prev, estim_prev, nmds) + + result_dict[classifier + '-' + quantifier] = nmds + + # convert to numbers and search for the best in each quantifier + best_keys = {} + best_nmds = {} + for quantifier in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']: + best_ave, best_key, best_nmd = None, None, None + for classifier in ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']: + key = classifier + '-' + quantifier + if key in result_dict: + nmds = result_dict[key] + mean_val = np.mean(nmds) + if best_ave is None or mean_val < best_ave: + best_ave = mean_val + best_key = key + best_nmd = nmds + best_keys[quantifier] = best_key + best_nmds[quantifier] = best_nmd + + # print(best_keys) + + # write a latex table + for q in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']: + print('& \multicolumn{2}{c}{'+q+'} ', end='') + print('\\\\') + print('\\midrule') + for classifier in ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']: + print(classifier + '\t', end='') + for quantifier in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']: + key = classifier + '-' + quantifier + the_best_nmds = best_nmds[quantifier] + + if key in result_dict: + nmds = result_dict[key] + mean_val = np.mean(nmds) + + bold = False + if best_keys[quantifier] == key: + bold = True + else: + _, pval = wilcoxon(nmds, the_best_nmds) + if pval > 0.01: + bold = True + + str_mean = f'{mean_val:.4f}' + if bold: + str_mean = '\\textbf{' + str_mean + '}' + + if classifier == 'LR': + std_val = np.std(nmds) + str_val = f'{str_mean} & $\pm {std_val:.4f}$' + else: + rel_increment = 100 * (mean_val-np.mean(the_best_nmds)) / np.mean(the_best_nmds) + sign = '+' if rel_increment>0 else '' + str_val = f'{str_mean} & ({sign}{rel_increment:.1f}\\%)' + else: + str_val = '\multicolumn{2}{c}{---}' + + str_val = ' & ' + str_val + + print(str_val, end='') + print('\\\\') + + + diff --git a/Ordinal/model_experimental.py b/Ordinal/model_experimental.py new file mode 100644 index 0000000..5bdb27e --- /dev/null +++ b/Ordinal/model_experimental.py @@ -0,0 +1,296 @@ +from copy import deepcopy +import numpy as np +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.calibration import CalibratedClassifierCV +from sklearn.decomposition import TruncatedSVD +from sklearn.linear_model import LogisticRegression, Ridge +from scipy.sparse import issparse +from sklearn.multiclass import OneVsRestClassifier +from sklearn.multioutput import MultiOutputRegressor +from sklearn.preprocessing import StandardScaler +from sklearn.svm import LinearSVR, SVR +from statsmodels.miscmodels.ordinal_model import OrderedModel +import mord +from sklearn.utils.class_weight import compute_class_weight + + +class OrderedLogisticRegression: + def __init__(self, model='logit'): + assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit' + self.model = model + + def fit(self, X, y): + if issparse(X): + self.svd = TruncatedSVD(500) + X = self.svd.fit_transform(X) + self.learner = OrderedModel(y, X, distr=self.model) + self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True) + + def predict(self, X): + prob = self.predict_proba(X) + return np.argmax(prob, axis=1) + + def predict_proba(self, X): + if issparse(X): + assert hasattr(self, 'svd'), \ + 'X matrix in predict is sparse, but the method has not been fit with sparse type' + X = self.svd.transform(X) + return self.res_prob.quantifier.predict(self.res_prob.params, exog=X) + + +class StackedClassifier: # aka Funnelling Monolingual + def __init__(self, base_estimator=LogisticRegression()): + if not hasattr(base_estimator, 'predict_proba'): + print('the estimator does not seem to be probabilistic: calibrating') + base_estimator = CalibratedClassifierCV(base_estimator) + # self.base = deepcopy(OneVsRestClassifier(base_estimator)) + # self.meta = deepcopy(OneVsRestClassifier(base_estimator)) + self.base = deepcopy(base_estimator) + self.meta = deepcopy(base_estimator) + self.norm = StandardScaler() + + def fit(self, X, y): + self.base.fit(X, y) + P = self.base.predict_proba(X) + P = self.norm.fit_transform(P) + self.meta.fit(P, y) + return self + + def predict(self, X): + P = self.base.predict_proba(X) + P = self.norm.transform(P) + return self.meta.predict(P) + + def predict_proba(self, X): + P = self.base.predict_proba(X) + P = self.norm.transform(P) + return self.meta.predict_proba(P) + + +class RegressionQuantification: + def __init__(self, + base_quantifier, + regression='svr', + val_samples_generator=None, + norm=True): + + self.base_quantifier = base_quantifier + if isinstance(regression, str): + assert regression in ['ridge', 'svr'], 'unknown regression model' + if regression == 'ridge': + self.reg = Ridge(normalize=norm) + elif regression == 'svr': + self.reg = MultiOutputRegressor(LinearSVR()) + else: + self.reg = regression + # self.reg = MultiTaskLassoCV(normalize=norm) + # self.reg = KernelRidge(kernel='rbf') + # self.reg = LassoLarsCV(normalize=norm) + # self.reg = MultiTaskElasticNetCV(normalize=norm) <- bien + #self.reg = LinearRegression(normalize=norm) # <- bien + # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm)) # <- bastante bien, incluso sin norm + # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False)) # <- bastante bien, incluso sin norm + # self.reg = MultiOutputRegressor(SGDRegressor()) # lento, no va + self.regression = regression + self.val_samples_generator = val_samples_generator + # self.norm = StandardScaler() + # self.covs = covs + + def generate_validation_samples(self): + Xs, ys = [], [] + for instances, prevalence in self.val_samples_generator(): + ys.append(prevalence) + Xs.append(self.base_quantifier.quantify(instances)) + Xs = np.asarray(Xs) + ys = np.asarray(ys) + return Xs, ys + + def fit(self, data): + print('fitting quantifier') + if data is not None: + self.base_quantifier.fit(data) + print('generating val samples') + Xs, ys = self.generate_validation_samples() + # Xs = self.norm.fit_transform(Xs) + print('fitting regressor') + self.reg.fit(Xs, ys) + print('[done]') + return self + + def quantify(self, instances): + Xs = self.base_quantifier.quantify(instances).reshape(1, -1) + # Xs = self.norm.transform(Xs) + Xs = self.reg.predict(Xs).flatten() + # Xs = self.norm.inverse_transform(Xs) + Xs = np.clip(Xs, 0, 1) + adjusted = Xs / Xs.sum() + # adjusted = np.clip(Xs, 0, 1) + adjusted = adjusted + return adjusted + + def get_params(self, deep=True): + return self.base_quantifier.get_params() + + def set_params(self, **params): + self.base_quantifier.set_params(**params) + + +class LAD(BaseEstimator, ClassifierMixin): + def __init__(self, C=1.0, class_weight=None): + self.C = C + self.class_weight = class_weight + + def fit(self, X, y, sample_weight=None): + self.regressor = LinearSVR(C=self.C) + # self.regressor = SVR() + # self.regressor = Ridge(normalize=True) + classes = sorted(np.unique(y)) + self.nclasses = len(classes) + if self.class_weight == 'balanced': + class_weight = compute_class_weight('balanced', classes=classes, y=y) + sample_weight = class_weight[y] + self.regressor.fit(X, y, sample_weight=sample_weight) + return self + + def predict(self, X): + r = self.regressor.predict(X) + c = np.round(r) + c[c<0]=0 + c[c>(self.nclasses-1)]=self.nclasses-1 + return c.astype(int) + + # def predict_proba(self, X): + # r = self.regressor.predict(X) + # nC = len(self.classes_) + # r = np.clip(r, 0, nC - 1) + # dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1)) + # invdist = 1 - dists + # invdist[invdist < 0] = 0 + # return invdist + + def decision_function(self, X): + r = self.regressor.predict(X) + nC = len(self.classes_) + dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1)) + invdist = 1 - dists + return invdist + + @property + def classes_(self): + return np.arange(self.nclasses) + + def get_params(self, deep=True): + return {'C':self.C, 'class_weight': self.class_weight} + + def set_params(self, **params): + self.C = params['C'] + self.class_weight = params['class_weight'] + + +class OrdinalRidge(BaseEstimator, ClassifierMixin): + def __init__(self, alpha=1.0, class_weight=None, normalize=False): + self.alpha = alpha + self.class_weight = class_weight + self.normalize = normalize + + def fit(self, X, y, sample_weight=None): + self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize) + classes = sorted(np.unique(y)) + self.nclasses = len(classes) + if self.class_weight == 'balanced': + class_weight = compute_class_weight('balanced', classes=classes, y=y) + sample_weight = class_weight[y] + self.regressor.fit(X, y, sample_weight=sample_weight) + return self + + def predict(self, X): + r = self.regressor.predict(X) + c = np.round(r) + c[c<0]=0 + c[c>(self.nclasses-1)]=self.nclasses-1 + return c.astype(int) + + # def predict_proba(self, X): + # r = self.regressor.predict(X) + # nC = len(self.classes_) + # r = np.clip(r, 0, nC - 1) + # dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1)) + # invdist = 1 - dists + # invdist[invdist < 0] = 0 + # return invdist + + def decision_function(self, X): + r = self.regressor.predict(X) + nC = len(self.classes_) + dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1)) + invdist = 1 - dists + return invdist + + @property + def classes_(self): + return np.arange(self.nclasses) + + def get_params(self, deep=True): + return {'alpha':self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize} + + def set_params(self, **params): + self.alpha = params['alpha'] + self.class_weight = params['class_weight'] + self.normalize = params['normalize'] + +# with order-aware classifiers +# threshold-based ordinal regression (see https://pythonhosted.org/mord/) +class LogisticAT(mord.LogisticAT): + def __init__(self, alpha=1.0, class_weight=None): + assert class_weight in [None, 'balanced'], 'unexpected value for class_weight' + self.class_weight = class_weight + super(LogisticAT, self).__init__(alpha=alpha) + + def fit(self, X, y, sample_weight=None): + if self.class_weight == 'balanced': + classes = sorted(np.unique(y)) + class_weight = compute_class_weight('balanced', classes=classes, y=y) + sample_weight = class_weight[y] + return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight) + + +class LogisticSE(mord.LogisticSE): + def __init__(self, alpha=1.0, class_weight=None): + assert class_weight in [None, 'balanced'], 'unexpected value for class_weight' + self.class_weight = class_weight + super(LogisticSE, self).__init__(alpha=alpha) + + def fit(self, X, y, sample_weight=None): + if self.class_weight == 'balanced': + classes = sorted(np.unique(y)) + class_weight = compute_class_weight('balanced', classes=classes, y=y) + sample_weight = class_weight[y] + return super(LogisticSE, self).fit(X, y, sample_weight=sample_weight) + + +class LogisticIT(mord.LogisticIT): + def __init__(self, alpha=1.0, class_weight=None): + assert class_weight in [None, 'balanced'], 'unexpected value for class_weight' + self.class_weight = class_weight + super(LogisticIT, self).__init__(alpha=alpha) + + def fit(self, X, y, sample_weight=None): + if self.class_weight == 'balanced': + classes = sorted(np.unique(y)) + class_weight = compute_class_weight('balanced', classes=classes, y=y) + sample_weight = class_weight[y] + return super(LogisticIT, self).fit(X, y, sample_weight=sample_weight) + + +# regression-based ordinal regression (see https://pythonhosted.org/mord/) +# class LAD(mord.LAD): +# def fit(self, X, y): +# self.classes_ = sorted(np.unique(y)) +# return super().fit(X, y) + + +# class OrdinalRidge(mord.OrdinalRidge): +# def fit(self, X, y): +# self.classes_ = sorted(np.unique(y)) +# return super().fit(X, y) + diff --git a/Ordinal/telescope_prevalence_plotgrid.py b/Ordinal/telescope_prevalence_plotgrid.py new file mode 100644 index 0000000..66d5e72 --- /dev/null +++ b/Ordinal/telescope_prevalence_plotgrid.py @@ -0,0 +1,78 @@ +import gzip +import os +from collections import Counter +from Ordinal.utils import jaggedness +import pickle +import numpy as np +import pandas as pd + + + +nrows = 3 +ncols = 4 + +prevalences = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:] +#prevalences = prevalences[:nrows*ncols] +print(prevalences) + +n = prevalences.shape[1] + +class_smooth = [] +for i, sample in enumerate(prevalences): + p = sample + smooth = jaggedness(p) + class_smooth.append([smooth, f'Sample {i+1}', p]) + +# these two lines pick the nrows*ncols examples that go from the less jagged to the most jagged +# at equal steps +class_smooth = sorted(class_smooth) +class_smooth = class_smooth[::len(class_smooth)//(nrows*ncols)] +class_smooth = class_smooth[:nrows*ncols] +# print(class_smooth) +# print(len(class_smooth)) + +import matplotlib.pyplot as plt +import seaborn as sns + +sns.set_theme('paper') +sns.set_style('dark') +sns.set(font_scale=0.5) + +maxy = np.max(prevalences) + 0.1 +class_labels = np.arange(1,n+1) + +figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows)) +for i, (smooth, category, prevalence) in enumerate(class_smooth): + row = i // ncols + col = i % ncols + # print(i, row, col) + #axis[row, col].bar(list(range(1,n+1)), prevalence, width=1) + + axis[row, col].bar(class_labels, prevalence, width=1) + axis[row, col].set_ylim(0, maxy) + axis[row, col].set_facecolor('white') + for spine in axis[row, col].spines.values(): + spine.set_edgecolor('black') + spine.set_linewidth(0.3) + + if row==nrows-1: + axis[row, col].set_xlabel("energy bin") + axis[row, col].set_xticks(class_labels) + else: + axis[row, col].set_xlabel("") + axis[row, col].set_xticks([]) + axis[row, col].set_ylabel("") + axis[row, col].set_yticks([]) + + category = category.replace('_', ' ').title() + category = category.replace(' And ', ' & ') + axis[row, col].set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75) + # axis[row, col].set_title + + print(smooth, category, prevalence) + +# plt.show() +plt.subplots_adjust(wspace=0, hspace=0) +plt.savefig('Telescope_sample_plotgrid.pdf', bbox_inches='tight') + + diff --git a/Ordinal/unpack_amazon_pickle_for_Mirko.py b/Ordinal/unpack_amazon_pickle_for_Mirko.py new file mode 100644 index 0000000..3155ebf --- /dev/null +++ b/Ordinal/unpack_amazon_pickle_for_Mirko.py @@ -0,0 +1,13 @@ +import pickle + +target_file = './counters_Amazon_merchandise.pkl' +(categories, counters) = pickle.load(open(target_file, 'rb')) + +print(categories) +print(counters) + +with open('categories.txt', 'wt') as foo: + for counter, category in zip(counters, categories): + foo.write(f'{category}\t{counter["1"]}\t{counter["2"]}\t{counter["3"]}\t{counter["4"]}\t{counter["5"]}\n') + +