diff --git a/Retrieval/experiments.py b/Retrieval/experiments.py index a0ee37f..6e643a7 100644 --- a/Retrieval/experiments.py +++ b/Retrieval/experiments.py @@ -51,22 +51,10 @@ def methods(classifier, class_name=None, binarize=False): 'years_category':0.03 } - # yield ('Naive', Naive()) - # yield ('NaiveHalf', Naive()) yield ('NaiveQuery', Naive()) yield ('CC', ClassifyAndCount(classifier)) - # yield ('PCC', PCC(classifier)) - # yield ('ACC', ACC(classifier, val_split=5, n_jobs=-1)) yield ('PACC', PACC(classifier, val_split=5, n_jobs=-1)) - # yield ('EMQ', EMQ(classifier, exact_train_prev=True)) - # yield ('EMQ-Platt', EMQ(classifier, exact_train_prev=True, recalib='platt')) - # yield ('EMQh', EMQ(classifier, exact_train_prev=False)) - # yield ('EMQ-BCTS', EMQ(classifier, exact_train_prev=True, recalib='bcts')) - # yield ('EMQ-TS', EMQ(classifier, exact_train_prev=False, recalib='ts')) - # yield ('EMQ-NBVS', EMQ(classifier, exact_train_prev=False, recalib='nbvs')) - # yield ('EMQ-VS', EMQ(classifier, exact_train_prev=False, recalib='vs')) yield ('KDEy-ML', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=kde_param.get(class_name, 0.01))) - # yield ('KDE01', KDEyML(classifier, val_split=5, n_jobs=-1, bandwidth=0.01)) if binarize: yield ('M3b', M3rND_ModelB(classifier)) yield ('M3b+', M3rND_ModelB(classifier)) @@ -153,10 +141,6 @@ def run_experiment(): method.fit(train_col, val_split=train_col, fit_classifier=False) elif method_name == 'Naive': method.fit(train_col) - elif method_name == 'NaiveHalf': - n = len(ytr)//2 - train_col = LabelledCollection(Xtr[:n], ytr[:n], classes=classifier.classes_) - method.fit(train_col) test_col = LabelledCollection(Xte, yte, classes=classifier.classes_) rKL_estim, rKL_true = [], [] diff --git a/Retrieval/plot_mrae_xaxis_k.py b/Retrieval/plot_mrae_xaxis_k.py index 7b6007d..90e360b 100644 --- a/Retrieval/plot_mrae_xaxis_k.py +++ b/Retrieval/plot_mrae_xaxis_k.py @@ -1,3 +1,4 @@ +import itertools import os.path import pickle import numpy as np @@ -15,11 +16,18 @@ method_names = [name for name, *other in methods(None, 'continent')] all_results = {} +class_name_label = { + 'continent': 'Geographic Location', + 'gender': 'Gender', + 'years_category': 'Age of Topic' +} + # loads all MRAE results, and returns a dictionary containing the values, which is indexed by: # class_name -> data_size -> method_name -> k -> stat -> float # where stat is "mean", "std", "max" def load_all_results(): + for class_name in CLASS_NAMES: all_results[class_name] = {} @@ -56,13 +64,14 @@ results = load_all_results() # - the x-axis displays the Ks for class_name in CLASS_NAMES: - for data_size in DATA_SIZES: + for data_size in DATA_SIZES[:1]: - log = True + log = class_name=='gender' fig, ax = plt.subplots() max_means = [] + markers = itertools.cycle(['o', 's', '^', 'D', 'v', '*', '+']) for method_name in method_names: # class_name -> data_size -> method_name -> k -> stat -> float means = [ @@ -79,18 +88,23 @@ for class_name in CLASS_NAMES: means = np.asarray(means) stds = np.asarray(stds) - line = ax.plot(Ks, means, 'o-', label=method_name, color=None) + method_name = method_name.replace('NaiveQuery', 'Naive@$k$') + marker = next(markers) + line = ax.plot(Ks, means, 'o-', label=method_name, color=None, linewidth=3, markersize=10, marker=marker) color = line[-1].get_color() if log: ax.set_yscale('log') # ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color) + ax.grid(True, which='both', axis='y', color='gray', linestyle='--', linewidth=0.3) ax.set_xlabel('k') - ax.set_ylabel('RAE' + ('(log scale)' if log else '')) - ax.set_title(f'{class_name} from {data_size}') + ax.set_ylabel('RAE' + (' (log scale)' if log else '')) + data_size_label = '$\mathcal{L}_{10\mathrm{K}}$' + ax.set_title(f'{class_name_label[class_name]} from {data_size_label}') ax.set_ylim([0, max(max_means)*1.05]) - ax.legend() + if class_name == 'years_category': + ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) os.makedirs(f'plots/var_k/{class_name}', exist_ok=True) plotpath = f'plots/var_k/{class_name}/{data_size}_mrae.pdf' diff --git a/Retrieval/plot_mrae_xaxis_size.py b/Retrieval/plot_mrae_xaxis_size.py index 55797cf..0a362af 100644 --- a/Retrieval/plot_mrae_xaxis_size.py +++ b/Retrieval/plot_mrae_xaxis_size.py @@ -1,3 +1,4 @@ +import itertools import os.path from Retrieval.experiments import methods from Retrieval.commons import CLASS_NAMES, Ks, DATA_SIZES @@ -12,6 +13,11 @@ method_names = [name for name, *other in methods(None)] all_results = {} +class_name_label = { + 'continent': 'Geographic Location', + 'gender': 'Gender', + 'years_category': 'Age of Topic' +} # loads all MRAE results, and returns a dictionary containing the values, which is indexed by: # class_name -> data_size -> method_name -> k -> stat -> float @@ -20,14 +26,18 @@ results = load_all_results() # generates the class-independent, size-independent plots for y-axis=MRAE in which: # - the x-axis displays the Ks -for class_name in CLASS_NAMES: - for k in Ks: +# X_DATA_SIZES = [int(x.replace('K', '000').replace('M', '000000').replace('FULL', '3250000')) for x in DATA_SIZES] +X_DATA_SIZES = [x.replace('FULL', '3.25M') for x in DATA_SIZES] - log = True +for class_name in CLASS_NAMES: + for k in [100]: #Ks: + + log = class_name=='gender' fig, ax = plt.subplots() max_means = [] + markers = itertools.cycle(['o', 's', '^', 'D', 'v', '*', '+']) for method_name in method_names: # class_name -> data_size -> method_name -> k -> stat -> float means = [ @@ -43,18 +53,22 @@ for class_name in CLASS_NAMES: max_means.append(max(means)) style = 'o-' if method_name != 'CC' else '--' - line = ax.plot(DATA_SIZES, means, style, label=method_name, color=None) + method_name = method_name.replace('NaiveQuery', 'Naive@$k$') + marker=next(markers) + line = ax.plot(X_DATA_SIZES, means, style, label=method_name, color=None, linewidth=3, markersize=10, marker=marker) color = line[-1].get_color() if log: ax.set_yscale('log') # ax.fill_between(Ks, means - stds, means + stds, alpha=0.3, color=color) + ax.grid(True, which='both', axis='y', color='gray', linestyle='--', linewidth=0.3) ax.set_xlabel('training pool size') - ax.set_ylabel('RAE' + ('(log scale)' if log else '')) - ax.set_title(f'{class_name} from {k=}') + ax.set_ylabel('RAE' + (' (log scale)' if log else '')) + ax.set_title(f'{class_name_label[class_name]} at exposure {k=}') ax.set_ylim([0, max(max_means)*1.05]) - ax.legend() + if class_name == 'years_category': + ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) os.makedirs(f'plots/var_size/{class_name}', exist_ok=True) plotpath = f'plots/var_size/{class_name}/{k}_mrae.pdf' diff --git a/Retrieval/relscore_distribution.py b/Retrieval/relscore_distribution.py index aac52d5..5c4e097 100644 --- a/Retrieval/relscore_distribution.py +++ b/Retrieval/relscore_distribution.py @@ -9,14 +9,16 @@ import matplotlib.pyplot as plt """ Plots the distribution of (predicted) relevance score for the test samples and for the training samples wrt: -- training pool size (100K, 500K, 1M, FULL) +- training pool size (10K, 50K, 100K, 500K, 1M, FULL) - rank """ data_home = 'data' -for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']: +up_to = 250 + +for class_name in ['continent']: # 'num_sitelinks_category', 'relative_pageviews_category', 'years_category', 'continent', 'gender']: test_added = False Mtrs, Mtes, source = [], [], [] for data_size in DATA_SIZES: @@ -24,12 +26,14 @@ for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'yea class_home = join(data_home, class_name, data_size) classifier_path = join('classifiers', 'FULL', f'classifier_{class_name}.pkl') test_rankings_path = join(data_home, 'testRanking_Results.json') + test_query_prevs_path = join(data_home, 'prevelance_vectors_judged_docs.json') _, classifier = pickle.load(open(classifier_path, 'rb')) experiment_prot = RetrievedSamples( class_home, test_rankings_path, + test_query_prevs_path, vectorizer=None, class_name=class_name, classes=classifier.classes_ @@ -38,11 +42,12 @@ for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'yea Mtr = [] Mte = [] pbar = tqdm(experiment_prot(), total=experiment_prot.total()) - for train, test in pbar: + for train, test, *_ in pbar: Xtr, ytr, score_tr = train Xte, yte, score_te = test - Mtr.append(score_tr) - Mte.append(score_te) + if len(score_tr) >= up_to: + Mtr.append(score_tr) + Mte.append(score_te) Mtrs.append(Mtr) if not test_added: @@ -51,8 +56,11 @@ for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'yea source.append(data_size) fig, ax = plt.subplots() - train_source = ['train-'+s for s in source] - Ms = list(zip(Mtrs, train_source))+list(zip(Mtes, ['test'])) + # train_source = ['train-'+s for s in source] + train_source = ['$\mathcal{L}_{'+s.replace('FULL', '3.25M').replace('K','\mathrm{K}').replace('M','\mathrm{M}')+'}$' for s in source] + # Ms = list(zip(Mtrs, train_source))+list(zip(Mtes, ['test'])) + Ms = list(zip(Mtrs, train_source)) + list(zip(Mtes, ['$\mathcal{U}_{(3.25\mathrm{M})}$'])) + for M, source in Ms: M = np.asarray(list(zip_longest(*M, fillvalue=np.nan))).T @@ -68,17 +76,18 @@ for class_name in ['num_sitelinks_category', 'relative_pageviews_category', 'yea ax.fill_between(range(num_docs), mean_values - std_errors, mean_values + std_errors, alpha=0.3, color=color) - ax.set_xlabel('Doc. Rank') - ax.set_ylabel('Rel. Score') - ax.set_title(class_name) + ax.set_xlabel('rank ($k$)') + ax.set_ylabel('predicted relevance score') + ax.set_title(class_name.replace('continent', 'Geographic Location')) + ax.set_xlim((0,up_to)) - ax.legend() + ax.legend(loc='center left', bbox_to_anchor=(1, 0.5)) # plt.show() os.makedirs('plots', exist_ok=True) - plotpath = f'plots/{class_name}.pdf' + plotpath = f'plots/{class_name}_rel_distrbution.pdf' print(f'saving plot in {plotpath}') - plt.savefig(plotpath) + plt.savefig(plotpath, bbox_inches='tight')