forked from moreo/QuaPy
adding helper files, unorganized
This commit is contained in:
parent b756871f21
commit faa3af587c
@@ -0,0 +1,52 @@
import gzip
import os
import sys
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np

amazon = np.genfromtxt('prevalence_votes1_reviews100.csv', delimiter='\t')
telescope = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:]

nclasses_amazon = amazon.shape[1]
nclasses_telescope = telescope.shape[1]

jags_amazon = np.asarray([jaggedness(p) for p in amazon])
jags_telescope = np.asarray([jaggedness(p) for p in telescope])

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.7)

# figure, axis = plt.subplots(1, 2, figsize=(8, 7))
ymax = 0.75

figure(figsize=(8, 4), dpi=300)

ax = plt.subplot(1, 2, 1)
classes = np.arange(1, nclasses_amazon+1)
plt.bar(classes, np.mean(amazon, axis=0), yerr=np.std(amazon, axis=0), width=1)
ax.set_ylim(0, ymax)
ax.set_xlabel("stars")
ax.set_xticks(classes)
ax.set_title(f'Amazon Books ({jags_amazon.mean():.4f})')

ax = plt.subplot(1, 2, 2)
# ax = plt.subplot(1, 1, 1)
classes = np.arange(1, nclasses_telescope+1)
plt.bar(classes, np.mean(telescope, axis=0), yerr=np.std(telescope, axis=0), width=1)
ax.set_ylim(0, ymax)
ax.set_xlabel("energy bin")
ax.set_xticks(classes)
ax.set_title(f'FACT Samples ({jags_telescope.mean():.4f})')

plt.subplots_adjust(wspace=0.1, hspace=0)
plt.savefig('prevalence_averages.pdf', bbox_inches='tight')
@@ -0,0 +1,43 @@
import gzip
import os
import sys
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np

telescope = np.genfromtxt('fact_expectation.txt')
nclasses_telescope = len(telescope)

jag = jaggedness(telescope)
print(jag)

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.7)

# figure, axis = plt.subplots(1, 2, figsize=(8, 7))
ymax = 0.4

figure(figsize=(8, 4), dpi=300)

ax = plt.subplot(1, 1, 1)
classes = np.arange(1, nclasses_telescope+1)
plt.bar(classes, telescope, width=1)
# ax.bar_label(telescope)
ax.set_ylim(0, ymax)
ax.set_xlabel("energy bin")
ax.set_xticks(classes)
ax.set_title(f'FACT data ({jag:.4f})')
for index, data in enumerate(telescope):
    plt.text(x=index+0.56, y=data+0.005, s=f"{data:.4f}")

plt.subplots_adjust(wspace=0.1, hspace=0)
plt.savefig('telescope_prevalence.pdf', bbox_inches='tight')
@@ -0,0 +1,136 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict


# this script computes the distribution of smoothness/sharpness of the books,
# both for all books at once and for groups of reviews (grouped by product)

# filein='/media/moreo/Volume/Datasets/Amazon/raw/Gift_Cards.json.gz'
# df = pd.read_json(filein, lines=True, compression='gzip')

read_meta = True


def prepare_vote_field(df):
    df['vote'] = df['vote'].fillna('0')
    df['vote'] = df['vote'].apply(lambda x: x.replace(',', ''))
    df['vote'] = pd.to_numeric(df['vote'])
    return df


def read_from_huge_json(filein):
    df = pd.read_json(filein, lines=True)
    df.drop(columns=[
        'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime',
        'image'
    ], inplace=True)

    df = prepare_vote_field(df)
    return df


def read_from_metadata(filein):
    df = pd.read_csv(filein)
    df['vote'] = pd.to_numeric(df['vote'])
    return df


def filter_by_vote(df, vote_threshold=1):
    df = df[df['vote'] >= vote_threshold]
    df.drop(columns=['vote'], inplace=True)
    return df


if read_meta:
    filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv'
    readfn = read_from_metadata
else:
    filein = '/media/moreo/Volume/Datasets/Amazon/raw/Books.json'
    readfn = read_from_huge_json

votes_support = 9

df = readfn(filein)

num_entries = len(df)
# df = prepare_vote_field(df)
df = filter_by_vote(df, vote_threshold=votes_support)
num_entries_with_vote = len(df)

unique_product_ids = df['asin'].unique()
num_products = len(unique_product_ids)

print(df.columns)
print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})')
print(f'num unique products {num_products}')


# df = df.groupby(df['asin'])

def not_smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))


# pass to dictionaries
df = df.reset_index()  # make sure indexes pair with number of rows

ids = df['asin'].values
overalls = df['overall'].values

allbooks_prev = np.histogram(overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
allbooks_sharpness = not_smoothness(allbooks_prev)
print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}')

import sys
sys.exit(0)

# Defining a dict
d = defaultdict(list)
for i, id in tqdm(enumerate(ids), total=len(ids), desc='passing to dictionary'):
    d[id].append(overalls[i])


by_review_support = []
by_review_support_label = []
for reviews_support in [50, 100, 1]:
    sharpness_all = []
    num_products_with_reviews = 0
    for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'):
        # ratings = df[df["asin"] == product_id]["overall"].values
        n_ratings = len(ratings)
        if n_ratings >= reviews_support:
            # print(product_id, ratings)
            prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
            sharpness = not_smoothness(prev)
            # print(prev, sharpness)
            sharpness_all.append(sharpness)
            num_products_with_reviews += 1
    by_review_support.append(sharpness_all)
    by_review_support_label.append(f'>{reviews_support}')

    print(f'#votes-support (min number of votes): {votes_support}')
    print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%')

    print(f'#reviews-support (min number of reviews): {reviews_support}')
    print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%')

    q05 = np.percentile(sharpness_all, 5)
    q25 = np.percentile(sharpness_all, 25)
    q50 = np.percentile(sharpness_all, 50)
    q75 = np.percentile(sharpness_all, 75)
    q95 = np.percentile(sharpness_all, 95)
    print(f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}')
    print(f'ave={np.mean(sharpness_all):.5f}')
    print(f'min={np.min(sharpness_all):.5f}')
    print(f'max={np.max(sharpness_all):.5f}')

# fig, ax = plt.subplots()
# ax.boxplot(by_review_support)
# ax.set_xticklabels(by_review_support_label)
# ax.set_ylabel("Sharpness")
# ax.set_xlabel("Distributions by number of reviews")
# plt.show()
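The not_smoothness (sharpness) score used above is half the sum of squared second differences of the prevalence vector, so any flat or linearly changing distribution scores 0 and a concentrated spike scores high. A minimal sketch (not one of the committed files) showing the values it yields:

import numpy as np

def not_smoothness(p):
    # half the sum of squared second differences, as defined in the script above
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))

print(not_smoothness(np.full(5, 0.2)))                          # 0.0 -> uniform, perfectly smooth
print(not_smoothness(np.asarray([0.1, 0.15, 0.2, 0.25, 0.3])))  # 0.0 -> linear, also perfectly smooth
print(not_smoothness(np.asarray([0., 0., 1., 0., 0.])))         # 3.0 -> a single central spike, very sharp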
@@ -0,0 +1,209 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict

from quapy.data import LabelledCollection
from quapy.protocol import UPP

# this script computes the distribution of smoothness/sharpness of the books,
# both for all books at once and for groups of reviews (grouped by product)
# Mirko asked for some exploration of values (votes, num reviews), and percentiles of dataset shift as measured in terms
# of NMD between training set prevalences and sample prevalences; this script does this
# It also generates a csv containing all the prevalence values by product


read_meta = True


def not_smoothness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))


def _check_arrays(prevs):
    prevs = np.asarray(prevs)
    if prevs.ndim == 1:
        prevs = prevs.reshape(1, -1)
    return prevs


# mean normalized match distance
def mnmd(prevs, prevs_hat):
    prevs = _check_arrays(prevs)
    prevs_hat = _check_arrays(prevs_hat)
    assert prevs.shape == prevs_hat.shape, f'wrong shape; found {prevs.shape} and {prevs_hat.shape}'

    nmds = [nmd(p, p_hat) for p, p_hat in zip(prevs, prevs_hat)]
    return np.mean(nmds)


# normalized match distance
def nmd(prev, prev_hat):
    n = len(prev)
    return (1./(n-1))*mdpa(prev, prev_hat)


"""
Minimum Distance of Pair Assignments (MDPA) [cha2002measuring] for ordinal pdfs `a` and `b`.
The MDPA is a special case of the Earth Mover's Distance [rubner1998metric] that can be
computed efficiently.
[Mirko Bunse's code, adapted from Julia]
"""
def mdpa(a, b):
    assert len(a) == len(b), "histograms have to have the same length"
    assert np.isclose(sum(a), sum(b)), f"histograms have to have the same mass (difference is {sum(a)-sum(b)})"

    # algorithm 1 in [cha2002measuring]
    prefixsum = 0.0
    distance = 0.0
    for i in range(len(a)):
        prefixsum += a[i] - b[i]
        distance += abs(prefixsum)

    return distance / sum(a)  # the normalization is a fix to the original MDPA


def prepare_vote_field(df):
    df['vote'] = df['vote'].fillna('0')
    df['vote'] = df['vote'].apply(lambda x: x.replace(',', ''))
    df['vote'] = pd.to_numeric(df['vote'])
    return df


def read_from_huge_json(filein):
    df = pd.read_json(filein, lines=True)
    df.drop(columns=[
        'verified', 'reviewTime', 'reviewerID', 'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime',
        'image'
    ], inplace=True)

    df = prepare_vote_field(df)
    return df


def read_from_metadata(filein):
    df = pd.read_csv(filein)
    df['vote'] = pd.to_numeric(df['vote'])
    return df


def filter_by_vote(df, vote_threshold=1):
    df = df[df['vote'] >= vote_threshold]
    df.drop(columns=['vote'], inplace=True)
    return df


if read_meta:
    filein = '/media/moreo/Volume/Datasets/Amazon/meta/Books.csv'
    readfn = read_from_metadata
else:
    filein = '/media/moreo/Volume/Datasets/Amazon/raw/Books.json'
    readfn = read_from_huge_json


def create_dictionary_bookid_ratings(df):
    # pass to dictionaries
    df = df.reset_index()  # make sure indexes pair with number of rows

    ids = df['asin'].values
    overalls = df['overall'].values

    # Defining a dict
    d = defaultdict(list)
    for i, id in tqdm(enumerate(ids), total=len(ids), desc='passing to dictionary'):
        d[id].append(overalls[i])

    return d


def get_stats(distribution, msg=''):
    # computes the mean, max, min, perc5, perc25, perc50, perc75, perc95 of the distribution
    vmean = np.mean(distribution)
    vmax = np.max(distribution)
    vmin = np.min(distribution)
    q05 = np.percentile(distribution, 5)
    q25 = np.percentile(distribution, 25)
    q50 = np.percentile(distribution, 50)
    q75 = np.percentile(distribution, 75)
    q95 = np.percentile(distribution, 95)
    print(f'{msg}: percentiles {q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}')
    print(f'{msg}: ave={np.mean(distribution):.5f}')
    print(f'{msg}: max={np.max(distribution):.5f}')
    print(f'{msg}: min={np.min(distribution):.5f}')
    return vmean, vmax, vmin, q05, q25, q50, q75, q95


with open('book_stats.csv', 'wt') as foo:
    foo.write(f'minvotes\tminreviews\t#products\t#reviews'
              f'\tsharp-ave\tsharp-max\tsharp-min\t'
              f'sharp-P5\tsharp-P25\tsharp-P50\tsharp-P75\tsharp-P95'
              f'\tshift-ave\tshift-max\tshift-min\t'
              f'shift-P5\tshift-P25\tshift-P50\tshift-P75\tshift-P95'
              f'\n')

    for votes_support in [1]:

        df = readfn(filein)
        df = df[df['overall'] > 0]  # there are a couple of reviews with 0 stars (the min should be 1)

        num_entries = len(df)
        df = filter_by_vote(df, vote_threshold=votes_support)
        num_entries_with_vote = len(df)

        unique_product_ids = df['asin'].unique()
        num_products = len(unique_product_ids)

        print(df.columns)
        print(f'num rows {len(df)} (before vote-thresholding {num_entries}, after thresholding {num_entries_with_vote})')
        print(f'num unique products {num_products}')

        d = create_dictionary_bookid_ratings(df)

        for reviews_support in [100]:
            with open(f'./prevalence_votes{votes_support}_reviews{reviews_support}.csv', 'wt') as fprev:
                sharpness_all = []
                num_products_with_reviews = 0
                sel_ids, sel_overalls = [], []
                for product_id, ratings in tqdm(d.items(), total=len(d), desc='processing histograms'):
                    n_ratings = len(ratings)
                    if n_ratings >= reviews_support:
                        sel_ids.extend([product_id] * n_ratings)
                        sel_overalls.extend(ratings)

                        prev = np.histogram(ratings, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
                        for i, prev_i in enumerate(prev):
                            fprev.write(f'{prev_i:.5f}')
                            if i < len(prev)-1:
                                fprev.write('\t')
                            else:
                                fprev.write('\n')
                        sharpness = not_smoothness(prev)
                        sharpness_all.append(sharpness)
                        num_products_with_reviews += 1

                print(f'#votes-support (min number of votes): {votes_support}')
                print(f'#reviews with >#votes-support: {num_entries_with_vote}/{num_entries}={100*num_entries_with_vote/num_entries:.2f}%')

                print(f'#reviews-support (min number of reviews): {reviews_support}')
                print(f'#products with >#reviews-support: {num_products_with_reviews}/{num_products}={100*num_products_with_reviews/num_products:.2f}%')

                vmean, vmax, vmin, q05, q25, q50, q75, q95 = get_stats(sharpness_all, 'sharpness')

                allbooks_prev = np.histogram(sel_overalls, bins=np.array([0, 1, 2, 3, 4, 5]) + 0.5, density=True)[0]
                allbooks_sharpness = not_smoothness(allbooks_prev)
                print(f'all books prev={allbooks_prev} has sharpness {allbooks_sharpness:.4f}')

                sel_collection = LabelledCollection(instances=sel_ids, labels=sel_overalls, classes=[1, 2, 3, 4, 5])
                prot = UPP(sel_collection, sample_size=1000, repeats=5000)
                prot_iterator = prot()
                shifts = []
                for _, test_prev in tqdm(prot_iterator, total=prot.total()):
                    shifts.append(nmd(allbooks_prev, prev_hat=test_prev))
                s_mean, s_max, s_min, s_q05, s_q25, s_q50, s_q75, s_q95 = get_stats(shifts, 'shift')

                foo.write(f'{votes_support}\t{reviews_support}\t{num_products_with_reviews}\t{len(sel_ids)}'
                          f'\t{vmean:.5f}\t{vmax:.5f}\t{vmin:.5f}\t'
                          f'{q05:.5f}\t{q25:.5f}\t{q50:.5f}\t{q75:.5f}\t{q95:.5f}'
                          f'\t{s_mean:.5f}\t{s_max:.5f}\t{s_min:.5f}\t'
                          f'{s_q05:.5f}\t{s_q25:.5f}\t{s_q50:.5f}\t{s_q75:.5f}\t{s_q95:.5f}\n')
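The mdpa/nmd pair above measures ordinal shift between two histograms: mdpa accumulates the absolute prefix-sum differences (an efficient special case of the Earth Mover's Distance), and nmd divides by n-1 so that the extreme case, all mass moving across the full ordinal range, scores 1. A minimal sketch (not one of the committed files) on tiny 3-class histograms:

import numpy as np

def mdpa(a, b):
    # algorithm 1 in [cha2002measuring], normalized by the total mass, as in the script above
    prefixsum, distance = 0.0, 0.0
    for ai, bi in zip(a, b):
        prefixsum += ai - bi
        distance += abs(prefixsum)
    return distance / sum(a)

def nmd(prev, prev_hat):
    n = len(prev)
    return (1. / (n - 1)) * mdpa(prev, prev_hat)

p = np.asarray([1., 0., 0.])            # all mass in the lowest class
q = np.asarray([0., 0., 1.])            # all mass in the highest class
print(nmd(p, p))                        # 0.0 -> identical histograms
print(nmd(p, q))                        # 1.0 -> mdpa=2 (mass travels 2 positions), divided by n-1=2
print(nmd([.5, .5, 0.], [0., .5, .5]))  # 0.5 -> mdpa=1.0, half of the maximum possible shift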
@@ -0,0 +1,27 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import optimize


# this script searches for the prevalence vectors that yield the minimum and maximum values of sharpness;
# the result indicates that any linear distribution (not only the uniform one) attains the minimum

def sharpness(p):
    return 0.5 * sum((-p_prev + 2*p_i - p_next)**2 for p_prev, p_i, p_next in zip(p[:-2], p[1:-1], p[2:]))

def smoothness(p):
    return 1-sharpness(p)

nclasses = 5
uniform_distribution = np.random.rand(nclasses)  # random starting point; np.full(fill_value=1/nclasses, shape=nclasses) for the uniform one
uniform_distribution /= uniform_distribution.sum()

bounds = tuple((0, 1) for x in range(nclasses))  # values in [0,1]
constraints = ({'type': 'eq', 'fun': lambda x: 1 - sum(x)})  # values summing up to 1
r = optimize.minimize(sharpness, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)

print(f'minimum of sharpness function {r.x}')

r = optimize.minimize(smoothness, x0=uniform_distribution, method='SLSQP', bounds=bounds, constraints=constraints)
print(f'maximum of sharpness function {r.x}')
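Why the minimizer is not unique: with sharpness(p) = 0.5 * sum_i (-p_{i-1} + 2*p_i - p_{i+1})^2, any prevalence vector that is linear in the class index, p_i = a + b*i, makes every term vanish, since

    -(a + b*(i-1)) + 2*(a + b*i) - (a + b*(i+1)) = 0,

so sharpness(p) = 0 (and smoothness(p) = 1) for every such vector that also respects the bounds and the sum-to-one constraint. The SLSQP search above can therefore legitimately return any of these linear solutions, depending on the random starting point.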
@@ -0,0 +1,105 @@
import gzip
import os
from collections import Counter
from Ordinal.utils import jaggedness
import quapy as qp
import pickle
import numpy as np
import pandas as pd

base_path = '/media/moreo/Volume/Datasets/Amazon/reviews'
categories_path = '/media/moreo/Volume/Datasets/Amazon/raw/amazon_categories.txt'


def get_prevalence_merchandise(category):
    input_file = os.path.join(base_path, category+'.txt.gz')
    labels = []
    print(f'{category} starts')
    with gzip.open(input_file, 'rt') as f:
        for line in f:
            try:
                stars, doc = line.split('\t')
                labels.append(stars)
            except:
                print('error in line: ', line)
    counts = Counter(labels)
    print(f'\t{category} done')
    return counts

target_file = './counters_Amazon_merchandise.pkl'

if not os.path.exists(target_file):
    categories = [c.strip().replace(' ', '_') for c in open(categories_path, 'rt').readlines()]

    # categories = ['Gift_Cards', 'Magazine_Subscriptions']
    counters = qp.util.parallel(get_prevalence_merchandise, categories, n_jobs=-1)

    print('saving pickle')
    pickle.dump((categories, counters), open(target_file, 'wb'), pickle.HIGHEST_PROTOCOL)

else:
    (categories, counters) = pickle.load(open(target_file, 'rb'))

index_gift_cards = categories.index('Gift_Cards')
del categories[index_gift_cards]
del counters[index_gift_cards]

class_smooth = []
for cat, counter in zip(categories, counters):
    total = sum(count for label, count in counter.items())
    counts = [counter[i] for i in map(str, [1, 2, 3, 4, 5])]
    p = np.asarray(counts)/total
    smooth = jaggedness(p)
    class_smooth.append([smooth, cat, p])

class_smooth = sorted(class_smooth)

# df = pd.DataFrame(class_smooth, columns=['smoothness', 'category', 'prevalence'])

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.5)

nrows = 7
ncols = 4
figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
with open('categories.txt', 'wt') as foo:
    foo.write(f'Category\tSmooth\tPrevalence\n')
    for i, (smooth, category, prevalence) in enumerate(class_smooth):
        row = i // 4
        col = i % 4
        # print(i, row, col)
        axis[row, col].bar([1, 2, 3, 4, 5], prevalence, width=1)
        axis[row, col].set_ylim(0, 0.75)
        axis[row, col].set_facecolor('white')
        for spine in axis[row, col].spines.values():
            spine.set_edgecolor('black')
            spine.set_linewidth(0.3)
        # axis[row, col].set_xticks(loc=0)
        if row == 6:
            axis[row, col].set_xlabel("stars")
            # axis[row, col].set_xticks([1,2,3,4,5])
        # else:
        #     axis[row, col].set_xticks([])
        if col == 0:
            axis[row, col].set_ylabel("")
            axis[row, col].set_yticks([])
        else:
            axis[row, col].set_ylabel("")
            axis[row, col].set_yticks([])

        category = category.replace('_', ' ').title()
        category = category.replace(' And ', ' & ')
        axis[row, col].set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)
        # axis[row, col].set_title

        foo.write(f'{category}\t{smooth}\t{prevalence}\n')

# plt.show()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Amazon_categories_plotgrid.pdf', bbox_inches='tight')
@@ -0,0 +1,137 @@
import numpy as np
from scipy.stats import wilcoxon

import quapy as qp
import os
from os.path import join

from Ordinal.tabular import Table
from utils import load_samples_folder, load_single_sample_pkl, jaggedness
from Ordinal.evaluation import nmd, mnmd
from tqdm import tqdm
import pandas as pd
from glob import glob
from pathlib import Path


"""
This script takes all results from the book domain that correspond to the APP protocol, filters them by
smoothness so that only the 50% smoothest samples are retained, and recomputes the nmd averages,
thus effectively reporting the results for the APP-OQ protocol.
"""

def parse_str_prev(df_col):
    values = df_col.values
    array_list = [np.fromstring(array[1:-1], sep=' ') for array in values]
    return np.asarray(array_list)

def parse_result_file(path):
    df = pd.read_csv(path)
    true_prev = parse_str_prev(df['true-prev'])
    estim_prev = parse_str_prev(df['estim-prev'])
    nmd = df['nmd'].values
    return true_prev, estim_prev, nmd

def ave_jaggedness(prevs, less_percentile=1):
    jag = np.sort([jaggedness(p) for p in prevs])
    up_to = int(less_percentile * len(jag))
    return np.mean(jag[:up_to])


def retain_half_smoothest(true_prev, estim_prev, nmd):
    jag = [jaggedness(p) for p in true_prev]
    order = np.argsort(jag)
    up_to = len(order)//2
    order = order[:up_to]
    return true_prev[order], estim_prev[order], nmd[order]


def compute_half_smoothest_nmd(true_prev, estim_prev, nmd):
    _, _, nmd_smooth = retain_half_smoothest(true_prev, estim_prev, nmd)
    return nmd_smooth


if __name__ == '__main__':
    domain = 'Books-roberta-base-finetuned-pkl/checkpoint-1188-average'
    datapath = './data'
    in_protocol = 'app'
    out_protocol = 'app-oq'
    in_result_path = join('./results', domain, in_protocol)
    out_result_path = join('./results', domain, out_protocol)
    os.makedirs(out_result_path, exist_ok=True)

    # recompute the results in terms of APP-OQ
    result_dict = {}
    for filepath in glob(f'{in_result_path}/*).all.csv'):
        name = Path(filepath).name
        quantifier = name[:name.index('(')]
        classifier = name[name.index('(')+1:name.index(')')]

        true_prev, estim_prev, nmds = parse_result_file(filepath)
        nmds = compute_half_smoothest_nmd(true_prev, estim_prev, nmds)

        result_dict[classifier + '-' + quantifier] = nmds

    # search for the best (lowest mean nmd) classifier for each quantifier
    best_keys = {}
    best_nmds = {}
    for quantifier in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
        best_ave, best_key, best_nmd = None, None, None
        for classifier in ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']:
            key = classifier + '-' + quantifier
            if key in result_dict:
                nmds = result_dict[key]
                mean_val = np.mean(nmds)
                if best_ave is None or mean_val < best_ave:
                    best_ave = mean_val
                    best_key = key
                    best_nmd = nmds
        best_keys[quantifier] = best_key
        best_nmds[quantifier] = best_nmd

    # print(best_keys)

    # write a latex table
    for q in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
        print('& \multicolumn{2}{c}{'+q+'} ', end='')
    print('\\\\')
    print('\\midrule')
    for classifier in ['LR', 'OLR-AT', 'OLR-IT', 'ORidge', 'LAD']:
        print(classifier + '\t', end='')
        for quantifier in ['CC', 'PCC', 'ACC', 'PACC', 'SLD']:
            key = classifier + '-' + quantifier
            the_best_nmds = best_nmds[quantifier]

            if key in result_dict:
                nmds = result_dict[key]
                mean_val = np.mean(nmds)

                bold = False
                if best_keys[quantifier] == key:
                    bold = True
                else:
                    _, pval = wilcoxon(nmds, the_best_nmds)
                    if pval > 0.01:
                        bold = True

                str_mean = f'{mean_val:.4f}'
                if bold:
                    str_mean = '\\textbf{' + str_mean + '}'

                if classifier == 'LR':
                    std_val = np.std(nmds)
                    str_val = f'{str_mean} & $\pm {std_val:.4f}$'
                else:
                    rel_increment = 100 * (mean_val-np.mean(the_best_nmds)) / np.mean(the_best_nmds)
                    sign = '+' if rel_increment > 0 else ''
                    str_val = f'{str_mean} & ({sign}{rel_increment:.1f}\\%)'
            else:
                str_val = '\multicolumn{2}{c}{---}'

            str_val = ' & ' + str_val

            print(str_val, end='')
        print('\\\\')
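The APP-OQ selection above boils down to an argsort on per-sample jaggedness followed by keeping the first (smoothest) half and averaging the nmd over it. A minimal sketch (not one of the committed files; it uses the second-difference score from the other scripts in this commit as a stand-in for Ordinal.utils.jaggedness, whose definition is not shown here):

import numpy as np

def second_diff_score(p):  # stand-in for jaggedness: lower = smoother
    return 0.5 * sum((-a + 2*b - c)**2 for a, b, c in zip(p[:-2], p[1:-1], p[2:]))

true_prev = np.asarray([
    [0.2, 0.2, 0.2, 0.2, 0.2],    # uniform: score 0.0
    [0.0, 0.0, 1.0, 0.0, 0.0],    # spiked:  score 3.0
    [0.1, 0.15, 0.2, 0.25, 0.3],  # linear:  score 0.0
    [0.5, 0.0, 0.0, 0.0, 0.5],    # bimodal: score 0.25
])
nmds = np.asarray([0.10, 0.30, 0.12, 0.25])  # made-up per-sample NMD values

order = np.argsort([second_diff_score(p) for p in true_prev])[:len(true_prev) // 2]
print(order)               # [0 2] -> the uniform and the linear samples are retained
print(nmds[order].mean())  # the APP-OQ average is computed over this smoother half only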
@@ -0,0 +1,296 @@
from copy import deepcopy
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, Ridge
from scipy.sparse import issparse
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVR, SVR
from statsmodels.miscmodels.ordinal_model import OrderedModel
import mord
from sklearn.utils.class_weight import compute_class_weight


class OrderedLogisticRegression:
    def __init__(self, model='logit'):
        assert model in ['logit', 'probit'], 'unknown ordered model, valid ones are logit or probit'
        self.model = model

    def fit(self, X, y):
        if issparse(X):
            self.svd = TruncatedSVD(500)
            X = self.svd.fit_transform(X)
        self.learner = OrderedModel(y, X, distr=self.model)
        self.res_prob = self.learner.fit(method='bfgs', disp=False, skip_hessian=True)

    def predict(self, X):
        prob = self.predict_proba(X)
        return np.argmax(prob, axis=1)

    def predict_proba(self, X):
        if issparse(X):
            assert hasattr(self, 'svd'), \
                'X matrix in predict is sparse, but the method has not been fit with sparse type'
            X = self.svd.transform(X)
        return self.res_prob.model.predict(self.res_prob.params, exog=X)  # class probabilities from the fitted OrderedModel


class StackedClassifier:  # aka Funnelling Monolingual
    def __init__(self, base_estimator=LogisticRegression()):
        if not hasattr(base_estimator, 'predict_proba'):
            print('the estimator does not seem to be probabilistic: calibrating')
            base_estimator = CalibratedClassifierCV(base_estimator)
        # self.base = deepcopy(OneVsRestClassifier(base_estimator))
        # self.meta = deepcopy(OneVsRestClassifier(base_estimator))
        self.base = deepcopy(base_estimator)
        self.meta = deepcopy(base_estimator)
        self.norm = StandardScaler()

    def fit(self, X, y):
        self.base.fit(X, y)
        P = self.base.predict_proba(X)
        P = self.norm.fit_transform(P)
        self.meta.fit(P, y)
        return self

    def predict(self, X):
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict(P)

    def predict_proba(self, X):
        P = self.base.predict_proba(X)
        P = self.norm.transform(P)
        return self.meta.predict_proba(P)


class RegressionQuantification:
    def __init__(self,
                 base_quantifier,
                 regression='svr',
                 val_samples_generator=None,
                 norm=True):

        self.base_quantifier = base_quantifier
        if isinstance(regression, str):
            assert regression in ['ridge', 'svr'], 'unknown regression model'
            if regression == 'ridge':
                self.reg = Ridge(normalize=norm)
            elif regression == 'svr':
                self.reg = MultiOutputRegressor(LinearSVR())
        else:
            self.reg = regression
        # self.reg = MultiTaskLassoCV(normalize=norm)
        # self.reg = KernelRidge(kernel='rbf')
        # self.reg = LassoLarsCV(normalize=norm)
        # self.reg = MultiTaskElasticNetCV(normalize=norm)  # <- good
        # self.reg = LinearRegression(normalize=norm)  # <- good
        # self.reg = MultiOutputRegressor(ARDRegression(normalize=norm))  # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(BayesianRidge(normalize=False))  # <- quite good, even without norm
        # self.reg = MultiOutputRegressor(SGDRegressor())  # slow, does not work
        self.regression = regression
        self.val_samples_generator = val_samples_generator
        # self.norm = StandardScaler()
        # self.covs = covs

    def generate_validation_samples(self):
        Xs, ys = [], []
        for instances, prevalence in self.val_samples_generator():
            ys.append(prevalence)
            Xs.append(self.base_quantifier.quantify(instances))
        Xs = np.asarray(Xs)
        ys = np.asarray(ys)
        return Xs, ys

    def fit(self, data):
        print('fitting quantifier')
        if data is not None:
            self.base_quantifier.fit(data)
        print('generating val samples')
        Xs, ys = self.generate_validation_samples()
        # Xs = self.norm.fit_transform(Xs)
        print('fitting regressor')
        self.reg.fit(Xs, ys)
        print('[done]')
        return self

    def quantify(self, instances):
        Xs = self.base_quantifier.quantify(instances).reshape(1, -1)
        # Xs = self.norm.transform(Xs)
        Xs = self.reg.predict(Xs).flatten()
        # Xs = self.norm.inverse_transform(Xs)
        Xs = np.clip(Xs, 0, 1)
        adjusted = Xs / Xs.sum()
        # adjusted = np.clip(Xs, 0, 1)
        return adjusted

    def get_params(self, deep=True):
        return self.base_quantifier.get_params()

    def set_params(self, **params):
        self.base_quantifier.set_params(**params)


class LAD(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, class_weight=None):
        self.C = C
        self.class_weight = class_weight

    def fit(self, X, y, sample_weight=None):
        self.regressor = LinearSVR(C=self.C)
        # self.regressor = SVR()
        # self.regressor = Ridge(normalize=True)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        r = self.regressor.predict(X)
        c = np.round(r)
        c[c < 0] = 0
        c[c > (self.nclasses-1)] = self.nclasses-1
        return c.astype(int)

    # def predict_proba(self, X):
    #     r = self.regressor.predict(X)
    #     nC = len(self.classes_)
    #     r = np.clip(r, 0, nC - 1)
    #     dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
    #     invdist = 1 - dists
    #     invdist[invdist < 0] = 0
    #     return invdist

    def decision_function(self, X):
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1, 1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'C': self.C, 'class_weight': self.class_weight}

    def set_params(self, **params):
        self.C = params['C']
        self.class_weight = params['class_weight']


class OrdinalRidge(BaseEstimator, ClassifierMixin):
    def __init__(self, alpha=1.0, class_weight=None, normalize=False):
        self.alpha = alpha
        self.class_weight = class_weight
        self.normalize = normalize

    def fit(self, X, y, sample_weight=None):
        self.regressor = Ridge(alpha=self.alpha, normalize=self.normalize)
        classes = sorted(np.unique(y))
        self.nclasses = len(classes)
        if self.class_weight == 'balanced':
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        self.regressor.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        r = self.regressor.predict(X)
        c = np.round(r)
        c[c < 0] = 0
        c[c > (self.nclasses-1)] = self.nclasses-1
        return c.astype(int)

    # def predict_proba(self, X):
    #     r = self.regressor.predict(X)
    #     nC = len(self.classes_)
    #     r = np.clip(r, 0, nC - 1)
    #     dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1,1))
    #     invdist = 1 - dists
    #     invdist[invdist < 0] = 0
    #     return invdist

    def decision_function(self, X):
        r = self.regressor.predict(X)
        nC = len(self.classes_)
        dists = np.abs(np.tile(np.arange(nC), (len(r), 1)) - r.reshape(-1, 1))
        invdist = 1 - dists
        return invdist

    @property
    def classes_(self):
        return np.arange(self.nclasses)

    def get_params(self, deep=True):
        return {'alpha': self.alpha, 'class_weight': self.class_weight, 'normalize': self.normalize}

    def set_params(self, **params):
        self.alpha = params['alpha']
        self.class_weight = params['class_weight']
        self.normalize = params['normalize']


# with order-aware classifiers
# threshold-based ordinal regression (see https://pythonhosted.org/mord/)
class LogisticAT(mord.LogisticAT):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticAT, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticAT, self).fit(X, y, sample_weight=sample_weight)


class LogisticSE(mord.LogisticSE):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticSE, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticSE, self).fit(X, y, sample_weight=sample_weight)


class LogisticIT(mord.LogisticIT):
    def __init__(self, alpha=1.0, class_weight=None):
        assert class_weight in [None, 'balanced'], 'unexpected value for class_weight'
        self.class_weight = class_weight
        super(LogisticIT, self).__init__(alpha=alpha)

    def fit(self, X, y, sample_weight=None):
        if self.class_weight == 'balanced':
            classes = sorted(np.unique(y))
            class_weight = compute_class_weight('balanced', classes=classes, y=y)
            sample_weight = class_weight[y]
        return super(LogisticIT, self).fit(X, y, sample_weight=sample_weight)


# regression-based ordinal regression (see https://pythonhosted.org/mord/)
# class LAD(mord.LAD):
#     def fit(self, X, y):
#         self.classes_ = sorted(np.unique(y))
#         return super().fit(X, y)


# class OrdinalRidge(mord.OrdinalRidge):
#     def fit(self, X, y):
#         self.classes_ = sorted(np.unique(y))
#         return super().fit(X, y)
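A minimal usage sketch for the LAD wrapper above: it treats the ordered labels as regression targets for LinearSVR and maps the raw predictions back to classes by rounding and clipping. The import line is hypothetical, since the diff does not show the path of the file these classes live in:

import numpy as np
from sklearn.datasets import make_classification
from models import LAD  # hypothetical module name: LAD is the class defined in the file above

# synthetic data with labels 0..4, treated here as if they were ordered
X, y = make_classification(n_samples=500, n_features=20, n_informative=10,
                           n_classes=5, random_state=0)

lad = LAD(C=1.0).fit(X, y)
preds = lad.predict(X)             # integer labels clipped to {0, ..., nclasses-1}
scores = lad.decision_function(X)  # shape (n_samples, nclasses); larger for classes closer to the raw regression output
print(preds[:10], scores.shape)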
@@ -0,0 +1,78 @@
import gzip
import os
from collections import Counter
from Ordinal.utils import jaggedness
import pickle
import numpy as np
import pandas as pd


nrows = 3
ncols = 4

prevalences = np.genfromtxt('fact_real_prevalences.csv', delimiter=',')[1:]
# prevalences = prevalences[:nrows*ncols]
print(prevalences)

n = prevalences.shape[1]

class_smooth = []
for i, sample in enumerate(prevalences):
    p = sample
    smooth = jaggedness(p)
    class_smooth.append([smooth, f'Sample {i+1}', p])

# these lines pick nrows*ncols examples that go from the least jagged to the most jagged
# at (roughly) equal steps
class_smooth = sorted(class_smooth)
class_smooth = class_smooth[::len(class_smooth)//(nrows*ncols)]
class_smooth = class_smooth[:nrows*ncols]
# print(class_smooth)
# print(len(class_smooth))

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.5)

maxy = np.max(prevalences) + 0.1
class_labels = np.arange(1, n+1)

figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
for i, (smooth, category, prevalence) in enumerate(class_smooth):
    row = i // ncols
    col = i % ncols
    # print(i, row, col)
    # axis[row, col].bar(list(range(1,n+1)), prevalence, width=1)

    axis[row, col].bar(class_labels, prevalence, width=1)
    axis[row, col].set_ylim(0, maxy)
    axis[row, col].set_facecolor('white')
    for spine in axis[row, col].spines.values():
        spine.set_edgecolor('black')
        spine.set_linewidth(0.3)

    if row == nrows-1:
        axis[row, col].set_xlabel("energy bin")
        axis[row, col].set_xticks(class_labels)
    else:
        axis[row, col].set_xlabel("")
        axis[row, col].set_xticks([])
    axis[row, col].set_ylabel("")
    axis[row, col].set_yticks([])

    category = category.replace('_', ' ').title()
    category = category.replace(' And ', ' & ')
    axis[row, col].set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)
    # axis[row, col].set_title

    print(smooth, category, prevalence)

# plt.show()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Telescope_sample_plotgrid.pdf', bbox_inches='tight')
@@ -0,0 +1,13 @@
import pickle

target_file = './counters_Amazon_merchandise.pkl'
(categories, counters) = pickle.load(open(target_file, 'rb'))

print(categories)
print(counters)

with open('categories.txt', 'wt') as foo:
    for counter, category in zip(counters, categories):
        foo.write(f'{category}\t{counter["1"]}\t{counter["2"]}\t{counter["3"]}\t{counter["4"]}\t{counter["5"]}\n')