forked from moreo/QuaPy
106 lines
3.3 KiB
Python
106 lines
3.3 KiB
Python
import gzip
|
|
import os
|
|
from collections import Counter
|
|
from Ordinal.utils import jaggedness
|
|
import quapy as qp
|
|
import pickle
|
|
import numpy as np
|
|
import pandas as pd
|
|
|
|
# Directory holding one gzipped review file per category; each file is opened
# as `<base_path>/<category>.txt.gz` by get_prevalence_merchandise.
base_path = '/media/moreo/Volume/Datasets/Amazon/reviews'
|
|
# Text file listing the category names, one per line; when the list is loaded
# each name is stripped and its spaces are replaced by underscores.
categories_path = '/media/moreo/Volume/Datasets/Amazon/raw/amazon_categories.txt'
|
|
|
|
|
|
def get_prevalence_merchandise(category):
    """Count how many reviews of one category carry each star label.

    Reads ``<base_path>/<category>.txt.gz`` in text mode, one review per line.
    A line is expected to consist of exactly two tab-separated fields: the
    star label followed by the review text. Lines that do not split into
    exactly two fields are reported on stdout and skipped.

    :param category: category name, used as the file stem under ``base_path``
    :return: a ``collections.Counter`` mapping each star label (kept as the
        string read from the file) to its number of occurrences
    """
    input_file = os.path.join(base_path, category + '.txt.gz')
    # Count on the fly instead of collecting every label in a list first; the
    # resulting Counter is the same but no per-review list is kept in memory.
    counts = Counter()
    print(f'{category} starts')
    with gzip.open(input_file, 'rt') as f:
        for line in f:
            try:
                # Only the label is needed; the review text is discarded.
                stars, _ = line.split('\t')
            except ValueError:
                # Too few or too many tab-separated fields. Only ValueError is
                # caught so that KeyboardInterrupt and friends still propagate.
                print('error in line: ', line)
            else:
                counts[stars] += 1
    print(f'\t{category} done')
    return counts
|
|
|
|
# Pickle cache holding the tuple (categories, counters), so the review files
# only have to be scanned once.
target_file = './counters_Amazon_merchandise.pkl'

if not os.path.exists(target_file):
    # No cache yet: read the category list and count the star labels of every
    # category in parallel.
    with open(categories_path, 'rt') as categories_file:
        categories = [c.strip().replace(' ', '_') for c in categories_file]

    counters = qp.util.parallel(get_prevalence_merchandise, categories, n_jobs=-1)

    print('saving pickle')
    with open(target_file, 'wb') as pickle_file:
        pickle.dump((categories, counters), pickle_file, pickle.HIGHEST_PROTOCOL)
else:
    # NOTE: pickle.load executes arbitrary code from the file; only load a
    # cache that was produced by this script.
    with open(target_file, 'rb') as pickle_file:
        (categories, counters) = pickle.load(pickle_file)
|
|
|
|
# Remove the 'Gift_Cards' entry from both parallel lists, using the same
# position in each so that categories and counters stay aligned.
index_gift_cards = categories.index('Gift_Cards')
categories.pop(index_gift_cards)
counters.pop(index_gift_cards)
|
|
|
|
# For every category, turn the star counts into a prevalence vector over the
# labels '1'..'5' and score it with jaggedness(); entries are
# [score, category, prevalence].
class_smooth = []
for cat, counter in zip(categories, counters):
    # The denominator is the total over *all* labels found in the counter,
    # not only the five star labels selected below.
    total = sum(counter.values())
    counts = [counter[str(stars)] for stars in range(1, 6)]
    p = np.asarray(counts) / total
    smooth = jaggedness(p)
    class_smooth.append([smooth, cat, p])

# Order by score, then by category name. The explicit key keeps the sort from
# ever comparing the prevalence arrays, which would raise a ValueError
# (ambiguous truth value) whenever two entries tie on both score and name.
class_smooth = sorted(class_smooth, key=lambda entry: (entry[0], entry[1]))
|
|
|
|
# df = pd.DataFrame(class_smooth, columns=['smoothness', 'category', 'prevalence'])
|
|
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
|
|
# Global seaborn look-and-feel for the figure.
# NOTE(review): seaborn documents `sns.set` as an alias of `set_theme`, so the
# last call may reset the context/style selected by the two calls before it.
# Confirm before reordering or removing any of them; the calls are kept as-is
# so the rendered figure does not change.
sns.set_theme('paper')
sns.set_style('dark')
sns.set(font_scale=0.5)

# One small bar chart per category, laid out row by row on an nrows x ncols
# grid in the order of class_smooth. The same entries are also written to a
# tab-separated text file.
nrows = 7
ncols = 4
figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
with open('categories.txt', 'wt') as foo:
    foo.write('Category\tSmooth\tPrevalence\n')
    for i, (smooth, category, prevalence) in enumerate(class_smooth):
        # Grid position derived from ncols rather than a hard-coded 4.
        row, col = divmod(i, ncols)
        ax = axis[row, col]

        ax.bar([1,2,3,4,5], prevalence, width=1)
        ax.set_ylim(0, 0.75)
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_edgecolor('black')
            spine.set_linewidth(0.3)

        # Only the bottom row gets an x-axis label.
        if row == nrows - 1:
            ax.set_xlabel("stars")

        # Every panel, whatever its column, has no y label and no y ticks.
        ax.set_ylabel("")
        ax.set_yticks([])

        # Human-readable name: 'Home_and_Kitchen' -> 'Home & Kitchen'.
        category = category.replace('_', ' ').title()
        category = category.replace(' And ', ' & ')
        # y=0.75 places the title inside the axes area.
        ax.set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)

        # The table uses the prettified name and the full-precision score.
        foo.write(f'{category}\t{smooth}\t{prevalence}\n')

# Panels are packed with no spacing between them before saving.
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Amazon_categories_plotgrid.pdf', bbox_inches='tight')
|
|
|
|
|