# QuaPy/Ordinal/amazon_prevalence_plotgrid.py

import gzip
import os
from collections import Counter
from Ordinal.utils import jaggedness
import quapy as qp
import pickle
import numpy as np
import pandas as pd

base_path = '/media/moreo/Volume/Datasets/Amazon/reviews'
categories_path = '/media/moreo/Volume/Datasets/Amazon/raw/amazon_categories.txt'


def get_prevalence_merchandise(category):
    """Count the star labels of every review stored for one category.

    Reads the gzip-compressed text file ``<base_path>/<category>.txt.gz`` line
    by line. Each line is expected to hold exactly two tab-separated fields:
    the star label and the review text. Lines that do not split into exactly
    two fields are reported on stdout and skipped.

    :param category: file stem of the category (the name without ``.txt.gz``)
    :return: a ``Counter`` mapping each star label (kept as the string read
        from the file) to the number of lines carrying it
    """
    input_file = os.path.join(base_path, category+'.txt.gz')
    # labels are tallied as they stream by, so no per-line list is kept in memory
    counts = Counter()
    print(f'{category} starts')
    with gzip.open(input_file, 'rt') as f:
        for line in f:
            try:
                stars, _ = line.split('\t')
            except ValueError:
                # wrong number of tab-separated fields; only this error is
                # swallowed so that Ctrl-C and real I/O failures still surface
                print('error in line: ', line)
            else:
                counts[stars] += 1
    print(f'\t{category} done')
    return counts

# The per-category counters are cached in a pickle in the working directory so
# that later runs can skip re-reading the review files.
target_file = './counters_Amazon_merchandise.pkl'

if not os.path.exists(target_file):
    # one category name per line; spaces become underscores to form the file
    # stems that get_prevalence_merchandise looks for
    with open(categories_path, 'rt') as fin:
        categories = [c.strip().replace(' ', '_') for c in fin]

    # categories = ['Gift_Cards', 'Magazine_Subscriptions']
    counters = qp.util.parallel(get_prevalence_merchandise, categories, n_jobs=-1)

    print('saving pickle')
    with open(target_file, 'wb') as fout:
        pickle.dump((categories, counters), fout, pickle.HIGHEST_PROTOCOL)

else:
    # the pickle is the cache written by the branch above; only load files
    # produced by this script, since unpickling can execute arbitrary code
    with open(target_file, 'rb') as fin:
        (categories, counters) = pickle.load(fin)

# Gift_Cards is left out of the analysis: its name and its counter sit at the
# same position of the two parallel lists, so both are dropped by that index.
gift_cards_pos = categories.index('Gift_Cards')
del categories[gift_cards_pos]
del counters[gift_cards_pos]

# Star labels are stored as strings in the counters, hence the str() keys.
star_labels = [str(stars) for stars in range(1, 6)]

# One entry per category: [jaggedness score, category name, prevalence vector].
# The prevalence vector is the count of each star label from 1 to 5 divided by
# the sum of all counts held in that category's counter.
class_smooth = []
for name, label_counts in zip(categories, counters):
    n_reviews = sum(label_counts.values())
    prevalence_vector = np.asarray([label_counts[s] for s in star_labels]) / n_reviews
    class_smooth.append([jaggedness(prevalence_vector), name, prevalence_vector])

# Entries compare element by element, so this orders them by ascending score,
# with the category name as tie-breaker.
class_smooth.sort()

# df = pd.DataFrame(class_smooth, columns=['smoothness', 'category', 'prevalence'])

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme('paper')
sns.set_style('dark')
# NOTE(review): seaborn documents `set` as an alias of `set_theme`, so this call
# presumably re-applies the default context/style on top of the two calls
# above -- confirm the intended look before reordering or removing any of them.
sns.set(font_scale=0.5)

# One small bar chart per entry of class_smooth, laid out on a fixed grid.
# The grid holds nrows*ncols panels; more entries than that would index past it.
nrows = 7
ncols = 4
star_values = [1, 2, 3, 4, 5]
figure, axis = plt.subplots(nrows, ncols, figsize=(ncols*2, nrows))
with open('categories.txt', 'wt') as foo:
    # the same data shown in the figure is also dumped as a tab-separated table
    foo.write('Category\tSmooth\tPrevalence\n')
    for i, (smooth, category, prevalence) in enumerate(class_smooth):
        # panels are filled row by row, following the order of class_smooth
        row, col = divmod(i, ncols)
        ax = axis[row, col]
        ax.bar(star_values, prevalence, width=1)
        ax.set_ylim(0, 0.75)
        ax.set_facecolor('white')
        for spine in ax.spines.values():
            spine.set_edgecolor('black')
            spine.set_linewidth(0.3)
        # only the bottom row carries the x-axis label
        if row == nrows - 1:
            ax.set_xlabel("stars")
        # no panel shows a y label or y ticks
        ax.set_ylabel("")
        ax.set_yticks([])

        # human-readable title, e.g. 'Toys_and_Games' -> 'Toys & Games'
        category = category.replace('_', ' ').title()
        category = category.replace(' And ', ' & ')
        # y < 1 places the title inside the panel, since panels have no gap between them
        ax.set_title(f'{category} ({smooth:.4f})', x=0.5, y=0.75)

        foo.write(f'{category}\t{smooth}\t{prevalence}\n')

# plt.show()
plt.subplots_adjust(wspace=0, hspace=0)
plt.savefig('Amazon_categories_plotgrid.pdf', bbox_inches='tight')