QuaPy/eDiscovery/plot.py
import matplotlib.pyplot as plt
import pandas as pd
import sys, os, pathlib
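
# Live-plotting utilities for the eDiscovery experiments. eDiscoveryPlot re-reads a
# tab-separated result file and redraws panels for recall estimates (R, Rhat, RhatCC),
# pool prevalence, quantification error, cost, and train-test shift; InOutDistPlot shows
# the distribution of posterior probabilities in the training set and in the pool.
# (Column semantics are assumed to match the scripts that write the result CSV.)
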
class eDiscoveryPlot:

    def __init__(self, datapath, outdir='./plots', loop=True, save=True, showYdist=False, showCost=True, refreshEach=10):
        self.outdir = outdir
        self.datapath = datapath
        self.plotname = pathlib.Path(datapath).name.replace(".csv", ".png")
        self.loop = loop
        self.save = save
        self.showYdist = showYdist
        self.showCost = showCost
        self.refreshEach = refreshEach

        nPlots = 4
        if showYdist:
            nPlots += 1
        if showCost:
            nPlots += 1

        if not loop:
            plt.rcParams['figure.figsize'] = [12, 12]
            plt.rcParams['figure.dpi'] = 200
        else:
            plt.rcParams['figure.figsize'] = [14, 18]
            plt.rcParams['figure.dpi'] = 50
        plt.rcParams.update({'font.size': 15})

        # plot the data
        self.fig, self.axs = plt.subplots(nPlots)
        self.calls = 0
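
    # plot() re-reads the CSV at self.datapath and redraws every panel; when self.loop is
    # True, the axes are cleared after a short pause so the figure refreshes in place.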
    def plot(self):
        # if (self.calls+1) % self.refreshEach != 0:
        #     self.calls += 1
        #     return

        fig, axs = self.fig, self.axs
        loop, save = self.loop, self.save

        aXn = 0
        df = pd.read_csv(self.datapath, sep='\t')

        xs = df['it']
        y_r = df['R']
        y_rhat = df['Rhat']
        y_rhatCC = df['RhatCC']
        axs[aXn].plot(xs, y_rhat, label=r'$\hat{R}_{Q}$')
        axs[aXn].plot(xs, y_rhatCC, label=r'$\hat{R}_{CC}$')
        axs[aXn].plot(xs, y_r, label='$R$')
        axs[aXn].grid()
        axs[aXn].set_ylabel('Recall')
        axs[aXn].set_ylim(0, 1)
        aXn += 1

        y_r = df['te-prev']
        y_rhat = df['te-estim']
        y_rhatCC = df['te-estimCC']
        axs[aXn].plot(xs, y_rhat, label=r'te-$\hat{Pr}(\oplus)_{Q}$')
        axs[aXn].plot(xs, y_rhatCC, label=r'te-$\hat{Pr}(\oplus)_{CC}$')
        axs[aXn].plot(xs, y_r, label=r'te-$Pr(\oplus)$')
        axs[aXn].legend()
        axs[aXn].grid()
        axs[aXn].set_ylabel('Pool prevalence')
        aXn += 1

        y_ae = df['AE']
        y_ae_cc = df['AE_CC']
        axs[aXn].plot(xs, y_ae, label='AE$_{Q}$')
        axs[aXn].plot(xs, y_ae_cc, label='AE$_{CC}$')
        axs[aXn].legend()
        axs[aXn].grid()
        axs[aXn].set_ylabel('Quantification error')
        aXn += 1

        # classifier performance (not very reliable)
        # axs[aXn].plot(xs, df['MF1_Q'], label='$F_1(clf(Q))$')
        # axs[aXn].plot(xs, df['MF1_Clf'], label='$F_1(clf(CC))$')
        # axs[aXn].legend()
        # axs[aXn].grid()
        # axs[aXn].set_ylabel('Classifiers performance')
        # aXn += 1

        if self.showCost:
            cost = df['tr-size']
            idealcost = df['ICost']
            totalcost = cost + idealcost
            axs[aXn].plot(xs, cost, label='Cost')
            axs[aXn].plot(xs, idealcost, label='IdealCost')
            axs[aXn].plot(xs, totalcost, label='TotalCost')
            axs[aXn].legend()
            axs[aXn].grid()
            axs[aXn].set_ylabel('Cost')
            aXn += 1

        # distribution of posterior probabilities in the pool
        # if self.showYdist:
        #     positive_posteriors = posteriors[y==1, 1]
        #     negative_posteriors = posteriors[y==0, 1]
        #     axs[aXn].hist(negative_posteriors, bins=50, label='negative', density=True, alpha=.75)
        #     axs[aXn].hist(positive_posteriors, bins=50, label='positive', density=True, alpha=.75)
        #     axs[aXn].legend()
        #     axs[aXn].grid()
        #     axs[aXn].set_xlim(0, 1)
        #     axs[aXn].set_ylabel(r'te-$Pr(\oplus)$ distribution')
        #     aXn += 1

        axs[aXn].plot(xs, df['Shift'], '--k', label='shift (AE)')
        axs[aXn].plot(xs, df['tr-prev'], 'y', label=r'tr-$Pr(\oplus)$')
        axs[aXn].plot(xs, df['te-prev'], 'r', label=r'te-$Pr(\oplus)$')
        axs[aXn].legend()
        axs[aXn].grid()
        axs[aXn].set_ylabel('Train-Test Shift')
        aXn += 1

        for i in range(aXn):
            if self.calls == 0:
                # Shrink current axis by 20%
                box = axs[i].get_position()
                axs[i].set_position([box.x0, box.y0, box.width * 0.8, box.height])
                fig.tight_layout()
            # Put a legend to the right of the current axis
            axs[i].legend(loc='center left', bbox_to_anchor=(1, 0.5))

        # if save:
        #     os.makedirs(self.outdir, exist_ok=True)
        #     plt.savefig(f'{self.outdir}/{self.plotname}')

        if loop:
            plt.pause(.5)
            for i in range(aXn):
                axs[i].cla()

        self.calls += 1
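

# InOutDistPlot draws two histograms of posterior probabilities, one for the "in" (training)
# documents and one for the "out" (pool) documents, refreshing them every `refreshEach` calls.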
class InOutDistPlot:

    def __init__(self, refreshEach=1):
        self.refreshEach = refreshEach
        # plot the data
        self.fig, self.axs = plt.subplots(2)
        self.calls = 0

    def _plot_dist(self, posteriors, y, aXn, title):
        positive_posteriors = posteriors[y == 1, 1]
        negative_posteriors = posteriors[y == 0, 1]
        self.axs[aXn].hist(negative_posteriors, bins=50, label=r'$Pr(x|\ominus)$', density=False, alpha=.75)
        self.axs[aXn].hist(positive_posteriors, bins=50, label=r'$Pr(x|\oplus)$', density=False, alpha=.75)
        self.axs[aXn].legend()
        self.axs[aXn].grid()
        self.axs[aXn].set_xlim(0, 1)
        self.axs[aXn].set_ylabel(title)

    def plot(self, in_posteriors, in_y, out_posteriors, out_y):
        if (self.calls+1) % self.refreshEach != 0:
            self.calls += 1
            return

        fig, axs = self.fig, self.axs

        aXn = 0

        # in-posteriors distribution
        self._plot_dist(in_posteriors, in_y, aXn, title='training distribution')
        aXn += 1

        # out-posteriors distribution
        self._plot_dist(out_posteriors, out_y, aXn, title='pool distribution')
        aXn += 1

        for i in range(aXn):
            if self.calls == 0:
                # Shrink current axis by 20%
                box = axs[i].get_position()
                axs[i].set_position([box.x0, box.y0, box.width * 0.8, box.height])
                fig.tight_layout()
            # Put a legend to the right of the current axis
            axs[i].legend(loc='center left', bbox_to_anchor=(1, 0.5))

        plt.pause(.5)
        for i in range(aXn):
            axs[i].cla()

        self.calls += 1
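

# Command-line usage: python plot.py <result_input_path> <dynamic (0|1)>
# e.g. (hypothetical file name): python plot.py dynamic_results.csv 1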
if __name__ == '__main__':
    assert len(sys.argv) == 3, f'wrong args, syntax is: python {sys.argv[0]} <result_input_path> <dynamic (0|1)>'

    file = str(sys.argv[1])
    loop = bool(int(sys.argv[2]))

    # forward the <dynamic> flag: it was parsed above but never passed to the plotter
    figure = eDiscoveryPlot(file, loop=loop)

    try:
        while True:
            figure.plot()
    except KeyboardInterrupt:
        print('\n[stop]')