quantification over time experiment

This commit is contained in:
Alejandro Moreo Fernandez 2025-12-10 19:43:08 +01:00
parent 5c2554861c
commit 1661a79dbb
2 changed files with 302 additions and 1 deletions

@@ -0,0 +1,300 @@
import os
from collections import defaultdict
from typing import List, Dict
import matplotlib.pyplot as plt
import kagglehub
import pandas as pd
from pathlib import Path
import numpy as np
from qunfold import KMM
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR, LogisticRegressionCV
from tqdm import tqdm
import quapy as qp
from data import LabelledCollection, Dataset
import quapy.functional as F
from method.composable import QUnfoldWrapper
from quapy.method.aggregative import DistributionMatchingY, EMQ, KDEyML
from quapy.method.non_aggregative import DistributionMatchingX
from quapy.method.aggregative import CC, ACC, HDy
from transformers import pipeline
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", "{:.4f}".format)
def prepare_xy_date_blocks(df, freq="M"):
"""
df: DataFrame with columns 'text', 'airline_sentiment', 'tweet_created'
freq: frequency of the temporal blocks ('D', 'W', 'M', etc.)
Returns:
X: array of texts
y: np.ndarray of labels
date: np.ndarray with the integer index of the temporal block of each document
idx2date: list with the (start, end) timestamps of each block
"""
df["tweet_created"] = pd.to_datetime(df["tweet_created"], errors="coerce")
df = df.sort_values("tweet_created").reset_index(drop=True)
X = df["text"].astype(str).values
y = df["airline_sentiment"].values
# group dates by requested frequency
date_groups = df["tweet_created"].dt.to_period(freq)
# assign an integer index to each date block
unique_periods = date_groups.unique()
period_to_idx = {p: i for i, p in enumerate(unique_periods)}
date = np.asarray([period_to_idx[p] for p in date_groups])
# get true limits of period intervals
idx2date = []
for p in unique_periods:
start = p.start_time
end = p.end_time
idx2date.append((start, end))
return X, y, date, idx2date
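# Illustrative sketch (not part of the original experiment): with freq="D", two tweets
# posted on 2015-02-16 and one posted on 2015-02-17 would produce date blocks [0, 0, 1],
# since pandas' to_period("D") maps each timestamp to its calendar day; idx2date would
# then hold the limits of each day, e.g.
# (Timestamp('2015-02-16 00:00:00'), Timestamp('2015-02-16 23:59:59.999999999')).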
def prepare_labelled_collections():
# loads and prepares the Twitter US Airlines Sentiment dataset (from Kaggle)
# returns a labelled collection for the training data (days 0 and 1), a list of the
# test sets (days 2 to 8), and the time limits of each test period
# The dataset is originally ternary (negative, neutral, positive); we binarize it by discarding the neutral class
# Download latest version
path = kagglehub.dataset_download("crowdflower/twitter-airline-sentiment")
df = pd.read_csv(Path(path) / 'Tweets.csv')
X, y, date, idx2date = prepare_xy_date_blocks(df, freq="D")
# binarize
keep_idx = (y!='neutral')
X = X[keep_idx]
y = y[keep_idx]
date = date[keep_idx]
y[y != 'negative'] = 1
y[y == 'negative'] = 0
y = y.astype(int)
# use day 0 for training, the rest for test
X_train, y_train = X[date<=1], y[date<=1]
train = LabelledCollection(X_train, y_train)
print(f'training has {len(train)} docs and prevalence={F.strprev(train.prevalence())} classes={train.classes}')
tests = []
test_init = []
for date_i in range(2, max(date)+1):
X_test_i, y_test_i = X[date==date_i], y[date==date_i]
test_i = LabelledCollection(X_test_i, y_test_i, classes=train.classes)
print(f'test-{date_i} has {len(test_i)} docs and prevalence={F.strprev(test_i.prevalence())}')
tests.append(test_i)
test_init.append(idx2date[date_i])
return train, tests, test_init
from scipy.interpolate import CubicSpline
def smooth_curve(dates, values, num_points=300):
"""
dates: list of timestamps
values: list of Y-values
num_points: number of points in the smooth curve
Returns (dates_new, y_new) for plotting a smooth line.
"""
# Convert datetime to numeric (matplotlib float representation)
x = [d.timestamp() for d in dates]
x = np.array(x)
y = np.array(values)
# Create new X-axis with more points
x_new = np.linspace(x.min(), x.max(), num_points)
# Smooth spline
spline = CubicSpline(x, y)
y_new = spline(x_new)
# Convert numeric x_new back to datetime
dates_new = [pd.to_datetime(t, unit='s') for t in x_new]
return dates_new, y_new
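# Usage sketch (hypothetical values): given three daily timestamps and prevalences
# [20.0, 35.0, 25.0], smooth_curve returns 300 evenly spaced timestamps between the
# first and the last date together with the cubic-spline values at those points; the
# spline passes exactly through the observed points but may overshoot in between.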
def plot_prevalences(results_dict, target_class=1, target_label='positive', savepath=None):
"""
Plot prevalence estimates over time for each method contained in results_dict.
Parameters
----------
results_dict : dict
A dictionary where:
- "date-start" : list of datetime-like objects
- all other keys : list of prevalence vectors (arrays), e.g. [p_neg, p_pos]
Only the component indexed by target_class (the positive class by default) is plotted, expressed as a percentage.
"""
dates = results_dict["date-start"]
# Create figure
plt.figure(figsize=(20, 10))
# Plot one line per method (except "date-start")
for method, values in results_dict.items():
if method == "date-start":
continue
# Extract first component from each prevalence array
target_component = [v[target_class]*100 for v in values]
dates_smooth, y_smooth = smooth_curve(dates, target_component)
if method=='true-prev':
line,=plt.plot(dates_smooth, y_smooth, label=method, linewidth=3, linestyle='-')
else:
line,=plt.plot(dates_smooth, y_smooth, label=method, linewidth=2, linestyle='--')
plt.plot(dates, target_component, 'o', markersize=10, color=line.get_color())
# Axis labels
# plt.xlabel("Date")
plt.ylabel("% of "+target_label+" tweets")
# Rotate date labels for readability
plt.xticks(rotation=45)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5)
plt.grid(which='minor', linestyle=':', linewidth=0.3)
# Place the legend outside to the right
plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
plt.tight_layout()
if savepath is not None:
os.makedirs(Path(savepath).parent, exist_ok=True)
plt.savefig(savepath)
else:
plt.show()
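# HDx (DistributionMatchingX) and KMM operate directly on the feature space; the two
# wrappers below first project the (sparse, high-dimensional) representation onto
# 5 dense components with TruncatedSVD before fitting and predicting.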
class HDxDensify(DistributionMatchingX):
def fit(self, X, y):
self.reductor = TruncatedSVD(n_components=5, random_state=0)
Xred = self.reductor.fit_transform(X)
return super().fit(Xred, y)
def predict(self, X):
Xred = self.reductor.transform(X)
return super().predict(Xred)
class QUnfoldWrapperDensify(QUnfoldWrapper):
def fit(self, X, y):
self.reductor = TruncatedSVD(n_components=5, random_state=0)
Xred = self.reductor.fit_transform(X)
return super().fit(Xred, y)
def predict(self, X):
Xred = self.reductor.transform(X)
return super().predict(Xred)
# A scikit-learn-style wrapper for a Hugging Face pre-trained transformer for binary sentiment classification
class HFTextClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'):
self.pipe = pipeline("sentiment-analysis", model=model_name)
self.classes_ = np.asarray([0,1])
def fit(self, X, y=None):
return self
def _binary_decisions(self, transformer_output: List[Dict]):
return np.array([(1 if p['label']=='POSITIVE' else 0) for p in transformer_output], dtype=int)
def predict(self, X):
X = list(map(str, X))
preds = self.pipe(X, truncation=True)
return self._binary_decisions(preds)
def predict_proba(self, X):
X = list(map(str, X))
n_examples = len(X)
preds = self.pipe(X, truncation=True)
decisions = self._binary_decisions(preds)
scores = np.array([p['score'] for p in preds], dtype=float)
probas = np.zeros(shape=(n_examples, 2), dtype=float)
# the pipeline reports the confidence of the predicted label: place it in the
# predicted class column and the complementary probability in the other column
probas[np.arange(n_examples), decisions] = scores
probas[np.arange(n_examples), 1 - decisions] = 1 - scores
return probas
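# Example of the mapping above (hypothetical pipeline output): {'label': 'POSITIVE',
# 'score': 0.9} becomes decision 1 and probability row [0.1, 0.9], while
# {'label': 'NEGATIVE', 'score': 0.8} becomes decision 0 and row [0.8, 0.2].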
# def methods(pre_trained_classifier):
# yield 'CC', CC(pre_trained_classifier, fit_classifier=False)
USE_LOGISTIC_REGRESSION = True
if USE_LOGISTIC_REGRESSION:
new_classifier = lambda:LR()
to_fit = True
else:
pretrained = HFTextClassifier()
new_classifier = lambda:pretrained
to_fit = False
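# Note: with USE_LOGISTIC_REGRESSION=True each quantifier gets (and fits) its own LR
# on the vectorized training data; otherwise all quantifiers share the same pre-trained
# transformer and fit_classifier=False skips re-training it.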
def methods():
yield 'CC', CC(new_classifier(), fit_classifier=to_fit)
yield 'ACC', ACC(new_classifier(), fit_classifier=to_fit)
yield 'HDy', DistributionMatchingY(new_classifier(), fit_classifier=to_fit)
yield 'HDx', HDxDensify()
yield 'KMM', QUnfoldWrapperDensify(KMM())
yield 'SLD', EMQ(new_classifier(), fit_classifier=to_fit)
yield 'KDEy', KDEyML(new_classifier(), fit_classifier=to_fit)
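# CC, ACC, HDy (DistributionMatchingY), SLD (EMQ) and KDEy are aggregative quantifiers
# built on top of the classifier above; HDx and KMM instead match distributions in the
# (SVD-reduced) feature space and require no classifier.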
train, tests, test_init = prepare_labelled_collections()
if USE_LOGISTIC_REGRESSION:
# vectorize text for logistic regression
vectorizer = TfidfVectorizer(min_df=5, sublinear_tf=True)
Xtr = vectorizer.fit_transform(train.X)
train = LabelledCollection(Xtr, train.labels, train.classes_)
for i in range(len(tests)):
Xte = vectorizer.transform(tests[i].X)
tests[i] = LabelledCollection(Xte, tests[i].labels, train.classes_)
results = defaultdict(list)
for test_i, test_init_i in zip(tests, test_init):
results['true-prev'].append(test_i.prevalence())
results['date-start'].append(test_init_i[0])
for q_name, quant in methods():
quant.fit(*train.Xy)
for test_i, test_init_i in tqdm(zip(tests, test_init), desc=f'{q_name} predicting', total=len(tests)):
pred_i = quant.predict(test_i.X)
results[q_name].append(pred_i)
suffix = '_lr' if USE_LOGISTIC_REGRESSION else '_transformer'
plot_prevalences(results, savepath=f'./plots_ieee/over_time{suffix}.pdf')

@@ -93,4 +93,5 @@ ax.set_zlabel('')
ax.grid(False)
plt.tight_layout()
plt.show()
# plt.show()
plt.savefig('plots_ieee/tetrahedron.pdf')