quantification over time experiment
This commit is contained in:
parent 5c2554861c
commit 1661a79dbb
@@ -0,0 +1,300 @@
import os
from collections import defaultdict
from typing import List, Dict

import matplotlib.pyplot as plt
import kagglehub
import pandas as pd
from pathlib import Path
import numpy as np
from qunfold import KMM
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR, LogisticRegressionCV
from tqdm import tqdm
import quapy as qp
from data import LabelledCollection, Dataset
import quapy.functional as F
from method.composable import QUnfoldWrapper
from quapy.method.aggregative import DistributionMatchingY, EMQ, KDEyML
from quapy.method.non_aggregative import DistributionMatchingX
from quapy.method.aggregative import CC, ACC, HDy
from transformers import pipeline


pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", "{:.4f}".format)

def prepare_xy_date_blocks(df, freq="M"):
    """
    df: DataFrame with columns 'text', 'airline_sentiment', 'tweet_created'
    freq: frequency of the temporal blocks ('D', 'W', 'M', etc.)

    Returns:
        X: array of texts
        y: np.ndarray of labels
        date: np.ndarray of integer temporal-block indices, one per document
        idx2date: list with the temporal limits of each block, as (start, end) tuples
    """

    df["tweet_created"] = pd.to_datetime(df["tweet_created"], errors="coerce")
    df = df.sort_values("tweet_created").reset_index(drop=True)

    X = df["text"].astype(str).values
    y = df["airline_sentiment"].values

    # group dates by the requested frequency
    date_groups = df["tweet_created"].dt.to_period(freq)

    # assign an integer index to each date block
    unique_periods = date_groups.unique()
    period_to_idx = {p: i for i, p in enumerate(unique_periods)}

    date = np.asarray([period_to_idx[p] for p in date_groups])

    # get the true limits of each period interval
    idx2date = []
    for p in unique_periods:
        start = p.start_time
        end = p.end_time
        idx2date.append((start, end))

    return X, y, date, idx2date

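# Illustrative usage of prepare_xy_date_blocks (a sketch on toy data; not part of the experiment):
#   _toy = pd.DataFrame({
#       'text': ['good flight', 'bad delay', 'ok service'],
#       'airline_sentiment': ['positive', 'negative', 'neutral'],
#       'tweet_created': ['2015-02-17 10:00', '2015-02-17 12:00', '2015-02-18 09:00'],
#   })
#   _X, _y, _date, _idx2date = prepare_xy_date_blocks(_toy, freq="D")
#   # _date == array([0, 0, 1]); _idx2date[0] and _idx2date[1] delimit Feb 17 and Feb 18, respectively
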
def prepare_labelled_collections():
    # loads and prepares the Twitter US Airlines Sentiment dataset (from Kaggle);
    # returns a labelled collection for the training data (days 0 and 1), a list of
    # test sets (days 2 to 8), and the time limits of each test period.
    # The dataset is originally ternary (negative, neutral, positive), but we binarize it by discarding neutral

    # Download latest version
    path = kagglehub.dataset_download("crowdflower/twitter-airline-sentiment")
    df = pd.read_csv(Path(path) / 'Tweets.csv')
    X, y, date, idx2date = prepare_xy_date_blocks(df, freq="D")

    # binarize: discard neutral, map negative -> 0 and positive -> 1
    keep_idx = (y != 'neutral')
    X = X[keep_idx]
    y = y[keep_idx]
    date = date[keep_idx]
    y[y != 'negative'] = 1
    y[y == 'negative'] = 0
    y = y.astype(int)

    # use days 0 and 1 for training, the rest for test
    X_train, y_train = X[date <= 1], y[date <= 1]
    train = LabelledCollection(X_train, y_train)
    print(f'training has {len(train)} docs and prevalence={F.strprev(train.prevalence())} classes={train.classes}')

    tests = []
    test_init = []
    for date_i in range(2, max(date) + 1):
        X_test_i, y_test_i = X[date == date_i], y[date == date_i]
        test_i = LabelledCollection(X_test_i, y_test_i, classes=train.classes)
        print(f'test-{date_i} has {len(test_i)} docs and prevalence={F.strprev(test_i.prevalence())}')
        tests.append(test_i)
        test_init.append(idx2date[date_i])

    return train, tests, test_init

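# A sketch of the structures returned above (indicative, for orientation only):
#   train     -> LabelledCollection with the tweets of days 0-1 (binary labels 0/1)
#   tests     -> list of LabelledCollection objects, one per remaining day
#   test_init -> list of (start, end) timestamp pairs aligned with tests; only the start is used for plotting
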
from scipy.interpolate import CubicSpline
import numpy as np
import matplotlib.pyplot as plt

def smooth_curve(dates, values, num_points=300):
    """
    dates: list of timestamps
    values: list of Y-values
    num_points: number of points in the smooth curve

    Returns new_x, new_y for plotting a smooth line.
    """
    # Convert datetimes to numeric POSIX timestamps
    x = [d.timestamp() for d in dates]
    x = np.array(x)
    y = np.array(values)

    # Create a new X-axis with more points
    x_new = np.linspace(x.min(), x.max(), num_points)

    # Smooth spline
    spline = CubicSpline(x, y)
    y_new = spline(x_new)

    # Convert numeric x_new back to datetime
    dates_new = [pd.to_datetime(t, unit='s') for t in x_new]

    return dates_new, y_new

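# Illustrative usage of smooth_curve (a sketch; dates and values are hypothetical):
#   _days = pd.to_datetime(['2015-02-18', '2015-02-19', '2015-02-20', '2015-02-21'])
#   _xs, _ys = smooth_curve(list(_days), [40.0, 55.0, 48.0, 52.0], num_points=50)
#   # _xs holds 50 interpolated timestamps; _ys the cubic-spline values passing through the four points
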
def plot_prevalences(results_dict, target_class=1, target_label='positive', savepath=None):
    """
    Plot prevalence estimates over time for each method contained in results_dict.

    Parameters
    ----------
    results_dict : dict
        A dictionary where:
        - "date-start" : list of datetime-like objects
        - all other keys : list of prevalence vectors (arrays), e.g. [p_neg, p_pos]
        Only the component at index target_class (the positive class, by default) is plotted.
    """
    dates = results_dict["date-start"]

    # Create figure
    plt.figure(figsize=(20, 10))

    # Plot one line per method (except "date-start")
    for method, values in results_dict.items():
        if method == "date-start":
            continue

        # Extract the target-class component (as a percentage) from each prevalence array
        target_component = [v[target_class] * 100 for v in values]

        dates_smooth, y_smooth = smooth_curve(dates, target_component)

        if method == 'true-prev':
            line, = plt.plot(dates_smooth, y_smooth, label=method, linewidth=3, linestyle='-')
        else:
            line, = plt.plot(dates_smooth, y_smooth, label=method, linewidth=2, linestyle='--')
        plt.plot(dates, target_component, 'o', markersize=10, color=line.get_color())

    # Axis labels
    # plt.xlabel("Date")
    plt.ylabel("% of " + target_label + " tweets")

    # Rotate date labels for readability
    plt.xticks(rotation=45)

    plt.minorticks_on()
    plt.grid(which='major', linestyle='-', linewidth=0.5)
    plt.grid(which='minor', linestyle=':', linewidth=0.3)

    # Place the legend outside, to the right
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    plt.tight_layout()
    if savepath is not None:
        os.makedirs(Path(savepath).parent, exist_ok=True)
        plt.savefig(savepath)
    else:
        plt.show()

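# A minimal sketch of the structure plot_prevalences expects (hypothetical values; one entry per test block):
#   _results = {
#       'date-start': [pd.Timestamp('2015-02-18'), pd.Timestamp('2015-02-19')],
#       'true-prev':  [np.array([0.7, 0.3]), np.array([0.6, 0.4])],      # [p_neg, p_pos]
#       'CC':         [np.array([0.65, 0.35]), np.array([0.62, 0.38])],
#   }
#   plot_prevalences(_results)   # plots the positive-class component of each method over time
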
class HDxDensify(DistributionMatchingX):
    # applies HDx after projecting the (high-dimensional, sparse) features onto a dense low-dimensional space
    def fit(self, X, y):
        self.reductor = TruncatedSVD(n_components=5, random_state=0)
        Xred = self.reductor.fit_transform(X)
        return super().fit(Xred, y)

    def predict(self, X):
        Xred = self.reductor.transform(X)
        return super().predict(Xred)


class QUnfoldWrapperDensify(QUnfoldWrapper):
    # same dimensionality-reduction trick for the qunfold-based method (KMM)
    def fit(self, X, y):
        self.reductor = TruncatedSVD(n_components=5, random_state=0)
        Xred = self.reductor.fit_transform(X)
        return super().fit(Xred, y)

    def predict(self, X):
        Xred = self.reductor.transform(X)
        return super().predict(Xred)

# A scikit-learn style wrapper for a huggingface-based pre-trained transformer for binary sentiment classification
class HFTextClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'):
        self.pipe = pipeline("sentiment-analysis", model=model_name)
        self.classes_ = np.asarray([0, 1])

    def fit(self, X, y=None):
        # the transformer is already fine-tuned; nothing to do
        return self

    def _binary_decisions(self, transformer_output: List[Dict]):
        return np.array([(1 if p['label'] == 'POSITIVE' else 0) for p in transformer_output], dtype=int)

    def predict(self, X):
        X = list(map(str, X))
        preds = self.pipe(X, truncation=True)
        return self._binary_decisions(preds)

    def predict_proba(self, X):
        X = list(map(str, X))
        n_examples = len(X)
        preds = self.pipe(X, truncation=True)
        decisions = self._binary_decisions(preds)
        scores = np.array([p['score'] for p in preds], dtype=float)
        probas = np.zeros(shape=(n_examples, 2), dtype=float)
        # the pipeline score refers to the predicted label; the complementary column gets 1-score
        probas[np.arange(n_examples), decisions] = scores
        probas[np.arange(n_examples), 1 - decisions] = 1 - scores
        return probas

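# Illustrative usage of HFTextClassifier (a sketch; downloads the HF model on first use):
#   _clf = HFTextClassifier()
#   _clf.predict(['great flight!', 'lost my luggage'])    # -> e.g. array([1, 0])
#   _clf.predict_proba(['great flight!']).shape           # -> (1, 2), columns ordered as classes_ = [0, 1]
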
# def methods(pre_trained_classifier):
#     yield 'CC', CC(pre_trained_classifier, fit_classifier=False)

# choose the classifier underlying the aggregative quantifiers: either a logistic regressor
# trained on tf-idf features, or the pre-trained (frozen) transformer
USE_LOGISTIC_REGRESSION = True

if USE_LOGISTIC_REGRESSION:
    new_classifier = lambda: LR()
    to_fit = True
else:
    pretrained = HFTextClassifier()
    new_classifier = lambda: pretrained
    to_fit = False

def methods():
    yield 'CC', CC(new_classifier(), fit_classifier=to_fit)
    yield 'ACC', ACC(new_classifier(), fit_classifier=to_fit)
    yield 'HDy', DistributionMatchingY(new_classifier(), fit_classifier=to_fit)
    yield 'HDx', HDxDensify()
    yield 'KMM', QUnfoldWrapperDensify(KMM())
    yield 'SLD', EMQ(new_classifier(), fit_classifier=to_fit)
    yield 'KDEy', KDEyML(new_classifier(), fit_classifier=to_fit)

train, tests, test_init = prepare_labelled_collections()

if USE_LOGISTIC_REGRESSION:
    # vectorize text for logistic regression
    vectorizer = TfidfVectorizer(min_df=5, sublinear_tf=True)
    Xtr = vectorizer.fit_transform(train.X)
    train = LabelledCollection(Xtr, train.labels, train.classes_)
    for i in range(len(tests)):
        Xte = vectorizer.transform(tests[i].X)
        tests[i] = LabelledCollection(Xte, tests[i].labels, train.classes_)

results = defaultdict(list)

# record the true prevalence and the starting date of each test block
for test_i, test_init_i in zip(tests, test_init):
    results['true-prev'].append(test_i.prevalence())
    results['date-start'].append(test_init_i[0])

# fit each quantifier on the training block and estimate the prevalence of every test block
for q_name, quant in methods():
    quant.fit(*train.Xy)
    for test_i, test_init_i in tqdm(zip(tests, test_init), desc=f'{q_name} predicting', total=len(tests)):
        pred_i = quant.predict(test_i.X)
        results[q_name].append(pred_i)

suffix = '_lr' if USE_LOGISTIC_REGRESSION else '_transformer'
plot_prevalences(results, savepath=f'./plots_ieee/over_time{suffix}.pdf')

@@ -93,4 +93,5 @@ ax.set_zlabel('')
 ax.grid(False)

 plt.tight_layout()
-plt.show()
+# plt.show()
+plt.savefig('plots_ieee/tetrahedron.pdf')