diff --git a/IEEEProc2025_plots/over_time_experiment.py b/IEEEProc2025_plots/over_time_experiment.py
new file mode 100644
index 0000000..3aaca28
--- /dev/null
+++ b/IEEEProc2025_plots/over_time_experiment.py
@@ -0,0 +1,300 @@
+import os
+from collections import defaultdict
+from typing import List, Dict
+
+import matplotlib.pyplot as plt
+import kagglehub
+import pandas as pd
+from pathlib import Path
+import numpy as np
+from qunfold import KMM
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.decomposition import TruncatedSVD
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.linear_model import LogisticRegression as LR, LogisticRegressionCV
+from tqdm import tqdm
+import quapy as qp
+from data import LabelledCollection, Dataset
+import quapy.functional as F
+from method.composable import QUnfoldWrapper
+from quapy.method.aggregative import DistributionMatchingY, EMQ, KDEyML
+from quapy.method.non_aggregative import DistributionMatchingX
+from quapy.method.aggregative import CC, ACC, HDy
+from transformers import pipeline
+
+
+pd.set_option('display.max_columns', None)
+pd.set_option('display.width', 2000)
+pd.set_option('display.max_rows', None)
+pd.set_option("display.expand_frame_repr", False)
+pd.set_option("display.precision", 4)
+pd.set_option("display.float_format", "{:.4f}".format)
+
+
+def prepare_xy_date_blocks(df, freq="M"):
+    """
+    df: DataFrame with columns 'text', 'airline_sentiment', 'tweet_created'
+    freq: frequency of the temporal blocks ('D', 'W', 'M', etc.)
+
+    Returns:
+        X: array of texts
+        y: np.ndarray of labels
+        date: np.ndarray of integer indices assigning each document to a temporal block
+        idx2date: list with the temporal limits of each block (tuples)
+    """
+
+    df["tweet_created"] = pd.to_datetime(df["tweet_created"], errors="coerce")
+    df = df.sort_values("tweet_created").reset_index(drop=True)
+
+    X = df["text"].astype(str).values
+    y = df["airline_sentiment"].values
+
+    # group dates by requested frequency
+    date_groups = df["tweet_created"].dt.to_period(freq)
+
+    # assign an integer index to each date block
+    unique_periods = date_groups.unique()
+    period_to_idx = {p: i for i, p in enumerate(unique_periods)}
+
+    date = np.asarray([period_to_idx[p] for p in date_groups])
+
+    # get the true limits of the period intervals
+    idx2date = []
+    for p in unique_periods:
+        start = p.start_time
+        end = p.end_time
+        idx2date.append((start, end))
+
+    return X, y, date, idx2date
+
+
+def prepare_labelled_collections():
+    # loads and prepares the Twitter US Airline Sentiment dataset (from Kaggle);
+    # returns a labelled collection for the training data (days 0 and 1), a list of the
+    # test sets (days 2 to 8), and the time limits of each test period.
+    # The dataset is originally ternary (negative, neutral, positive), but we binarize it by discarding neutral.
+
+    # Download latest version
+    path = kagglehub.dataset_download("crowdflower/twitter-airline-sentiment")
+    df = pd.read_csv(Path(path) / 'Tweets.csv')
+    X, y, date, idx2date = prepare_xy_date_blocks(df, freq="D")
+
+    # binarize
+
+    keep_idx = (y!='neutral')
+    X = X[keep_idx]
+    y = y[keep_idx]
+    date = date[keep_idx]
+    y[y != 'negative'] = 1
+    y[y == 'negative'] = 0
+    y = y.astype(int)
+
+    # use days 0 and 1 for training, the rest for test
+    X_train, y_train = X[date<=1], y[date<=1]
+    train = LabelledCollection(X_train, y_train)
+    print(f'training has {len(train)} docs and prevalence={F.strprev(train.prevalence())} classes={train.classes}')
+
+    tests = []
+    test_init = []
+    for date_i in range(2, max(date)+1):
+        X_test_i, y_test_i = X[date==date_i], y[date==date_i]
+        test_i = LabelledCollection(X_test_i, y_test_i, classes=train.classes)
+        print(f'test-{date_i} has {len(test_i)} docs and prevalence={F.strprev(test_i.prevalence())}')
+        tests.append(test_i)
+        test_init.append(idx2date[date_i])
+
+    return train, tests, test_init
+
+
+from scipy.interpolate import CubicSpline
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def smooth_curve(dates, values, num_points=300):
+    """
+    dates: list of timestamps
+    values: list of Y-values
+    num_points: number of points in the smooth curve
+
+    Returns dates_new, y_new for plotting a smooth line.
+    """
+    # Convert datetime to numeric (POSIX timestamps, in seconds)
+    x = [d.timestamp() for d in dates]
+    x = np.array(x)
+    y = np.array(values)
+
+    # Create new X-axis with more points
+    x_new = np.linspace(x.min(), x.max(), num_points)
+
+    # Smooth spline
+    spline = CubicSpline(x, y)
+    y_new = spline(x_new)
+
+    # Convert numeric x_new back to datetime
+    dates_new = [pd.to_datetime(t, unit='s') for t in x_new]
+
+    return dates_new, y_new
+
+
+def plot_prevalences(results_dict, target_class=1, target_label='positive', savepath=None):
+    """
+    Plot prevalence estimates over time for each method contained in results_dict.
+
+    Parameters
+    ----------
+    results_dict : dict
+        A dictionary where:
+        - "date-start" : list of datetime-like objects
+        - all other keys : list of prevalence vectors (arrays), e.g. [p_neg, p_pos];
+          only the component indexed by target_class is plotted.
+    target_class : int
+        Index of the class whose prevalence is plotted (default 1, the positive class).
+    target_label : str
+        Name of the target class, used in the Y-axis label.
+    savepath : str or None
+        If given, the figure is saved to this path; otherwise it is shown on screen.
+    """
+    dates = results_dict["date-start"]
+
+    # Create figure
+    plt.figure(figsize=(20, 10))
+
+    # Plot one line per method (except "date-start")
+    for method, values in results_dict.items():
+        if method == "date-start":
+            continue
+
+        # Extract the target-class component from each prevalence vector
+        target_component = [v[target_class]*100 for v in values]
+
+        dates_smooth, y_smooth = smooth_curve(dates, target_component)
+
+        if method=='true-prev':
+            line, = plt.plot(dates_smooth, y_smooth, label=method, linewidth=3, linestyle='-')
+        else:
+            line, = plt.plot(dates_smooth, y_smooth, label=method, linewidth=2, linestyle='--')
+        plt.plot(dates, target_component, 'o', markersize=10, color=line.get_color())
+
+    # Axis labels
+    # plt.xlabel("Date")
+    plt.ylabel("% of "+target_label+" tweets")
+
+    # Rotate date labels for readability
+    plt.xticks(rotation=45)
+
+    plt.minorticks_on()
+    plt.grid(which='major', linestyle='-', linewidth=0.5)
+    plt.grid(which='minor', linestyle=':', linewidth=0.3)
+
+    # Place the legend outside to the right
+    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
+
+    plt.tight_layout()
+    if savepath is not None:
+        os.makedirs(Path(savepath).parent, exist_ok=True)
+        plt.savefig(savepath)
+    else:
+        plt.show()
+
+
+class HDxDensify(DistributionMatchingX):
+    def fit(self, X, y):
+        self.reductor = TruncatedSVD(n_components=5, random_state=0)
+        Xred = self.reductor.fit_transform(X)
+        return super().fit(Xred, y)
+
+    def predict(self, X):
+        Xred = self.reductor.transform(X)
+        return super().predict(Xred)
+
+
+class QUnfoldWrapperDensify(QUnfoldWrapper):
+    def fit(self, X, y):
+        self.reductor = TruncatedSVD(n_components=5, random_state=0)
+        Xred = self.reductor.fit_transform(X)
+        return super().fit(Xred, y)
+
+    def predict(self, X):
+        Xred = self.reductor.transform(X)
+        return super().predict(Xred)
+
+
+# A scikit-learn-style wrapper around a HuggingFace pre-trained transformer for binary sentiment classification
+class HFTextClassifier(BaseEstimator, ClassifierMixin):
+    def __init__(self,
+                 model_name='distilbert-base-uncased-finetuned-sst-2-english'):
+        self.pipe = pipeline("sentiment-analysis", model=model_name)
+        self.classes_ = np.asarray([0, 1])
+
+    def fit(self, X, y=None):
+        return self
+
+    def _binary_decisions(self, transformer_output: List[Dict]):
+        return np.array([(1 if p['label']=='POSITIVE' else 0) for p in transformer_output], dtype=int)
+
+    def predict(self, X):
+        X = list(map(str, X))
+        preds = self.pipe(X, truncation=True)
+        return self._binary_decisions(preds)
+
+    def predict_proba(self, X):
+        X = list(map(str, X))
+        n_examples = len(X)
+        preds = self.pipe(X, truncation=True)
+        decisions = self._binary_decisions(preds)
+        scores = np.array([p['score'] for p in preds], dtype=float)
+        probas = np.zeros(shape=(n_examples, 2), dtype=float)
+        # the pipeline score refers to the predicted class; the complementary column gets the remaining mass
+        probas[np.arange(n_examples), decisions] = scores
+        probas[np.arange(n_examples), 1-decisions] = 1-scores
+        return probas
+
+# def methods(pre_trained_classifier):
+#     yield 'CC', CC(pre_trained_classifier, fit_classifier=False)
+
+USE_LOGISTIC_REGRESSION = True
+
+if USE_LOGISTIC_REGRESSION:
+    new_classifier = lambda: LR()
+    to_fit = True
+else:
+    pretrained = HFTextClassifier()
+    new_classifier = lambda: pretrained
+    to_fit = False
+
+
+def methods():
+    yield 'CC', CC(new_classifier(), fit_classifier=to_fit)
+    yield 'ACC', ACC(new_classifier(), fit_classifier=to_fit)
+    yield 'HDy', DistributionMatchingY(new_classifier(), fit_classifier=to_fit)
+    yield 'HDx', HDxDensify()
+    yield 'KMM', QUnfoldWrapperDensify(KMM())
+    yield 'SLD', EMQ(new_classifier(), fit_classifier=to_fit)
+    yield 'KDEy', KDEyML(new_classifier(), fit_classifier=to_fit)
+
+
+train, tests, test_init = prepare_labelled_collections()
+
+if USE_LOGISTIC_REGRESSION:
+    # vectorize text for logistic regression
+    vectorizer = TfidfVectorizer(min_df=5, sublinear_tf=True)
+    Xtr = vectorizer.fit_transform(train.X)
+    train = LabelledCollection(Xtr, train.labels, train.classes_)
+    for i in range(len(tests)):
+        Xte = vectorizer.transform(tests[i].X)
+        tests[i] = LabelledCollection(Xte, tests[i].labels, train.classes_)
+
+
+results = defaultdict(list)
+for test_i, test_init_i in zip(tests, test_init):
+    results['true-prev'].append(test_i.prevalence())
+    results['date-start'].append(test_init_i[0])
+
+for q_name, quant in methods():
+    quant.fit(*train.Xy)
+    for test_i, test_init_i in tqdm(zip(tests, test_init), desc=f'{q_name} predicting', total=len(tests)):
+        pred_i = quant.predict(test_i.X)
+        results[q_name].append(pred_i)
+
+suffix = '_lr' if USE_LOGISTIC_REGRESSION else '_transformer'
+plot_prevalences(results, savepath=f'./plots_ieee/over_time{suffix}.pdf')
diff --git a/IEEEProc2025_plots/uniform_sampling_simplex.py b/IEEEProc2025_plots/uniform_sampling_simplex.py
index 0823439..d26a959 100644
--- a/IEEEProc2025_plots/uniform_sampling_simplex.py
+++ b/IEEEProc2025_plots/uniform_sampling_simplex.py
@@ -93,4 +93,5 @@
 ax.set_zlabel('')
 ax.grid(False)
 plt.tight_layout()
-plt.show()
+# plt.show()
+plt.savefig('plots_ieee/tetrahedron.pdf')
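
Side note on the probability assembly in HFTextClassifier.predict_proba above: the sentiment pipeline only returns the score of the predicted label, so the two-column posterior matrix is filled by scattering that score into the predicted-class column and the complement into the other column. A minimal NumPy sketch of the same assignment, using toy (hypothetical) decisions and scores:

import numpy as np

# hypothetical outputs: predicted class per example (0/1) and the score of that predicted class
decisions = np.array([1, 0, 1])
scores = np.array([0.9, 0.7, 0.6])

rows = np.arange(len(decisions))
probas = np.zeros((len(decisions), 2))
probas[rows, decisions] = scores          # column of the predicted class
probas[rows, 1 - decisions] = 1 - scores  # complementary column
print(probas)
# [[0.1 0.9]
#  [0.7 0.3]
#  [0.4 0.6]]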