quantification over time experiment

This commit is contained in:
Alejandro Moreo Fernandez 2025-12-10 19:43:08 +01:00
parent 5c2554861c
commit 1661a79dbb
2 changed files with 302 additions and 1 deletions

@@ -0,0 +1,300 @@
import os
from collections import defaultdict
from typing import List, Dict
import matplotlib.pyplot as plt
import kagglehub
import pandas as pd
from pathlib import Path
import numpy as np
from qunfold import KMM
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR, LogisticRegressionCV
from tqdm import tqdm
import quapy as qp
from data import LabelledCollection, Dataset
import quapy.functional as F
from method.composable import QUnfoldWrapper
from quapy.method.aggregative import DistributionMatchingY, EMQ, KDEyML
from quapy.method.non_aggregative import DistributionMatchingX
from quapy.method.aggregative import CC, ACC, HDy
from transformers import pipeline
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", "{:.4f}".format)
def prepare_xy_date_blocks(df, freq="M"):
"""
df: DataFrame with columns 'text', 'airline_sentiment', 'tweet_created'
freq: frequency of the temporal blocks ('D', 'W', 'M', etc.)
Returns:
X: array of texts
y: np.ndarray of labels
date: np.ndarray with the integer index of the temporal block of each document
idx2date: list with the (start, end) timestamps of each block
"""
df["tweet_created"] = pd.to_datetime(df["tweet_created"], errors="coerce")
df = df.sort_values("tweet_created").reset_index(drop=True)
X = df["text"].astype(str).values
y = df["airline_sentiment"].values
# group dates by requested frequency
date_groups = df["tweet_created"].dt.to_period(freq)
# assign an integer index to each date block
unique_periods = date_groups.unique()
period_to_idx = {p: i for i, p in enumerate(unique_periods)}
date = np.asarray([period_to_idx[p] for p in date_groups])
# get true limits of period intervals
idx2date = []
for p in unique_periods:
start = p.start_time
end = p.end_time
idx2date.append((start, end))
return X, y, date, idx2date
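# Illustrative sketch (not part of the original experiment): with freq="D", two tweets
# posted on 2015-02-16 and one posted on 2015-02-17 would produce date blocks [0, 0, 1],
# since pandas' to_period("D") maps each timestamp to its calendar day; idx2date would
# then hold the limits of each day, e.g.
# (Timestamp('2015-02-16 00:00:00'), Timestamp('2015-02-16 23:59:59.999999999')).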
def prepare_labelled_collections():
# loads and prepares the Twitter US Airlines Sentiment dataset (from Kaggle)
# returns a labelled collection for the training data (days 0 and 1), a list of the
# test sets (days 2 to 8), and the time limits of each test period
# The dataset is originally ternary (negative, neutral, positive); we binarize it by discarding the neutral class
# Download latest version
path = kagglehub.dataset_download("crowdflower/twitter-airline-sentiment")
df = pd.read_csv(Path(path) / 'Tweets.csv')
X, y, date, idx2date = prepare_xy_date_blocks(df, freq="D")
# binarize
keep_idx = (y!='neutral')
X = X[keep_idx]
y = y[keep_idx]
date = date[keep_idx]
y[y != 'negative'] = 1
y[y == 'negative'] = 0
y = y.astype(int)
# use day 0 for training, the rest for test
X_train, y_train = X[date<=1], y[date<=1]
train = LabelledCollection(X_train, y_train)
print(f'training has {len(train)} docs and prevalence={F.strprev(train.prevalence())} classes={train.classes}')
tests = []
test_init = []
for date_i in range(2, max(date)+1):
X_test_i, y_test_i = X[date==date_i], y[date==date_i]
test_i = LabelledCollection(X_test_i, y_test_i, classes=train.classes)
print(f'test-{date_i} has {len(test_i)} docs and prevalence={F.strprev(test_i.prevalence())}')
tests.append(test_i)
test_init.append(idx2date[date_i])
return train, tests, test_init
from scipy.interpolate import CubicSpline
def smooth_curve(dates, values, num_points=300):
"""
dates: list of timestamps
values: list of Y-values
num_points: number of points in the smooth curve
Returns (dates_new, y_new) for plotting a smooth line.
"""
# Convert datetime to numeric (matplotlib float representation)
x = [d.timestamp() for d in dates]
x = np.array(x)
y = np.array(values)
# Create new X-axis with more points
x_new = np.linspace(x.min(), x.max(), num_points)
# Smooth spline
spline = CubicSpline(x, y)
y_new = spline(x_new)
# Convert numeric x_new back to datetime
dates_new = [pd.to_datetime(t, unit='s') for t in x_new]
return dates_new, y_new
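# Usage sketch (hypothetical values): given three daily timestamps and prevalences
# [20.0, 35.0, 25.0], smooth_curve returns 300 evenly spaced timestamps between the
# first and the last date together with the cubic-spline values at those points; the
# spline passes exactly through the observed points but may overshoot in between.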
def plot_prevalences(results_dict, target_class=1, target_label='positive', savepath=None):
"""
Plot prevalence estimates over time for each method contained in results_dict.
Parameters
----------
results_dict : dict
A dictionary where:
- "date-start" : list of datetime-like objects
- all other keys : list of prevalence vectors (arrays), e.g. [p_neg, p_pos]
Only the component indexed by target_class (the positive class by default) is plotted, expressed as a percentage.
"""
dates = results_dict["date-start"]
# Create figure
plt.figure(figsize=(20, 10))
# Plot one line per method (except "date-start")
for method, values in results_dict.items():
if method == "date-start":
continue
# Extract first component from each prevalence array
target_component = [v[target_class]*100 for v in values]
dates_smooth, y_smooth = smooth_curve(dates, target_component)
if method=='true-prev':
line,=plt.plot(dates_smooth, y_smooth, label=method, linewidth=3, linestyle='-')
else:
line,=plt.plot(dates_smooth, y_smooth, label=method, linewidth=2, linestyle='--')
plt.plot(dates, target_component, 'o', markersize=10, color=line.get_color())
# Axis labels
# plt.xlabel("Date")
plt.ylabel("% of "+target_label+" tweets")
# Rotate date labels for readability
plt.xticks(rotation=45)
plt.minorticks_on()
plt.grid(which='major', linestyle='-', linewidth=0.5)
plt.grid(which='minor', linestyle=':', linewidth=0.3)
# Place the legend outside to the right
plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
plt.tight_layout()
if savepath is not None:
os.makedirs(Path(savepath).parent, exist_ok=True)
plt.savefig(savepath)
else:
plt.show()
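# HDx (DistributionMatchingX) and KMM operate directly on the feature space; the two
# wrappers below first project the (sparse, high-dimensional) representation onto
# 5 dense components with TruncatedSVD before fitting and predicting.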
class HDxDensify(DistributionMatchingX):
def fit(self, X, y):
self.reductor = TruncatedSVD(n_components=5, random_state=0)
Xred = self.reductor.fit_transform(X)
return super().fit(Xred, y)
def predict(self, X):
Xred = self.reductor.transform(X)
return super().predict(Xred)
class QUnfoldWrapperDensify(QUnfoldWrapper):
def fit(self, X, y):
self.reductor = TruncatedSVD(n_components=5, random_state=0)
Xred = self.reductor.fit_transform(X)
return super().fit(Xred, y)
def predict(self, X):
Xred = self.reductor.transform(X)
return super().predict(Xred)
# A scikit-learn-style wrapper for a Hugging Face pre-trained transformer for binary sentiment classification
class HFTextClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'):
self.pipe = pipeline("sentiment-analysis", model=model_name)
self.classes_ = np.asarray([0,1])
def fit(self, X, y=None):
return self
def _binary_decisions(self, transformer_output: List[Dict]):
return np.array([(1 if p['label']=='POSITIVE' else 0) for p in transformer_output], dtype=int)
def predict(self, X):
X = list(map(str, X))
preds = self.pipe(X, truncation=True)
return self._binary_decisions(preds)
def predict_proba(self, X):
X = list(map(str, X))
n_examples = len(X)
preds = self.pipe(X, truncation=True)
decisions = self._binary_decisions(preds)
scores = np.array([p['score'] for p in preds], dtype=float)
probas = np.zeros(shape=(n_examples, 2), dtype=float)
# the pipeline reports the confidence of the predicted label: place it in the
# predicted class column and the complementary probability in the other column
probas[np.arange(n_examples), decisions] = scores
probas[np.arange(n_examples), 1 - decisions] = 1 - scores
return probas
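# Example of the mapping above (hypothetical pipeline output): {'label': 'POSITIVE',
# 'score': 0.9} becomes decision 1 and probability row [0.1, 0.9], while
# {'label': 'NEGATIVE', 'score': 0.8} becomes decision 0 and row [0.8, 0.2].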
# def methods(pre_trained_classifier):
# yield 'CC', CC(pre_trained_classifier, fit_classifier=False)
USE_LOGISTIC_REGRESSION = True
if USE_LOGISTIC_REGRESSION:
new_classifier = lambda:LR()
to_fit = True
else:
pretrained = HFTextClassifier()
new_classifier = lambda:pretrained
to_fit = False
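# Note: with USE_LOGISTIC_REGRESSION=True each quantifier gets (and fits) its own LR
# on the vectorized training data; otherwise all quantifiers share the same pre-trained
# transformer and fit_classifier=False skips re-training it.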
def methods():
yield 'CC', CC(new_classifier(), fit_classifier=to_fit)
yield 'ACC', ACC(new_classifier(), fit_classifier=to_fit)
yield 'HDy', DistributionMatchingY(new_classifier(), fit_classifier=to_fit)
yield 'HDx', HDxDensify()
yield 'KMM', QUnfoldWrapperDensify(KMM())
yield 'SLD', EMQ(new_classifier(), fit_classifier=to_fit)
yield 'KDEy', KDEyML(new_classifier(), fit_classifier=to_fit)
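# CC, ACC, HDy (DistributionMatchingY), SLD (EMQ) and KDEy are aggregative quantifiers
# built on top of the classifier above; HDx and KMM instead match distributions in the
# (SVD-reduced) feature space and require no classifier.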
train, tests, test_init = prepare_labelled_collections()
if USE_LOGISTIC_REGRESSION:
# vectorize text for logistic regression
vectorizer = TfidfVectorizer(min_df=5, sublinear_tf=True)
Xtr = vectorizer.fit_transform(train.X)
train = LabelledCollection(Xtr, train.labels, train.classes_)
for i in range(len(tests)):
Xte = vectorizer.transform(tests[i].X)
tests[i] = LabelledCollection(Xte, tests[i].labels, train.classes_)
results = defaultdict(list)
for test_i, test_init_i in zip(tests, test_init):
results['true-prev'].append(test_i.prevalence())
results['date-start'].append(test_init_i[0])
for q_name, quant in methods():
quant.fit(*train.Xy)
for test_i, test_init_i in tqdm(zip(tests, test_init), desc=f'{q_name} predicting', total=len(tests)):
pred_i = quant.predict(test_i.X)
results[q_name].append(pred_i)
suffix = '_lr' if USE_LOGISTIC_REGRESSION else '_transformer'
plot_prevalences(results, savepath=f'./plots_ieee/over_time{suffix}.pdf')

@@ -93,4 +93,5 @@ ax.set_zlabel('')
ax.grid(False)
plt.tight_layout()
plt.show()
# plt.show()
plt.savefig('plots_ieee/tetrahedron.pdf')