quantification over time experiment
This commit is contained in:
parent 5c2554861c
commit 1661a79dbb
@@ -0,0 +1,300 @@
import os
from collections import defaultdict
from typing import List, Dict

import matplotlib.pyplot as plt
import kagglehub
import pandas as pd
from pathlib import Path
import numpy as np
from qunfold import KMM
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression as LR, LogisticRegressionCV
from tqdm import tqdm
import quapy as qp
from data import LabelledCollection, Dataset
import quapy.functional as F
from method.composable import QUnfoldWrapper
from quapy.method.aggregative import DistributionMatchingY, EMQ, KDEyML
from quapy.method.non_aggregative import DistributionMatchingX
from quapy.method.aggregative import CC, ACC, HDy
from transformers import pipeline


pd.set_option('display.max_columns', None)
pd.set_option('display.width', 2000)
pd.set_option('display.max_rows', None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.precision", 4)
pd.set_option("display.float_format", "{:.4f}".format)

def prepare_xy_date_blocks(df, freq="M"):
    """
    df: DataFrame with columns 'text', 'airline_sentiment', 'tweet_created'
    freq: frequency of the temporal blocks ('D', 'W', 'M', etc.)

    Returns:
        X: array of texts
        y: np.ndarray of labels
        date: np.ndarray of integer temporal-block indices, one per document
        idx2date: list with the temporal limits of each block, as (start, end) tuples
    """

    df["tweet_created"] = pd.to_datetime(df["tweet_created"], errors="coerce")
    df = df.sort_values("tweet_created").reset_index(drop=True)

    X = df["text"].astype(str).values
    y = df["airline_sentiment"].values

    # group dates by the requested frequency
    date_groups = df["tweet_created"].dt.to_period(freq)

    # assign an integer index to each date block
    unique_periods = date_groups.unique()
    period_to_idx = {p: i for i, p in enumerate(unique_periods)}

    date = np.asarray([period_to_idx[p] for p in date_groups])

    # get the true limits of each period interval
    idx2date = []
    for p in unique_periods:
        start = p.start_time
        end = p.end_time
        idx2date.append((start, end))

    return X, y, date, idx2date

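# Illustrative usage of prepare_xy_date_blocks (a sketch on toy data; not part of the experiment):
#   _toy = pd.DataFrame({
#       'text': ['good flight', 'bad delay', 'ok service'],
#       'airline_sentiment': ['positive', 'negative', 'neutral'],
#       'tweet_created': ['2015-02-17 10:00', '2015-02-17 12:00', '2015-02-18 09:00'],
#   })
#   _X, _y, _date, _idx2date = prepare_xy_date_blocks(_toy, freq="D")
#   # _date == array([0, 0, 1]); _idx2date[0] and _idx2date[1] delimit Feb 17 and Feb 18, respectively
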
def prepare_labelled_collections():
    # loads and prepares the Twitter US Airlines Sentiment dataset (from Kaggle);
    # returns a labelled collection for the training data (days 0 and 1), a list of
    # test sets (days 2 to 8), and the time limits of each test period.
    # The dataset is originally ternary (negative, neutral, positive), but we binarize it by discarding neutral

    # Download latest version
    path = kagglehub.dataset_download("crowdflower/twitter-airline-sentiment")
    df = pd.read_csv(Path(path) / 'Tweets.csv')
    X, y, date, idx2date = prepare_xy_date_blocks(df, freq="D")

    # binarize: discard neutral, map negative -> 0 and positive -> 1
    keep_idx = (y != 'neutral')
    X = X[keep_idx]
    y = y[keep_idx]
    date = date[keep_idx]
    y[y != 'negative'] = 1
    y[y == 'negative'] = 0
    y = y.astype(int)

    # use days 0 and 1 for training, the rest for test
    X_train, y_train = X[date <= 1], y[date <= 1]
    train = LabelledCollection(X_train, y_train)
    print(f'training has {len(train)} docs and prevalence={F.strprev(train.prevalence())} classes={train.classes}')

    tests = []
    test_init = []
    for date_i in range(2, max(date) + 1):
        X_test_i, y_test_i = X[date == date_i], y[date == date_i]
        test_i = LabelledCollection(X_test_i, y_test_i, classes=train.classes)
        print(f'test-{date_i} has {len(test_i)} docs and prevalence={F.strprev(test_i.prevalence())}')
        tests.append(test_i)
        test_init.append(idx2date[date_i])

    return train, tests, test_init

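# A sketch of the structures returned above (indicative, for orientation only):
#   train     -> LabelledCollection with the tweets of days 0-1 (binary labels 0/1)
#   tests     -> list of LabelledCollection objects, one per remaining day
#   test_init -> list of (start, end) timestamp pairs aligned with tests; only the start is used for plotting
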
from scipy.interpolate import CubicSpline
import numpy as np
import matplotlib.pyplot as plt

def smooth_curve(dates, values, num_points=300):
    """
    dates: list of timestamps
    values: list of Y-values
    num_points: number of points in the smooth curve

    Returns new_x, new_y for plotting a smooth line.
    """
    # Convert datetimes to numeric POSIX timestamps
    x = [d.timestamp() for d in dates]
    x = np.array(x)
    y = np.array(values)

    # Create a new X-axis with more points
    x_new = np.linspace(x.min(), x.max(), num_points)

    # Smooth spline
    spline = CubicSpline(x, y)
    y_new = spline(x_new)

    # Convert numeric x_new back to datetime
    dates_new = [pd.to_datetime(t, unit='s') for t in x_new]

    return dates_new, y_new

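# Illustrative usage of smooth_curve (a sketch; dates and values are hypothetical):
#   _days = pd.to_datetime(['2015-02-18', '2015-02-19', '2015-02-20', '2015-02-21'])
#   _xs, _ys = smooth_curve(list(_days), [40.0, 55.0, 48.0, 52.0], num_points=50)
#   # _xs holds 50 interpolated timestamps; _ys the cubic-spline values passing through the four points
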
def plot_prevalences(results_dict, target_class=1, target_label='positive', savepath=None):
    """
    Plot prevalence estimates over time for each method contained in results_dict.

    Parameters
    ----------
    results_dict : dict
        A dictionary where:
        - "date-start" : list of datetime-like objects
        - all other keys : list of prevalence vectors (arrays), e.g. [p_neg, p_pos]
        Only the component at index target_class (the positive class, by default) is plotted.
    """
    dates = results_dict["date-start"]

    # Create figure
    plt.figure(figsize=(20, 10))

    # Plot one line per method (except "date-start")
    for method, values in results_dict.items():
        if method == "date-start":
            continue

        # Extract the target-class component (as a percentage) from each prevalence array
        target_component = [v[target_class] * 100 for v in values]

        dates_smooth, y_smooth = smooth_curve(dates, target_component)

        if method == 'true-prev':
            line, = plt.plot(dates_smooth, y_smooth, label=method, linewidth=3, linestyle='-')
        else:
            line, = plt.plot(dates_smooth, y_smooth, label=method, linewidth=2, linestyle='--')
        plt.plot(dates, target_component, 'o', markersize=10, color=line.get_color())

    # Axis labels
    # plt.xlabel("Date")
    plt.ylabel("% of " + target_label + " tweets")

    # Rotate date labels for readability
    plt.xticks(rotation=45)

    plt.minorticks_on()
    plt.grid(which='major', linestyle='-', linewidth=0.5)
    plt.grid(which='minor', linestyle=':', linewidth=0.3)

    # Place the legend outside, to the right
    plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))

    plt.tight_layout()
    if savepath is not None:
        os.makedirs(Path(savepath).parent, exist_ok=True)
        plt.savefig(savepath)
    else:
        plt.show()

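# A minimal sketch of the structure plot_prevalences expects (hypothetical values; one entry per test block):
#   _results = {
#       'date-start': [pd.Timestamp('2015-02-18'), pd.Timestamp('2015-02-19')],
#       'true-prev':  [np.array([0.7, 0.3]), np.array([0.6, 0.4])],      # [p_neg, p_pos]
#       'CC':         [np.array([0.65, 0.35]), np.array([0.62, 0.38])],
#   }
#   plot_prevalences(_results)   # plots the positive-class component of each method over time
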
class HDxDensify(DistributionMatchingX):
    # applies HDx after projecting the (high-dimensional, sparse) features onto a dense low-dimensional space
    def fit(self, X, y):
        self.reductor = TruncatedSVD(n_components=5, random_state=0)
        Xred = self.reductor.fit_transform(X)
        return super().fit(Xred, y)

    def predict(self, X):
        Xred = self.reductor.transform(X)
        return super().predict(Xred)


class QUnfoldWrapperDensify(QUnfoldWrapper):
    # same dimensionality-reduction trick for the qunfold-based method (KMM)
    def fit(self, X, y):
        self.reductor = TruncatedSVD(n_components=5, random_state=0)
        Xred = self.reductor.fit_transform(X)
        return super().fit(Xred, y)

    def predict(self, X):
        Xred = self.reductor.transform(X)
        return super().predict(Xred)

# A scikit-learn style wrapper for a huggingface-based pre-trained transformer for binary sentiment classification
class HFTextClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, model_name='distilbert-base-uncased-finetuned-sst-2-english'):
        self.pipe = pipeline("sentiment-analysis", model=model_name)
        self.classes_ = np.asarray([0, 1])

    def fit(self, X, y=None):
        # the transformer is already fine-tuned; nothing to do
        return self

    def _binary_decisions(self, transformer_output: List[Dict]):
        return np.array([(1 if p['label'] == 'POSITIVE' else 0) for p in transformer_output], dtype=int)

    def predict(self, X):
        X = list(map(str, X))
        preds = self.pipe(X, truncation=True)
        return self._binary_decisions(preds)

    def predict_proba(self, X):
        X = list(map(str, X))
        n_examples = len(X)
        preds = self.pipe(X, truncation=True)
        decisions = self._binary_decisions(preds)
        scores = np.array([p['score'] for p in preds], dtype=float)
        probas = np.zeros(shape=(n_examples, 2), dtype=float)
        # the pipeline score refers to the predicted label; the complementary column gets 1-score
        probas[np.arange(n_examples), decisions] = scores
        probas[np.arange(n_examples), 1 - decisions] = 1 - scores
        return probas

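# Illustrative usage of HFTextClassifier (a sketch; downloads the HF model on first use):
#   _clf = HFTextClassifier()
#   _clf.predict(['great flight!', 'lost my luggage'])    # -> e.g. array([1, 0])
#   _clf.predict_proba(['great flight!']).shape           # -> (1, 2), columns ordered as classes_ = [0, 1]
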
# def methods(pre_trained_classifier):
#     yield 'CC', CC(pre_trained_classifier, fit_classifier=False)

# choose the classifier underlying the aggregative quantifiers: either a logistic regressor
# trained on tf-idf features, or the pre-trained (frozen) transformer
USE_LOGISTIC_REGRESSION = True

if USE_LOGISTIC_REGRESSION:
    new_classifier = lambda: LR()
    to_fit = True
else:
    pretrained = HFTextClassifier()
    new_classifier = lambda: pretrained
    to_fit = False

def methods():
    yield 'CC', CC(new_classifier(), fit_classifier=to_fit)
    yield 'ACC', ACC(new_classifier(), fit_classifier=to_fit)
    yield 'HDy', DistributionMatchingY(new_classifier(), fit_classifier=to_fit)
    yield 'HDx', HDxDensify()
    yield 'KMM', QUnfoldWrapperDensify(KMM())
    yield 'SLD', EMQ(new_classifier(), fit_classifier=to_fit)
    yield 'KDEy', KDEyML(new_classifier(), fit_classifier=to_fit)

train, tests, test_init = prepare_labelled_collections()

if USE_LOGISTIC_REGRESSION:
    # vectorize text for logistic regression
    vectorizer = TfidfVectorizer(min_df=5, sublinear_tf=True)
    Xtr = vectorizer.fit_transform(train.X)
    train = LabelledCollection(Xtr, train.labels, train.classes_)
    for i in range(len(tests)):
        Xte = vectorizer.transform(tests[i].X)
        tests[i] = LabelledCollection(Xte, tests[i].labels, train.classes_)

results = defaultdict(list)

# record the true prevalence and the starting date of each test block
for test_i, test_init_i in zip(tests, test_init):
    results['true-prev'].append(test_i.prevalence())
    results['date-start'].append(test_init_i[0])

# fit each quantifier on the training block and estimate the prevalence of every test block
for q_name, quant in methods():
    quant.fit(*train.Xy)
    for test_i, test_init_i in tqdm(zip(tests, test_init), desc=f'{q_name} predicting', total=len(tests)):
        pred_i = quant.predict(test_i.X)
        results[q_name].append(pred_i)

suffix = '_lr' if USE_LOGISTIC_REGRESSION else '_transformer'
plot_prevalences(results, savepath=f'./plots_ieee/over_time{suffix}.pdf')

@@ -93,4 +93,5 @@ ax.set_zlabel('')
 ax.grid(False)

 plt.tight_layout()
-plt.show()
+# plt.show()
+plt.savefig('plots_ieee/tetrahedron.pdf')