# --- Library imports ---
import pandas as pd
import numpy as np
import unicodedata
import re
import warnings
import pickle

from sentence_transformers import SentenceTransformer

# ----- File paths -----
LABELED_IN = "main/datasets/annotated_dataset.xlsx"
REVIEWED = "main/datasets/first500_reviewed.xlsx"
LABELED_OUT = "main/datasets/annotated_dataset_updated.xlsx"
EMBEDDINGS_OUT = "main/labeled_embeddings.pkl"


# ----- Cleaning helpers -----
def clean(x):
    """Return *x* as a whitespace-stripped string; NaN/None becomes ''."""
    if pd.isna(x):
        return ""
    return str(x).strip()


def normalize_problem_type(x):
    """Map the shorthand codes 'S'/'G' to their canonical labels.

    Any other value is returned cleaned and upper-cased as-is.
    """
    x = clean(x).upper()
    if x == "S":
        return "RULE_SPECIFIC"
    if x == "G":
        return "GENERIC"
    return x


def normalize_severity(x):
    """Clean and upper-case a severity value."""
    return clean(x).upper()


def clean_id(x):
    """Normalize an id: strip whitespace and surrounding quotes, lower-case."""
    if pd.isna(x):
        return ""
    return str(x).strip().strip('"').strip("'").lower()


def clean_folder(x):
    """Normalize a folder name: lower-case, NFKC-normalize, collapse runs of whitespace."""
    if pd.isna(x):
        return ""
    s = unicodedata.normalize("NFKC", str(x).strip().lower())
    return re.sub(r"\s+", " ", s)


def build_reviewed_rows(df_rev):
    """Build a clean DataFrame from the manually reviewed first-500 rows.

    Rows labelled HARMLESS get an empty subcategory and 'none'
    error_type/severity so they stay internally consistent.
    """
    rows = []
    for _, r in df_rev.iterrows():
        category = clean(r["final_category"])
        subcategory = clean(r["final_subcategory"])
        error_type = normalize_problem_type(r["final_problem_type"])
        severity = normalize_severity(r["final_gravity"])

        # HARMLESS rows carry no error information by definition.
        if category.upper() == "HARMLESS":
            subcategory = ""
            error_type = "none"
            severity = "none"

        rows.append({
            "id": clean(r["id"]),
            "folder": clean(r["folder"]),
            "automation": clean(r["automation_text"]),
            "description": clean(r.get("llm_rationale", "")),
            "category": category,
            "subcategory": subcategory,
            "error_type": error_type,
            "severity": severity,
            "borderline": clean(r["borderline"]),
        })
    return pd.DataFrame(rows)


def main():
    # ----- Step 1: load datasets -----
    df_labeled = pd.read_excel(LABELED_IN)
    df_labeled = df_labeled.loc[:, ~df_labeled.columns.str.contains("^Unnamed")].copy()
    df_labeled = df_labeled.dropna(how="all")
    df_labeled = df_labeled.rename(columns={"automation_id": "id"})
    df_rev = pd.read_excel(REVIEWED)

    # Normalize problem_type in the existing labeled dataset.
    # NOTE(review): severity is not re-normalized here — presumably it is
    # already canonical in the annotated dataset; confirm if that changes.
    if "error_type" in df_labeled.columns:
        df_labeled["error_type"] = df_labeled["error_type"].apply(normalize_problem_type)

    # Build the clean dataset from the reviewed first 500 rows.
    df_new = build_reviewed_rows(df_rev)

    # Normalize 'none' values to lower-case (covers upper-cased 'NONE' coming
    # out of normalize_problem_type / normalize_severity).
    for col in ("error_type", "severity"):
        df_new[col] = df_new[col].apply(lambda x: x.lower() if x.lower() == "none" else x)

    # Drop rows without a category.
    df_new = df_new[df_new["category"] != ""].copy()

    # Clean id and folder in both datasets so the dedup keys are comparable.
    for df in (df_labeled, df_new):
        df["id"] = df["id"].apply(clean_id)
        df["folder"] = df["folder"].apply(clean_folder)

    # Deduplicate: drop from the labeled dataset any (id, folder) pair
    # already present in the reviewed data (set gives O(1) membership tests).
    new_keys = set(zip(df_new["id"], df_new["folder"]))
    already_reviewed = df_labeled.apply(
        lambda r: (r["id"], r["folder"]) in new_keys, axis=1
    )
    df_labeled_clean = df_labeled[~already_reviewed].copy()

    # Final concat and save.
    df_final = pd.concat([df_labeled_clean, df_new], ignore_index=True).fillna("")
    df_final.to_excel(LABELED_OUT, index=False)

    print("✅ Merge completato")
    print("Righe iniziali:", len(df_labeled))
    # BUGFIX: report the rows actually added (after dropping empty-category
    # rows), not the raw size of the reviewed file (len(df_rev)) — otherwise
    # the printed counts need not add up to the final total.
    print("Righe aggiunte:", len(df_new))
    print("Totale finale:", len(df_final))

    # ----- Step 2: recompute embeddings -----
    warnings.filterwarnings("ignore")
    model = SentenceTransformer("all-MiniLM-L6-v2")
    texts = df_final["automation"].tolist()
    embeddings = model.encode(
        texts,
        show_progress_bar=True,
        convert_to_numpy=True,
        normalize_embeddings=True,
    ).astype("float32")
    print("Shape embeddings ricalcolati:", embeddings.shape)

    # ----- Step 3: save embeddings -----
    with open(EMBEDDINGS_OUT, "wb") as f:
        pickle.dump({"embeddings": embeddings, "id": df_final["id"].tolist()}, f)
    print("Embeddings salvati con successo!")


if __name__ == "__main__":
    main()