From 110d70fc58bff925796254e44f91cca7672cbbdf Mon Sep 17 00:00:00 2001
From: Arianna Di Serio
Date: Wed, 11 Mar 2026 15:38:45 +0100
Subject: [PATCH] update

---
 annotation.py | 101 +++++++++++++++++++++++----------------
 merge.py      | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 189 insertions(+), 39 deletions(-)
 create mode 100644 merge.py

diff --git a/annotation.py b/annotation.py
index 656cac1..5e9cf27 100644
--- a/annotation.py
+++ b/annotation.py
@@ -14,42 +14,64 @@ from collections import Counter
 from prompts.prompt import build_prompt_local
 import warnings
 import logging
-
+import unicodedata

 # --- Configuration ---
 endpoint = "https://gpt-sw-central-tap-security.openai.azure.com/"
-deployment = "gpt-4o"
+deployment = "gpt-5.1-chat-3"
 subscription_key = "<redacted>"  # key removed from source; load it from the environment, never hardcode it

 client = AzureOpenAI(
     azure_endpoint=endpoint,
     api_key=subscription_key,
-    api_version="2024-05-01-preview",
+    api_version="2025-04-01-preview",
 )

 # ----- Step 1: load the datasets -----
-df_labeled = pd.read_csv("main/datasets/annotated_dataset.csv", encoding="cp1252", sep=";")
-df_unlabeled = pd.read_csv("main/datasets/unlabeled_dataset.csv", sep="\t", encoding="utf-8")
+#df_labeled = pd.read_excel("main/datasets/annotated_dataset.xlsx").dropna(how="all")
+df_labeled = pd.read_excel("main/datasets/annotated_dataset_updated.xlsx").dropna(how="all")
+df_unlabeled = pd.read_excel("main/datasets/unlabeled_dataset.xlsx").dropna(how="all")

 print("***STEP 1***\nLabeled dataset loaded. Rows:", len(df_labeled), "\nUnlabeled dataset loaded. Rows:", len(df_unlabeled))

+df_labeled = df_labeled.rename(columns={"automation_id": "id"})
+df_unlabeled = df_unlabeled.rename(columns={"automation_id": "id"})

+# Column cleaning
 def clean_id(x):
     if pd.isna(x):
         return ""
-    s = str(x)
-    m = re.search(r"\d+", s)
-    return m.group(0) if m else s.strip()
+    s = str(x).strip()           # strip surrounding whitespace
+    s = s.strip('"').strip("'")  # strip stray quotes
+    return s.lower()

-df_labeled["automation_id"] = df_labeled["automation_id"].apply(clean_id)
-df_unlabeled["automation_id"] = df_unlabeled["automation_id"].apply(clean_id)
-df_labeled["folder"] = df_labeled["folder"].astype(str).str.strip()
-df_unlabeled["folder"] = df_unlabeled["folder"].astype(str).str.strip()
+def clean_folder(x):
+    """Clean folder names: collapse repeated whitespace, normalize unicode."""
+    if pd.isna(x):
+        return ""
+    s = str(x).strip().lower()
+    s = unicodedata.normalize("NFKC", s)
+    s = re.sub(r'\s+', ' ', s)
+    return s

-labeled_pairs = set(zip(df_labeled["automation_id"], df_labeled["folder"]))
-df_unlabeled_filtered = df_unlabeled[
-    ~df_unlabeled.apply(lambda row: (row["automation_id"], row["folder"]) in labeled_pairs, axis=1)
-]
-print("Unlabeled automations remaining after cleaning:", len(df_unlabeled_filtered))
+for df in [df_labeled, df_unlabeled]:
+    df["id"] = df["id"].apply(clean_id)
+    df["folder"] = df["folder"].apply(clean_folder)

+labeled_pairs = set(zip(df_labeled["id"], df_labeled["folder"]))
+
+# build mask: True = the row is NOT already in the labeled set
+mask_unlabeled = ~df_unlabeled.apply(lambda r: (r["id"], r["folder"]) in labeled_pairs, axis=1)
+# filter
+df_unlabeled_filtered = df_unlabeled[mask_unlabeled].copy()
+
+print("Rows in df_unlabeled after removing already-labeled ones:", len(df_unlabeled_filtered))
+
+unlabeled_pairs = set(zip(df_unlabeled["id"], df_unlabeled["folder"]))
+missing_in_unlabeled = labeled_pairs - unlabeled_pairs
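+# Set difference: labeled (id, folder) pairs that no longer appear in the
+# unlabeled export. A non-empty result likely means ids or folder names
+# changed between exports; the diagnostic print below lists them.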
+print("Labeled pairs not present in the unlabeled set:", len(missing_in_unlabeled))
+if missing_in_unlabeled:
+    print("Missing pairs:")
+    for p in list(missing_in_unlabeled)[:50]:  # print only the first 50 for readability
+        print(p)

 # ----- Step 2: embeddings -----
 # Silence generic warnings
@@ -62,6 +84,7 @@ logging.getLogger("huggingface_hub").setLevel(logging.ERROR)

 print("\n***Step 2***\nEmbeddings")
 model = SentenceTransformer("all-MiniLM-L6-v2")
+#with open("main/labeled_embeddings_71.pkl", "rb") as f:
 with open("main/labeled_embeddings.pkl", "rb") as f:
     data = pickle.load(f)

@@ -70,7 +93,7 @@ print("Shape embeddings:", embeddings.shape)

-# ----- Step 3: build the FAISS index and compute similarity ---
+# ----- Step 3: build the FAISS index ---
 faiss.normalize_L2(embeddings)
 dimension = embeddings.shape[1]
 index = faiss.IndexFlatIP(dimension)
@@ -78,13 +101,12 @@ index.add(embeddings)

 print(f"\n***Step 3: FAISS index created***.\nNumber of vectors in the index: {index.ntotal}")

-# ----- Step 4: Retrieval: the 5 most similar automations -----
+# ----- Step 4: Retrieval (cosine similarity) -----
 k = 5
 output_rows = []
-df_sample = df_unlabeled_filtered.head(50).reset_index(drop=True)
+df_sample = df_unlabeled_filtered.head(10).reset_index(drop=True)
 llm_rows = []

-# label based on the similarity score
 def sim_label(sim: float) -> str:
     # higher = more similar
     if sim >= 0.80:
@@ -122,8 +144,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
     rank1_category = topk_cats[0] if topk_cats else ""
     majority_category = Counter(topk_cats).most_common(1)[0][0] if topk_cats else ""
     consistency = (sum(c == majority_category for c in topk_cats) / len(topk_cats)) if topk_cats else 0.0
-
-    # for each of the 5 similar automations
+
     for rank in range(k):
         idx = int(indices[0][rank])
         sim = float(sims[0][rank])
@@ -164,7 +185,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
             {"role": "system", "content": prompt},
             {"role": "user", "content": f'automation to evaluate: {query_text}'}
         ],
-        temperature=0,
+        reasoning_effort="low"
     )

     content = resp.choices[0].message.content.strip()
@@ -185,7 +206,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):

     # (4) Save one row per automation with:
     #     - retrieval metrics (rank1/majority/consistency)
-    #     - the LLM output
+    #     - the LLM output (scores + final label + review flag)
     llm_category = str(parsed.get("category", "")).strip()
     llm_subcategory = str(parsed.get("subcategory", "")).strip()
     llm_problem_type = str(parsed.get("problem_type", "")).strip()
@@ -194,27 +215,28 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
         llm_subcategory = ""
         llm_problem_type = "none"
         llm_gravity = "NONE"
-    # by default the final assigned label is the LLM's - revised if review=true
+    # by default the assigned label is the LLM's - revised if review=true
     final_category = llm_category
     final_subcategory = llm_subcategory
     final_problem_type = llm_problem_type
     final_gravity = llm_gravity
-
-    # ================= REVIEW LOGIC =================
+
     if top1_similarity_label == "Debole" or top1_similarity_label == "Similarità instabile":
-        needs_human_review = True
+        needs_review = True
     else:
-        needs_human_review = False
+        needs_review = False

-    final_needs_human_review = needs_human_review
+
+    final_needs_review = needs_review

+    # ================= HUMAN REVIEW LOGIC =================
     aligned_strong = (
         llm_category == majority_category
         and llm_category == rank1_category
         and llm_category != ""
     )
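+    # Override: if the LLM label matches both the rank-1 and the majority
+    # neighbour category, and retrieval clears the two thresholds below,
+    # the weak-similarity review flag set above is cleared again.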
-    OVERRIDE_MIN_SIMILARITY = 0.38
+    OVERRIDE_MIN_SIMILARITY = 0.39
     OVERRIDE_MIN_CONSISTENCY = 0.60

     good_retrieval = (
@@ -223,12 +245,12 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
     )

     if aligned_strong and good_retrieval:
-        final_needs_human_review = False
+        final_needs_review = False

     # =====================================================

     llm_rows.append({
-        "id": row.get("automation_id", ""),
+        "id": row.get("id", ""),
         "folder": row.get("folder", ""),
         "automation_text": query_text,
@@ -246,8 +268,8 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
         "llm_problem_type": llm_problem_type,
         "llm_gravity": llm_gravity,

-        "needs_review": needs_human_review,
-        "final_needs_review": final_needs_human_review,
+        "needs_review": needs_review,
+        "final_needs_review": final_needs_review,

         # FINAL
         "final_category": final_category,
@@ -261,16 +283,17 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):

 # ----- Step 6: Excel output -----
 df_out = pd.DataFrame(llm_rows)
-out_path = "main/datasets/labeling_first50.xlsx"
+out_path = "main/datasets/labeling_2_500.xlsx"
 df_out.to_excel(out_path, index=False)

 wb = load_workbook(out_path)
 ws = wb.active
-# colour of the review columns
 true_fill = PatternFill(start_color="FF6347", end_color="FF6347", fill_type="solid")   # red
 false_fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid")  # green
+
 col_index = {cell.value: idx for idx, cell in enumerate(ws[1], start=1)}
+
 for col_name in ["needs_review", "final_needs_review"]:
     if col_name in col_index:
         c = col_index[col_name]
@@ -284,7 +307,7 @@ for col_name in ["needs_review", "final_needs_review"]:
 wb.save(out_path)
 print(f"\n***Step 6: Excel saved to {out_path}")

-# --- final_needs_review count ---
+# --- Count of final_needs_review ---
 review_counts = df_out["final_needs_review"].value_counts(dropna=False)
 true_count = review_counts.get(True, 0)
 false_count = review_counts.get(False, 0)
diff --git a/merge.py b/merge.py
new file mode 100644
index 0000000..a4a7020
--- /dev/null
+++ b/merge.py
@@ -0,0 +1,127 @@
+# --- Imports ---
+import pandas as pd
+import numpy as np
+import unicodedata
+import re
+import warnings
+from sentence_transformers import SentenceTransformer
+import pickle
+
+# ----- File paths -----
+LABELED_IN = "main/datasets/annotated_dataset.xlsx"
+REVIEWED = "main/datasets/first500_reviewed.xlsx"
+LABELED_OUT = "main/datasets/annotated_dataset_updated.xlsx"
+
+# ----- Cleaning functions -----
+def clean(x):
+    if pd.isna(x):
+        return ""
+    return str(x).strip()
+
+def normalize_problem_type(x):
+    x = clean(x).upper()
+    if x == "S":
+        return "RULE_SPECIFIC"
+    if x == "G":
+        return "GENERIC"
+    return x
+
+def normalize_severity(x):
+    return clean(x).upper()
+
+def clean_id(x):
+    if pd.isna(x):
+        return ""
+    s = str(x).strip().strip('"').strip("'")
+    return s.lower()
+
+def clean_folder(x):
+    if pd.isna(x):
+        return ""
+    s = str(x).strip().lower()
+    s = unicodedata.normalize("NFKC", s)
+    s = re.sub(r'\s+', ' ', s)
+    return s
+
+# ----- Step 1: load the datasets -----
+df_labeled = pd.read_excel(LABELED_IN)
+df_labeled = df_labeled.loc[:, ~df_labeled.columns.str.contains("^Unnamed")].copy()
+df_labeled = df_labeled.dropna(how="all")
+df_labeled = df_labeled.rename(columns={"automation_id": "id"})
+
+df_rev = pd.read_excel(REVIEWED)
+
+# Normalize problem_type and severity
+if "error_type" in df_labeled.columns:
+    df_labeled["error_type"] = df_labeled["error_type"].apply(normalize_problem_type)
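+# Note: some rows abbreviate the problem type as a single letter ("S"/"G");
+# normalize_problem_type expands these so old and reviewed rows share one vocabulary.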
+# Build the clean dataset from the first 500 reviewed rows
+rows = []
+for _, r in df_rev.iterrows():
+    category = clean(r["final_category"])
+    subcategory = clean(r["final_subcategory"])
+    error_type = normalize_problem_type(r["final_problem_type"])
+    severity = normalize_severity(r["final_gravity"])
+
+    # HARMLESS consistency: a harmless automation carries no error details
+    if category.upper() == "HARMLESS":
+        subcategory = ""
+        error_type = "none"
+        severity = "none"
+
+    rows.append({
+        "id": clean(r["id"]),
+        "folder": clean(r["folder"]),
+        "automation": clean(r["automation_text"]),
+        "description": clean(r.get("llm_rationale", "")),
+        "category": category,
+        "subcategory": subcategory,
+        "error_type": error_type,
+        "severity": severity,
+        "borderline": clean(r["borderline"]),
+    })
+
+df_new = pd.DataFrame(rows)
+
+# Normalize 'none' values
+df_new["error_type"] = df_new["error_type"].apply(lambda x: x.lower() if x.lower() == "none" else x)
+df_new["severity"] = df_new["severity"].apply(lambda x: x.lower() if x.lower() == "none" else x)
+
+# Drop rows without a category
+df_new = df_new[df_new["category"] != ""].copy()
+
+# Clean id and folder in both datasets
+for df in [df_labeled, df_new]:
+    df["id"] = df["id"].apply(clean_id)
+    df["folder"] = df["folder"].apply(clean_folder)
+
+# Deduplicate: drop labeled rows that are already present in df_new
+new_keys = set(zip(df_new["id"], df_new["folder"]))
+df_labeled_clean = df_labeled[~df_labeled.apply(lambda r: (r["id"], r["folder"]) in new_keys, axis=1)].copy()
+
+# Final concat
+df_final = pd.concat([df_labeled_clean, df_new], ignore_index=True).fillna("")
+
+# Save the updated dataset
+df_final.to_excel(LABELED_OUT, index=False)
+print("✅ Merge completed")
+print("Initial rows:", len(df_labeled))
+print("Rows added:", len(df_new))  # count what was actually kept, not len(df_rev)
+print("Final total:", len(df_final))
+
+# ----- Step 2: compute embeddings -----
+warnings.filterwarnings("ignore")
+model = SentenceTransformer("all-MiniLM-L6-v2")
+
+texts = df_final["automation"].tolist()
+embeddings = model.encode(
+    texts, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True
+).astype("float32")
+
+print("Shape of the recomputed embeddings:", embeddings.shape)
+
+# ----- Step 3: save embeddings -----
+with open("main/labeled_embeddings.pkl", "wb") as f:
+    pickle.dump({"embeddings": embeddings, "id": df_final["id"].tolist()}, f)
+
+print("Embeddings saved successfully!")
\ No newline at end of file
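
A quick way to sanity-check the merge.py -> annotation.py handoff (a sketch, not part of the patch): after merge.py has rewritten main/labeled_embeddings.pkl, the snippet below rebuilds the same inner-product FAISS index that annotation.py builds in Step 3, so the cosine similarities it prints should match Step 4's retrieval. The query string is a placeholder.

import pickle
import faiss
from sentence_transformers import SentenceTransformer

with open("main/labeled_embeddings.pkl", "rb") as f:
    data = pickle.load(f)

embeddings = data["embeddings"]  # float32; merge.py saves them already L2-normalized
ids = data["id"]

# Inner product over L2-normalized vectors equals cosine similarity.
faiss.normalize_L2(embeddings)   # harmless if already normalized
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

model = SentenceTransformer("all-MiniLM-L6-v2")
query = model.encode(["example automation text"], convert_to_numpy=True).astype("float32")  # placeholder query
faiss.normalize_L2(query)

sims, idxs = index.search(query, 5)  # k=5, as in annotation.py
for sim, i in zip(sims[0], idxs[0]):
    print(f"{ids[int(i)]}  cosine={float(sim):.3f}")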