From f5c16bbf3dc51c13204f6695eff8bba5a548451d Mon Sep 17 00:00:00 2001
From: Arianna Di Serio <arianna.diserio@gitea-s2i2s@isti.cnr.it>
Date: Fri, 6 Mar 2026 12:03:53 +0100
Subject: [PATCH] Rename review columns and tidy comments in annotation.py

---
 annotation.py | 30 ++++++++++++++----------------
 1 file changed, 14 insertions(+), 16 deletions(-)

diff --git a/annotation.py b/annotation.py
index 3f0e01d..656cac1 100644
--- a/annotation.py
+++ b/annotation.py
@@ -70,7 +70,7 @@ print("Shape embeddings:", embeddings.shape)
 
 
 
-# ----- Step3: Creazione indice FAISS ---
+# ----- Step 3: Creazione indice FAISS e calcolo similarity -----
 faiss.normalize_L2(embeddings)
 dimension = embeddings.shape[1]
 index = faiss.IndexFlatIP(dimension)
@@ -78,13 +78,13 @@ index.add(embeddings)
 print(f"\n***Step 3: Indice FAISS creato***.\nNumero di vettori nell'indice: {index.ntotal}")
 
 
-# ----- Step 4: Retrieval (similarità cosine) -----
+# ----- Step 4: Retrieval: 5 automazioni più simili -----
 k = 5
 output_rows = []
 df_sample = df_unlabeled_filtered.head(50).reset_index(drop=True)
 llm_rows = []
 
-
+# label in base alla similarity
 def sim_label(sim: float) -> str:
     # più alto = più simile
     if sim >= 0.80:
@@ -122,8 +122,8 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
     rank1_category = topk_cats[0] if topk_cats else ""
     majority_category = Counter(topk_cats).most_common(1)[0][0] if topk_cats else ""
     consistency = (sum(c == majority_category for c in topk_cats) / len(topk_cats)) if topk_cats else 0.0
-    print(consistency)
-
+    
+    # per ognuna delle 5 automazioni simili
     for rank in range(k):
         idx = int(indices[0][rank])
         sim = float(sims[0][rank])
@@ -185,7 +185,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
 
     # (4) Salvataggio di 1 riga per automazione con:
     # - metriche retrieval (rank1/majority/consistency)
-    # - output dell'LLM (scores + label finale + review flag)
+    # - output dell'LLM
     llm_category = str(parsed.get("category", "")).strip()
     llm_subcategory = str(parsed.get("subcategory", "")).strip()
     llm_problem_type = str(parsed.get("problem_type", "")).strip()
@@ -194,21 +194,20 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
         llm_subcategory = ""
         llm_problem_type = "none"
         llm_gravity = "NONE"
-    # di default l'etichetta assegnata è quella del LLM - rivista se review=true 
+    # di default l'etichetta finale assegnata è quella dell'LLM - revisionata se review=true
     final_category = llm_category
     final_subcategory = llm_subcategory
     final_problem_type = llm_problem_type
     final_gravity = llm_gravity
 
-    
+
+    # ================= REVIEW LOGIC =================
     if top1_similarity_label == "Debole" or top1_similarity_label == "Similarità instabile":
         needs_human_review = True
     else:
         needs_human_review = False
 
-
     final_needs_human_review = needs_human_review
-    # ================= HUMAN REVIEW LOGIC =================
     aligned_strong = (
         llm_category == majority_category
         and llm_category == rank1_category
@@ -229,7 +228,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
 
 
     llm_rows.append({
-        "automation_id": row.get("automation_id", ""),
+        "id": row.get("automation_id", ""),
         "folder": row.get("folder", ""),
         "automation_text": query_text,
 
@@ -268,12 +267,11 @@ df_out.to_excel(out_path, index=False)
 wb = load_workbook(out_path)
 ws = wb.active
 
+# colore delle colonne review
 true_fill = PatternFill(start_color="FF6347", end_color="FF6347", fill_type="solid")   # rosso
 false_fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid") # verde
-
 col_index = {cell.value: idx for idx, cell in enumerate(ws[1], start=1)}
-
-for col_name in ["needs_human_review", "final_needs_human_review"]:
+for col_name in ["needs_review", "final_needs_review"]:
     if col_name in col_index:
         c = col_index[col_name]
         for r in range(2, ws.max_row + 1):
@@ -286,8 +284,8 @@ for col_name in ["needs_human_review", "final_needs_human_review"]:
 wb.save(out_path)
 print(f"\n***Step 6: Excel salvato in {out_path}")
 
-# --- Conteggio needs_human_review ---
-review_counts = df_out["final_needs_human_review"].value_counts(dropna=False)
+# --- Conteggio final_needs_review ---
+review_counts = df_out["final_needs_review"].value_counts(dropna=False)
 true_count = review_counts.get(True, 0)
 false_count = review_counts.get(False, 0)
 print("\n--- Needs human review summary ---")