update
This commit is contained in:
parent
4e68ffc3dc
commit
f5c16bbf3d
|
|
@ -70,7 +70,7 @@ print("Shape embeddings:", embeddings.shape)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# ----- Step3: Creazione indice FAISS ---
|
# ----- Step3: Creazione indice FAISS e calcolo similarity ---
|
||||||
faiss.normalize_L2(embeddings)
|
faiss.normalize_L2(embeddings)
|
||||||
dimension = embeddings.shape[1]
|
dimension = embeddings.shape[1]
|
||||||
index = faiss.IndexFlatIP(dimension)
|
index = faiss.IndexFlatIP(dimension)
|
||||||
|
|
@ -78,13 +78,13 @@ index.add(embeddings)
|
||||||
print(f"\n***Step 3: Indice FAISS creato***.\nNumero di vettori nell'indice: {index.ntotal}")
|
print(f"\n***Step 3: Indice FAISS creato***.\nNumero di vettori nell'indice: {index.ntotal}")
|
||||||
|
|
||||||
|
|
||||||
# ----- Step 4: Retrieval (similarità cosine) -----
|
# ----- Step 4: Retrieval: 5 automazioni più simili -----
|
||||||
k = 5
|
k = 5
|
||||||
output_rows = []
|
output_rows = []
|
||||||
df_sample = df_unlabeled_filtered.head(50).reset_index(drop=True)
|
df_sample = df_unlabeled_filtered.head(50).reset_index(drop=True)
|
||||||
llm_rows = []
|
llm_rows = []
|
||||||
|
|
||||||
|
# label in base alla similarity
|
||||||
def sim_label(sim: float) -> str:
|
def sim_label(sim: float) -> str:
|
||||||
# più alto = più simile
|
# più alto = più simile
|
||||||
if sim >= 0.80:
|
if sim >= 0.80:
|
||||||
|
|
@ -122,8 +122,8 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
|
||||||
rank1_category = topk_cats[0] if topk_cats else ""
|
rank1_category = topk_cats[0] if topk_cats else ""
|
||||||
majority_category = Counter(topk_cats).most_common(1)[0][0] if topk_cats else ""
|
majority_category = Counter(topk_cats).most_common(1)[0][0] if topk_cats else ""
|
||||||
consistency = (sum(c == majority_category for c in topk_cats) / len(topk_cats)) if topk_cats else 0.0
|
consistency = (sum(c == majority_category for c in topk_cats) / len(topk_cats)) if topk_cats else 0.0
|
||||||
print(consistency)
|
|
||||||
|
# per ognuna delle 5 automazioni simili
|
||||||
for rank in range(k):
|
for rank in range(k):
|
||||||
idx = int(indices[0][rank])
|
idx = int(indices[0][rank])
|
||||||
sim = float(sims[0][rank])
|
sim = float(sims[0][rank])
|
||||||
|
|
@ -185,7 +185,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
|
||||||
|
|
||||||
# (4) Salvataggio di 1 riga per automazione con:
|
# (4) Salvataggio di 1 riga per automazione con:
|
||||||
# - metriche retrieval (rank1/majority/consistency)
|
# - metriche retrieval (rank1/majority/consistency)
|
||||||
# - output dell'LLM (scores + label finale + review flag)
|
# - output dell'LLM
|
||||||
llm_category = str(parsed.get("category", "")).strip()
|
llm_category = str(parsed.get("category", "")).strip()
|
||||||
llm_subcategory = str(parsed.get("subcategory", "")).strip()
|
llm_subcategory = str(parsed.get("subcategory", "")).strip()
|
||||||
llm_problem_type = str(parsed.get("problem_type", "")).strip()
|
llm_problem_type = str(parsed.get("problem_type", "")).strip()
|
||||||
|
|
@ -194,21 +194,20 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
|
||||||
llm_subcategory = ""
|
llm_subcategory = ""
|
||||||
llm_problem_type = "none"
|
llm_problem_type = "none"
|
||||||
llm_gravity = "NONE"
|
llm_gravity = "NONE"
|
||||||
# di default l'etichetta assegnata è quella del LLM - rivista se review=true
|
# di default l'etichetta finale assegnata è quella del LLM - revisionata se review=true
|
||||||
final_category = llm_category
|
final_category = llm_category
|
||||||
final_subcategory = llm_subcategory
|
final_subcategory = llm_subcategory
|
||||||
final_problem_type = llm_problem_type
|
final_problem_type = llm_problem_type
|
||||||
final_gravity = llm_gravity
|
final_gravity = llm_gravity
|
||||||
|
|
||||||
|
|
||||||
|
# ================= REVIEW LOGIC =================
|
||||||
if top1_similarity_label == "Debole" or top1_similarity_label == "Similarità instabile":
|
if top1_similarity_label == "Debole" or top1_similarity_label == "Similarità instabile":
|
||||||
needs_human_review = True
|
needs_human_review = True
|
||||||
else:
|
else:
|
||||||
needs_human_review = False
|
needs_human_review = False
|
||||||
|
|
||||||
|
|
||||||
final_needs_human_review = needs_human_review
|
final_needs_human_review = needs_human_review
|
||||||
# ================= HUMAN REVIEW LOGIC =================
|
|
||||||
aligned_strong = (
|
aligned_strong = (
|
||||||
llm_category == majority_category
|
llm_category == majority_category
|
||||||
and llm_category == rank1_category
|
and llm_category == rank1_category
|
||||||
|
|
@ -229,7 +228,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
|
||||||
|
|
||||||
|
|
||||||
llm_rows.append({
|
llm_rows.append({
|
||||||
"automation_id": row.get("automation_id", ""),
|
"id": row.get("automation_id", ""),
|
||||||
"folder": row.get("folder", ""),
|
"folder": row.get("folder", ""),
|
||||||
"automation_text": query_text,
|
"automation_text": query_text,
|
||||||
|
|
||||||
|
|
@ -268,12 +267,11 @@ df_out.to_excel(out_path, index=False)
|
||||||
wb = load_workbook(out_path)
|
wb = load_workbook(out_path)
|
||||||
ws = wb.active
|
ws = wb.active
|
||||||
|
|
||||||
|
# colore delle colonne review
|
||||||
true_fill = PatternFill(start_color="FF6347", end_color="FF6347", fill_type="solid") # rosso
|
true_fill = PatternFill(start_color="FF6347", end_color="FF6347", fill_type="solid") # rosso
|
||||||
false_fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid") # verde
|
false_fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid") # verde
|
||||||
|
|
||||||
col_index = {cell.value: idx for idx, cell in enumerate(ws[1], start=1)}
|
col_index = {cell.value: idx for idx, cell in enumerate(ws[1], start=1)}
|
||||||
|
for col_name in ["needs_review", "final_needs_review"]:
|
||||||
for col_name in ["needs_human_review", "final_needs_human_review"]:
|
|
||||||
if col_name in col_index:
|
if col_name in col_index:
|
||||||
c = col_index[col_name]
|
c = col_index[col_name]
|
||||||
for r in range(2, ws.max_row + 1):
|
for r in range(2, ws.max_row + 1):
|
||||||
|
|
@ -286,8 +284,8 @@ for col_name in ["needs_human_review", "final_needs_human_review"]:
|
||||||
wb.save(out_path)
|
wb.save(out_path)
|
||||||
print(f"\n***Step 6: Excel salvato in {out_path}")
|
print(f"\n***Step 6: Excel salvato in {out_path}")
|
||||||
|
|
||||||
# --- Conteggio needs_human_review ---
|
# --- Conteggio final_needs_review ---
|
||||||
review_counts = df_out["final_needs_human_review"].value_counts(dropna=False)
|
review_counts = df_out["final_needs_review"].value_counts(dropna=False)
|
||||||
true_count = review_counts.get(True, 0)
|
true_count = review_counts.get(True, 0)
|
||||||
false_count = review_counts.get(False, 0)
|
false_count = review_counts.get(False, 0)
|
||||||
print("\n--- Needs human review summary ---")
|
print("\n--- Needs human review summary ---")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue