upgrade e costruzione datasetHF

This commit is contained in:
Nicola Leonardi 2025-12-09 12:51:12 +01:00
parent cde7259ed7
commit 85c03b3a1a
8 changed files with 763 additions and 116 deletions

View File

@ -1,4 +1,5 @@
DB_PATH=persistence/wcag_validator_ui.db
WCAG_REST_SERVER_URL=http://localhost:8000
URL_LIST_old=["http://www.amazon.it","https://web.archive.org/web/20230630235957/http://www.amazon.com/", "https://web.archive.org/web/20251130033532/https://www.ebay.com/"]
URL_LIST=["https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://bestbuy.com","https://macys.com","https://homedepot.com","https://costco.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.ansa.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
URL_LIST_old=["https://www.amazon.com/s?k=magllioni&crid=CGD2UWO33O58&sprefix=magllioni%2Caps%2C209&ref=nb_sb_noss","https://web.archive.org/web/20251011214807/https://www.ilfattoquotidiano.it/","https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
URL_LIST=["https://giove.isti.cnr.it/users/manca/eBay.html","http://www.amazon.it"]

View File

@ -31,6 +31,45 @@ import sqlite3
WCAG_VALIDATOR_RESTSERVER_HEADERS = [("Content-Type", "application/json")]
def process_dataframe(db_path, url, updated_df, user_state=None):
    """Validate the user-assessment column of *updated_df* and persist it.

    Args:
        db_path: Path to the SQLite database file.
        url: URL of the page the assessments refer to.
        updated_df: pandas DataFrame edited in the UI; must contain the
            "User Assessment for LLM Proposal" column with integer values 1-5.
        user_state: Mapping with at least a "username" key identifying the
            current user (defaults to an empty mapping).

    Returns:
        A status string suitable for display in the UI: a success message,
        or an "Error: ..." description when validation or the insert fails.
    """
    print("Processing dataframe to adjust columns...")
    user_state = user_state or {}
    column_rating_name = "User Assessment for LLM Proposal"
    # Coerce the assessment column to integers; reject anything that cannot
    # be interpreted as a whole number (TypeError covers None/mixed cells).
    try:
        updated_df[column_rating_name] = updated_df[column_rating_name].astype(int)
    except (ValueError, TypeError):
        return "Error: User Assessment for LLM Proposal must be an integer"
    # Ratings are constrained to the 1-5 scale used by the UI radio buttons.
    if (updated_df[column_rating_name] < 1).any() or (
        updated_df[column_rating_name] > 5
    ).any():
        return "Error: User Assessment for LLM Proposal must be between 1 and 5"
    dataframe_json = updated_df.to_json(orient="records")
    # Build the user payload before opening the connection so a missing
    # "username" key cannot leak an open connection.
    json_user_str = json.dumps({"username": user_state["username"]}, ensure_ascii=False)
    connection_db = sqlite3.connect(db_path)
    try:
        # insert after everything to keep datetime aligned
        db_persistence_insert(
            connection_db=connection_db,
            insert_type="wcag_user_llm_alttext_assessments",
            page_url=url,
            user=json_user_str,
            llm_model="",
            json_in_str=dataframe_json,  # TODO: store a structured payload instead of the raw dataframe dump
            json_out_str="done via UI",
            table="wcag_user_assessments",
        )
    except Exception as e:
        # Surface the failure to the UI instead of claiming success.
        print("Error inserting user assessment into database:", str(e))
        return f"Error saving user assessment: {e}"
    finally:
        if connection_db:
            connection_db.close()
    return "User assessment saved successfully!"
def load_images_from_json(json_input):
"""Extract URLs and alt text from JSON and create HTML gallery"""
try:
@ -40,7 +79,7 @@ def load_images_from_json(json_input):
return "No images found in JSON", ""
images = data["images"]
info_text = f"Found {len(images)} image(s)\n"
info_text = f"Found {len(images)} image(s)"
print(f"Found {len(data['images'])} image(s)")
# Create HTML gallery with checkboxes and assessment forms
@ -58,14 +97,14 @@ def load_images_from_json(json_input):
padding: 10px;
background: white;
}
.image-card:has(input:checked) {
.image-card:has(input[type="checkbox"]:checked) {
border-color: #2196F3;
background: #a7c1c1;
}
.image-card img {
width: 100%;
height: 200px;
object-fit: cover;
object-fit: scale-down;
border-radius: 4px;
}
.image-info {
@ -93,7 +132,7 @@ def load_images_from_json(json_input):
display: none;
margin-top: 15px;
padding: 10px;
background: #f0f7ff;
background: #7896b9;
border-radius: 4px;
border: 1px solid #2196F3;
}
@ -109,18 +148,22 @@ def load_images_from_json(json_input):
margin-bottom: 5px;
font-size: 13px;
}
.range-container {
.radio-container {
display: flex;
gap: 15px;
align-items: center;
}
.radio-option {
display: flex;
align-items: center;
gap: 10px;
gap: 5px;
cursor: pointer;
}
.range-container input[type="range"] {
flex: 1;
}
.range-value {
font-weight: bold;
min-width: 20px;
text-align: center;
.radio-label {
font-weight: 500;
}
textarea {
width: 100%;
@ -166,12 +209,28 @@ def load_images_from_json(json_input):
<div id="panel-{idx}" class="assessment-panel">
<div class="form-group">
<label>Rate current alt-text:</label>
<div class="range-container">
<input type="range" min="1" max="5" value="3"
class="assessment-range" data-index="{idx}"
oninput="document.getElementById('range-value-{idx}').textContent = this.value">
<span id="range-value-{idx}" class="range-value">3</span>
</div>
<div class="radio-container">
<label class="radio-option">
<input type="radio" name="assessment-{idx}" value="1" data-index="{idx}">
<span class="radio-label">1</span>
</label>
<label class="radio-option">
<input type="radio" name="assessment-{idx}" value="2" data-index="{idx}">
<span class="radio-label">2</span>
</label>
<label class="radio-option">
<input type="radio" name="assessment-{idx}" value="3" data-index="{idx}" checked>
<span class="radio-label">3</span>
</label>
<label class="radio-option">
<input type="radio" name="assessment-{idx}" value="4" data-index="{idx}">
<span class="radio-label">4</span>
</label>
<label class="radio-option">
<input type="radio" name="assessment-{idx}" value="5" data-index="{idx}">
<span class="radio-label">5</span>
</label>
</div>
</div>
<div class="form-group">
<label>New alt-text:</label>
@ -226,7 +285,7 @@ def load_llm_assessment_from_json(json_input):
{
"Original Alt Text": alt_text_original,
"LLM Assessment": original_alt_text_assessment,
"Proposed Alt Text": new_alt_text,
"LLM Proposed Alt Text": new_alt_text,
}
)
@ -257,7 +316,7 @@ def make_alttext_llm_assessment_api_call(
if not selected_images or len(selected_images) == 0:
info_text = "No images selected"
print(info_text)
return pd.DataFrame()
return "LLM assessment not started", pd.DataFrame()
# prepare data for insertion
json_in_str = {}
@ -267,6 +326,7 @@ def make_alttext_llm_assessment_api_call(
user_assessments = []
user_new_alt_texts = []
selected_image_id = []
user_assessments_llm_proposal = []
for img in selected_images:
selected_urls.append(img["image_url"])
selected_alt_text_original.append(img["original_alt_text"])
@ -275,6 +335,7 @@ def make_alttext_llm_assessment_api_call(
selected_image_id.append(
int(img["image_index"]) + 1
) # add the id selected (+1 for index alignment)
user_assessments_llm_proposal.append(3) # default value for now
json_in_str["images_urls"] = selected_urls
json_in_str["images_alt_text_original"] = selected_alt_text_original
json_out_str["user_assessments"] = user_assessments
@ -302,9 +363,17 @@ def make_alttext_llm_assessment_api_call(
)
# return response
info_dataframe = load_llm_assessment_from_json(response)
# add the UI ids and other fields to the api response
info_dataframe.insert(
0, "Image #", selected_image_id
) # add the UI ids to the api response
info_dataframe.insert(2, "User Assessment", user_assessments)
info_dataframe.insert(3, "User Proposed Alt Text", user_new_alt_texts)
info_dataframe["User Assessment for LLM Proposal"] = (
user_assessments_llm_proposal
)
except Exception as e:
return {"error": str(e)}
@ -326,7 +395,7 @@ def make_alttext_llm_assessment_api_call(
finally:
if connection_db:
connection_db.close()
return info_dataframe
return "LLM assessment completed", info_dataframe
def make_image_extraction_api_call(
@ -449,9 +518,10 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
images_number = gr.Slider(
5,
100,
value=30,
value=50,
step=5,
label="Max number of images to retrieve",
visible=False,
)
with gr.Column():
@ -459,39 +529,54 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
"Extract Images & Alt Texts", variant="primary"
)
alttext_api_call_btn = gr.Button(
"Alt Text LLM Assessment",
"Start LLM Assessment",
variant="secondary",
interactive=False,
)
image_info_output = gr.Textbox(
label="Activity tracking", lines=1
)
with gr.Row():
image_info_output = gr.Textbox(label="Managed Images", lines=5)
with gr.Row(visible=False) as alttext_results_row:
# Use DataFrame for tabular output
alttext_info_output = gr.DataFrame(
headers=[
"Image #",
"Original Alt Text",
"User Assessment",
"User Proposed Alt Text",
"LLM Assessment",
"Proposed Alt Text",
"LLM Proposed Alt Text",
"User Assessment for LLM Proposal",
],
label="LLM Assessment Results",
wrap=True, # Wrap text in cells
interactive=False,
interactive=True,
scale=7,
)
with gr.Column():
save_user_assessment_btn = gr.Button(
"Save Your Assessment",
variant="secondary",
interactive=True,
scale=1,
)
gr.Markdown(
" Info: to assess the LLM output, only the values for the 'User Assessment for LLM Proposal' column need to be changed."
)
with gr.Row():
gallery_html = gr.HTML(label="Image Gallery")
image_extraction_api_call_btn.click(
fn=lambda: ("", "", pd.DataFrame(), gr.Button(interactive=False)),
fn=lambda: ("", "", gr.update(visible=False), gr.Button(interactive=False)),
inputs=[],
outputs=[
image_info_output,
gallery_html,
alttext_info_output,
alttext_results_row,
alttext_api_call_btn,
],
).then(
@ -515,7 +600,7 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
wcag_rest_server_url_state,
user_state,
],
outputs=[alttext_info_output],
outputs=[image_info_output, alttext_info_output],
js="""
(url_input,gallery_html) => {
const checkboxes = document.querySelectorAll('.image-checkbox:checked');
@ -533,7 +618,8 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
const index = checkbox.dataset.index;
const imageUrl = checkbox.dataset.imgurl;
const originalAlt = document.querySelector('.original-alt[data-index="' + index + '"]').value;
const assessment = document.querySelector('.assessment-range[data-index="' + index + '"]').value;
const assessment = document.querySelector('input[name="assessment-' + index + '"]:checked').value;
console.log("assessment:",assessment)
const newAltText = document.querySelector('.new-alt-text[data-index="' + index + '"]').value;
selectedData.push({
@ -548,6 +634,16 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
return [url_input,JSON.stringify(selectedData)];
}
""",
).then(
fn=lambda: gr.update(visible=True),
inputs=[],
outputs=[alttext_results_row],
)
save_user_assessment_btn.click(
fn=process_dataframe,
inputs=[db_path_state, url_input, alttext_info_output, user_state],
outputs=[image_info_output],
)
# placed here at the end to give full contents visibility to events

View File

@ -55,7 +55,6 @@ class ImageExtractor:
# Also check query parameters (e.g., format=jpeg)
return any(fmt in img_url.lower() for fmt in self.SUPPORTED_FORMATS)
async def _download_image(self, image_url, output_dir="images") -> None:
# Parse the URL to get the path without query parameters
@ -79,7 +78,7 @@ class ImageExtractor:
# Sanitize image name (remove special characters, limit length)
image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
image_name = image_name[:200] # Limit filename length
image_name = image_name[:50] # Limit filename length
# If name is empty after sanitization, create a hash-based name
if not image_name:
@ -88,13 +87,15 @@ class ImageExtractor:
image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
# Download the image
print("getting image:", image_url)
print("getting image url:", image_url)
print("getting image name:", image_name)
response = requests.get(image_url, timeout=10)
response.raise_for_status()
try:
# Save the image
output_path = os.path.join(output_dir, f"{image_name}.{ext}")
print("saving image to:", output_path)
with open(output_path, "wb") as f:
f.write(response.content)
print(f"Saved: {output_path}")
@ -292,43 +293,36 @@ class ImageExtractor:
error_msg = f"Error extracting context: {str(e)}"
return error_msg, error_msg, error_msg
async def _get_page_metadata(self, page) -> Dict[str, Optional[str]]:
"""Extract page metadata including title, description, and keywords."""
metadata = {
"title": await page.title(),
"description": None,
"keywords": None,
"headings": [],
}
async def _get_page_metadata(self, page):
"""Extract page metadata in one fast evaluate call. Batch DOM extraction inside one evaluate()."""
return await page.evaluate(
"""
() => {
const metadata = {
title: document.title || null,
description: null,
keywords: null,
headings: []
};
# Extract meta description
try:
description = await page.locator('meta[name="description"]').get_attribute(
"content"
)
metadata["description"] = description
except:
pass
const desc = document.querySelector('meta[name="description"]');
const keys = document.querySelector('meta[name="keywords"]');
metadata.description = desc?.content || null;
metadata.keywords = keys?.content || null;
# Extract meta keywords
try:
keywords = await page.locator('meta[name="keywords"]').get_attribute(
"content"
)
metadata["keywords"] = keywords
except:
pass
// Collect all headings h1h6
const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
metadata.headings = Array.from(allHeadings)
.map(h => ({
level: parseInt(h.tagName.substring(1), 10),
text: h.textContent.trim()
}))
.filter(h => h.text.length > 0);
# Extract all headings (h1-h6)
for level in range(1, 7):
headings = await page.locator(f"h{level}").all_text_contents()
for heading in headings:
if heading.strip():
metadata["headings"].append(
{"level": level, "text": heading.strip()}
)
return metadata
return metadata;
}
"""
)
async def extract_images(
self, extract_context=True, specific_images_urls=[]
@ -344,15 +338,18 @@ class ImageExtractor:
page = await browser.new_page()
try:
#await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads
# The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
# await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads
# The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
# ---alternative method2: use if there is total awareness of the page's loading pattern and want faster, more reliable execution
await page.goto(self.url, timeout=50000, wait_until="load")# deafult timeout=30000, 30sec
await page.goto(
self.url, timeout=50000, wait_until="load"
) # default timeout=30000 (30 sec)
# Wait for page to load completely
await page.wait_for_timeout(2000) # Wait for dynamic content
# -----
if extract_context:
print("Getting page metadata...")
# Get page metadata once
page_metadata = await self._get_page_metadata(page)
page_title = page_metadata["title"]
@ -367,15 +364,41 @@ class ImageExtractor:
if len(specific_images_urls) == 0:
# Find all img elements
print("Extracting all images from the page",self.url)
img_elements = await page.locator("img").all()
print("Extracting all images from the page", self.url)
# img_elements = await page.locator("img").all()
else:
print(
"Extracting specific images from the page:",
self.url,
specific_images_urls,
)
img_elements = []
# img_elements = await page.locator("img").all()
""" # method 3: optimized approach
# Get all src attributes in one go
all_img_elements = await page.locator("img").all()
all_srcs = await page.locator("img").evaluate_all(
"elements => elements.map(el => el.src || '')"
)
# Filter with the pre-fetched src values
img_elements = [
elem for elem, src in zip(all_img_elements, all_srcs)
if src in specific_images_urls
]
"""
""" #method 2: single pass to find matching images
for img_element in all_img_elements: #This is more efficient than making separate locator queries for each specific URL and avoids timeout issues.
try:
src = await img_element.get_attribute("src")
print("found image src:", src)
if src in specific_images_urls:
img_elements.append(img_element)
except Exception as e:
print(f"Error getting src attribute from image: {str(e)}")"""
""" # method 1: separate locator queries for each specific URL
for url in specific_images_urls:
try:
img_element = await page.locator(
@ -384,8 +407,11 @@ class ImageExtractor:
if img_element:
img_elements.append(img_element)
except Exception as e:
print(f"Error locating image with src {url}: {str(e)}")
print(f"Error locating image with src {url}: {str(e)}")"""
img_elements = await page.locator(
"img"
).all() # unified approach to start with all images and filter later
image_source_list = [] # avoid multiple check for the same image url
images_data = []
@ -404,6 +430,12 @@ class ImageExtractor:
if not src:
print("image has no src attribute. Skipped.")
continue
if (
src not in specific_images_urls
and len(specific_images_urls) > 0
):
# print("image src",src,"not in the specific images list. Skipped.")
continue
if src not in image_source_list:
image_source_list.append(src)
@ -434,6 +466,7 @@ class ImageExtractor:
alt_text = await img.get_attribute("alt") or ""
if extract_context:
print("Extracting context for image:", img_url)
# Get surrounding HTML context (full, immediate, and nearby)
html_context, immediate_context, nearby_text = (
await self._get_element_context(page, img)

View File

@ -76,39 +76,7 @@ class MLLMManager:
return payload
def get_alt_text_system_prompt(self):
system_prompt_old = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
the same information as the image, and should be able to substitute for the non-text content. The text alternative would
be brief but as informative as possible.
Follow these instructions carefully:
1. You will be provided as input with the following:
- The image found on the webpage.
- The associated alternative text. When the alt-text is empty or absent, you will be explicitly informed.
- The surrounding context of the image.
- The page title, headings and the content of the keywords and description <meta> tag, if found.
2. Determine the function and purpose of the image by analyzing these elements. Take into account the purpose and function
of the associated image by considering the page context. Check also if the image is, or is associated with, a link or a button,
and consider this in your judgement. If the image contains text use that as part of the context.
3. Provide a final assessment based on the following:
- 'success' if you can assess with 'sufficient certainty' the alt-text is appropriate in relation to the image purpose,
- 'failure' if you can assess with 'sufficient certainty' that the alt-text is NOT appropriate,
- 'warning' if you cannot determine with 'sufficient certainty'.
where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80
4. The original alt-text assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only.
5. Provide a brief reasoning for your judgment. If the image contains text, write it verbatim. Your response should be in English.
6. Keep your response within 150 words.
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
8. Here is the JSON format the results must have:
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
# https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
@ -122,7 +90,7 @@ class MLLMManager:
What purpose does it fulfill?
If I could not use the image content, what words would I use to convey the same function and/or information?
When image content contains words that are important to understanding the content, the alt text should include those words
When image content contains words that are important to understanding the content, the alt text should include those words.
Follow these instructions carefully:
1. You will be provided as input with the following:
@ -147,7 +115,7 @@ class MLLMManager:
6. Keep your response within 150 words.
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. Use the same language as the original alt-text.
8. Here is the JSON format the results must have:
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
@ -181,7 +149,7 @@ class MLLMManager:
print("Using end_point:", self.end_point)
alt_text_system_prompt = self.get_alt_text_system_prompt()
print("alt_text_system_prompt:", alt_text_system_prompt)
#print("alt_text_system_prompt:", alt_text_system_prompt)
mllm_responses = []
for img_info in images:

View File

@ -46,6 +46,7 @@ class ExtractImagesRoutes:
self, request: Request, data: ExtractImages
) -> JSONResponse:
"""Return the alt text validation assessment based on WCAG guidelines"""
print("Received extract images request.")
try:
json_content = json.loads(data.model_dump_json())

View File

@ -53,6 +53,7 @@ class WCAGAltTextValuationRoutes:
) -> JSONResponse:
"""Return the alt text validation assessment based on WCAG guidelines"""
try:
print("Received wcag alttext validation request.")
json_content = json.loads(data.model_dump_json())
mllm_model_id = self.mllm_settings["mllm_model_id"]
@ -67,7 +68,12 @@ class WCAGAltTextValuationRoutes:
.replace(":", "")
.replace("//", "_")
.replace("/", "_")
.replace("%2", "_")
.replace("?", "_")
.replace("=", "_")
.replace("&", "_")
)
url_path=url_path[:50] # limit length
now = datetime.now(timezone.utc)
now_str = now.strftime("%Y_%m_%d-%H_%M_%S")
folder_str = mllm_model_id.replace(":", "-") + "_" + now_str
@ -93,7 +99,7 @@ class WCAGAltTextValuationRoutes:
# Extract images
logging.info(f"Extracting images from: {json_content['page_url']}")
images = await image_extractor.extract_images(
specific_images_urls=json_content["specific_images_urls"]
specific_images_urls=json_content["specific_images_urls"],extract_context=True
)
# MLLM settings
mllm_end_point = self.mllm_settings["mllm_end_point"]

View File

@ -0,0 +1,541 @@
# to launch: python build_dataset_from_folder.py --ref_path "" --push_to_hub --repo_id "nicolaleo/LLM-alt-text-assessment" --token ""
from datasets import Dataset, DatasetDict
import datasets
import json
from pathlib import Path
from PIL import Image
import hashlib
import urllib.parse
import argparse
'''
# Dataset metadata
_DESCRIPTION = """\
Dataset for image alt-text assessment and improvement using MLLM responses.
Contains images, original alt-texts, quality assessments, and improved versions.
"""
_CITATION = """\
@misc{alt_text_assessment,
title={Alt-Text Assessment Dataset},
year={2024}
}
"""
class AltTextDataset(datasets.GeneratorBasedBuilder):
"""Dataset for alt-text assessment with images and MLLM responses."""
VERSION = datasets.Version("1.0.0")
def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features({
"image": datasets.Image(),
"image_url": datasets.Value("string"),
"alt_text": datasets.Value("string"),
"original_alt_text_assessment": datasets.Value("string"),
"assessment": datasets.Value("string"),
"evaluation_result": datasets.Value("string"),
"new_alt_text": datasets.Value("string"),
#"source_folder": datasets.Value("string"),
}),
citation=_CITATION,
)
def _split_generators(self, dl_manager):
"""Define data splits."""
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"json_filepath": "data.json",
"images_dir": "images"
},
),
]
def _generate_examples(self, json_filepath, images_dir):
"""Generate examples from JSON file and image directory."""
with open(json_filepath, encoding="utf-8") as f:
data = json.load(f)
images_path = Path(images_dir)
for idx, entry in enumerate(data):
image_url = entry["image_url"]
image_filename = url_to_filename(image_url)
image_path = images_path / image_filename
# Load image if exists, otherwise None
image = str(image_path) if image_path.exists() else None
yield idx, {
"image": image,
"image_url": image_url,
"alt_text": entry["alt_text"],
"original_alt_text_assessment": entry["mllm_response"]["original_alt_text_assessment"],
"assessment": entry["mllm_response"]["assessment"],
"evaluation_result": entry["mllm_response"]["evaluation_result"],
"new_alt_text": entry["mllm_response"]["new_alt_text"],
}
'''
# ============================================================================
# SIMPLE USAGE FUNCTIONS
# ============================================================================
def url_to_filename(image_url):  # mirrors the save step in the image_extractor dependency
    """Convert an image URL to the sanitized filename used on disk.

    Args:
        image_url: The image URL.

    Returns:
        Sanitized filename with extension (e.g. ``"photo.jpg"``).
    """
    # Parse the URL to get the path without query parameters
    parsed_url = urllib.parse.urlparse(image_url)
    url_path = parsed_url.path
    # Get the filename from the last path segment
    filename = url_path.split("/")[-1]
    # Fixed debug print: the original f-string had no placeholder and never
    # showed the actual filename.
    print(f"Original filename: '{filename}'")
    # Split filename and extension
    if "." in filename:
        image_name, ext = filename.rsplit(".", 1)
        ext = ext.lower()
    else:
        image_name = filename
        ext = "jpg"
    # Validate extension; anything unsupported falls back to jpg
    if ext not in ["jpg", "jpeg", "png", "gif", "webp"]:
        ext = "jpg"
    # Sanitize image name (remove special characters, limit length)
    image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
    image_name = image_name[:50]  # Limit filename length
    # If name is empty after sanitization, create a hash-based name
    if not image_name:
        image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
    return f"{image_name}.{ext}"
def push_to_hub_example(dataset_path="alt_text_merged_dataset", repo_id="", token=None):
    """Upload a locally saved dataset to the Hugging Face Hub.

    Authentication is required: either pass *token*, call ``login()``
    interactively, or export the ``HF_TOKEN`` environment variable.

    Args:
        dataset_path: Directory of a dataset previously saved to disk.
        repo_id: Target Hub repository id (e.g. ``"user/name"``).
        token: Hugging Face access token forwarded to ``login``.
    """
    from huggingface_hub import login

    print("\n=== Pushing Dataset to Hugging Face Hub ===")
    # Authenticate with the supplied token (alternatives: interactive
    # login() prompt, or the HF_TOKEN environment variable).
    login(token=token)

    # Reload the saved dataset and expose it as a single "train" split;
    # a "test" split could be added to the mapping later.
    dataset_dict = DatasetDict({"train": load_dataset_from_disk(dataset_path)})

    # push_to_hub converts to Parquet automatically and creates the repo
    # if it does not exist yet.
    dataset_dict.push_to_hub(
        repo_id,
        private=False,  # set True for a private dataset
    )
    print("Dataset pushed successfully!")
    print(f"View at: https://huggingface.co/datasets/{repo_id}")
def create_dataset_from_json(json_filepath, json_filepath_images, images_dir="images"):
    """
    Create a Hugging Face Dataset from JSON files with local images.

    Entries whose MLLM response has no ``original_alt_text_assessment``
    are skipped (they carry no usable assessment data).

    Args:
        json_filepath: Path to the MLLM assessments JSON file
        json_filepath_images: Path to the extracted-images JSON file
            (provides ``page_url`` and ``html_context`` per entry)
        images_dir: Directory containing the images (default: "images")

    Returns:
        datasets.Dataset object with images loaded (``None`` where the
        image file is missing on disk)
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    with open(json_filepath_images, "r", encoding="utf-8") as f:
        data_images = json.load(f)
    images_path = Path(images_dir)
    # Flatten the nested structure and load images
    flattened_data = {
        "image": [],
        "image_url": [],
        "alt_text": [],
        "original_alt_text_assessment": [],
        "assessment": [],
        "evaluation_result": [],
        "new_alt_text": [],
        "page_url": [],
        "html_context": [],
    }
    # NOTE(review): count_entry indexes data_images in lockstep with data —
    # assumes both JSON files are parallel, index-aligned lists. TODO confirm.
    count_entry = 0
    for entry in data:
        if (
            entry["mllm_response"]["original_alt_text_assessment"] is None
        ):  # important! skip entries with no MLLM response. not usable data
            print(
                f"Skipping entry with image URL: {entry['image_url']} due to missing MLLM response"
            )
            # Still advance the index so data_images alignment is preserved.
            count_entry += 1
            continue  # Skip entries with no MLLM response
        image_url = entry["image_url"]
        # Map the URL to the on-disk filename produced at download time.
        image_filename = url_to_filename(image_url)
        image_path = images_path / image_filename
        # Load image if it exists; keep a None placeholder otherwise so all
        # columns stay the same length.
        if image_path.exists():
            img = Image.open(image_path)
            flattened_data["image"].append(img)
        else:
            print(f"Warning: Image not found: {image_path}")
            flattened_data["image"].append(None)
        flattened_data["image_url"].append(image_url)
        flattened_data["alt_text"].append(entry["alt_text"])
        # Cast to str: the assessment may arrive as an int from the MLLM.
        flattened_data["original_alt_text_assessment"].append(
            str(entry["mllm_response"]["original_alt_text_assessment"])
        )
        flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
        flattened_data["evaluation_result"].append(
            entry["mllm_response"]["evaluation_result"]
        )
        flattened_data["new_alt_text"].append(entry["mllm_response"]["new_alt_text"])
        # Page-level context comes from the parallel extracted-images file.
        flattened_data["page_url"].append(data_images[count_entry]["page_url"])
        flattened_data["html_context"].append(data_images[count_entry]["html_context"])
        count_entry += 1
    print(f"Total valid entries loaded: {len(flattened_data['image_url'])}")
    return datasets.Dataset.from_dict(flattened_data)
def create_dataset_from_folders(
    ref_path,
    json_filename="mllm_alttext_assessments.json",
    json_filename_images="extracted_images.json",
    images_dirname="images",
):
    """
    Build one merged dataset from every usable folder under *ref_path*.

    A folder is usable when it contains both JSON files; the images
    subdirectory is optional (entries may reference remote URLs only).

    Args:
        ref_path: Root path whose immediate subfolders are scanned
        json_filename: Per-folder MLLM assessments JSON file name
            (default: "mllm_alttext_assessments.json")
        json_filename_images: Per-folder extracted-images JSON file name
            (default: "extracted_images.json")
        images_dirname: Per-folder images subdirectory name (default: "images")

    Returns:
        datasets.Dataset object with all folder entries merged

    Raises:
        ValueError: If no folder under *ref_path* could be processed.
    """
    root = Path(ref_path)
    collected = []
    folders_processed = 0

    for folder in root.iterdir():
        if not folder.is_dir():
            continue

        json_path = folder / json_filename
        json_path_images = folder / json_filename_images
        images_path = folder / images_dirname

        # Both JSON files are mandatory for a folder to be considered.
        if not json_path.exists():
            print(f"Skipping {folder.name}: no {json_filename} found")
            continue
        if not json_path_images.exists():
            print(f"Skipping {folder.name}: no {json_filename_images} found")
            continue
        if not images_path.exists():
            # Tolerated: images might be optional (entries from urls only).
            print(f"Warning: {folder.name}: images directory not found")

        print(f"Processing folder: {folder.name}")
        try:
            # Build the per-folder dataset and add it to the merge list.
            ds = create_dataset_from_json(
                str(json_path), str(json_path_images), str(images_path)
            )
        except Exception as e:
            print(f"Error processing {folder.name}: {e}")
            continue
        collected.append(ds)
        folders_processed += 1
        print(f" -> Loaded {len(ds)} entries")

    if not collected:
        raise ValueError(f"No valid folders found in {ref_path}")

    # Merge all per-folder datasets into one.
    print(f"\n=== Merging {folders_processed} folders ===")
    merged_dataset = datasets.concatenate_datasets(collected)
    print(f"Total entries: {len(merged_dataset)}")
    return merged_dataset
def verify_images(json_filepath, images_dir="images"):
    """
    Check that every image referenced in the JSON file exists on disk.

    Args:
        json_filepath: Path to the JSON file listing image entries
        images_dir: Directory expected to contain the downloaded images

    Returns:
        Dict with 'found'/'missing'/'total' counts and per-image 'details'
        for both the located and the absent files
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        entries = json.load(f)

    base_dir = Path(images_dir)
    located = []
    absent = []

    for record in entries:
        url = record["image_url"]
        # Recompute the on-disk name the downloader would have used.
        fname = url_to_filename(url)
        full_path = base_dir / fname
        print(
            "image_url:",
            url,
            "image_filename:",
            fname,
            "image_path:",
            full_path,
        )
        if full_path.exists():
            located.append(
                {"url": url, "filename": fname, "path": str(full_path)}
            )
        else:
            absent.append(
                {
                    "url": url,
                    "filename": fname,
                    "expected_path": str(full_path),
                }
            )

    return {
        "found": len(located),
        "missing": len(absent),
        "total": len(entries),
        "details": {"found_images": located, "missing_images": absent},
    }
def verify_images_in_folders(
    ref_path, json_filename="mllm_alttext_assessments.json", images_dirname="images"
):
    """
    Run verify_images over every subfolder of ref_path and aggregate counts.

    Args:
        ref_path: Root path containing multiple folders.
        json_filename: Name of the JSON file expected inside each folder.
        images_dirname: Name of the images subdirectory inside each folder.

    Returns:
        Dict with aggregated 'found'/'missing'/'total' counts and a
        'folders' mapping of per-folder verification results.
    """
    root = Path(ref_path)
    total_found = total_missing = total_entries = 0
    per_folder = {}
    for entry in root.iterdir():
        # Only inspect directories that actually carry the expected JSON.
        if not entry.is_dir():
            continue
        json_file = entry / json_filename
        images_dir = entry / images_dirname
        if not json_file.exists():
            continue
        print(f"Verifying folder: {entry.name}")
        try:
            result = verify_images(str(json_file), str(images_dir))
        except Exception as e:
            # Best-effort: report the failure and keep scanning other folders.
            print(f"  Error: {e}")
            continue
        per_folder[entry.name] = result
        total_found += result["found"]
        total_missing += result["missing"]
        total_entries += result["total"]
        print(f"  Found: {result['found']}/{result['total']}")
    return {
        "found": total_found,
        "missing": total_missing,
        "total": total_entries,
        "folders": per_folder,
    }
def save_dataset(dataset, output_path):
    """Persist *dataset* to *output_path* in Arrow format (images included)."""
    dataset.save_to_disk(output_path)
    # Alternative export formats, kept for reference:
    # dataset.to_json(f"{output_path}/data.json")
    # dataset.to_csv(f"{output_path}/data.csv")
    # dataset.to_parquet(f"{output_path}/data.parquet")
def load_dataset_from_disk(dataset_path):
    """Reload a dataset previously written with save_dataset."""
    loaded = datasets.load_from_disk(dataset_path)
    return loaded
# ============================================================================
# EXAMPLE USAGE
# ============================================================================
# CLI driver: verifies downloaded images, builds/merges the dataset,
# inspects it, saves/reloads it, and optionally pushes it to the Hub.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ref_path",
        type=str,
        help=("Root path containing multiple folders"),
        default="",
    )
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        default=False,
        help=("If True push the merged dataset to Hugging Face Hub"),
    )
    parser.add_argument(
        "--token",
        type=str,
        help=("Hugging Face authentication token"),
        default="",
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        help=("Hugging Face repository ID"),
        default="nicolaleo/LLM-alt-text-assessment",
    )
    args = parser.parse_args()
    # Example 1: Verify images across all folders
    print("=== Verifying Images in All Folders ===")
    verification = verify_images_in_folders(args.ref_path)
    print("\n######## Verifier output ################################")
    print(f"Total Found: {verification['found']}/{verification['total']}")
    print(f"Total Missing: {verification['missing']}/{verification['total']}")
    print("########################################")
    # Show per-folder breakdown
    print("\n=== Per-Folder Breakdown ===")
    for folder_name, results in verification["folders"].items():
        print(f"{folder_name}: {results['found']}/{results['total']} images found")
    # Example 2: Create merged dataset from all folders
    print("\n=== Creating Merged Dataset ===")
    ds = create_dataset_from_folders(args.ref_path)
    print("\n######## Merged Dataset output ################################")
    print(f"Final dataset size: {len(ds)} entries")
    print("########################################")
    # Example 3: Analyze the merged dataset
    print("\n=== Dataset Analysis ===")
    print(ds)
    # Example 4: Access images and data from the first entry
    print("\n=== First Example ===")
    first_example = ds[0]
    print(f"Image URL: {first_example['image_url']}")
    print(f"Alt text: {first_example['alt_text']}")
    print(f"Assessment: {first_example['assessment']}")
    print(f"New alt text: {first_example['new_alt_text']}")
    print(f"Image loaded: {first_example['image'] is not None}")
    if first_example["image"] is not None:
        img = first_example["image"]
        print(f"Image size: {img.size}")
        # img.show() # Uncomment to display image
    # Example 5: Filter and work with merged data
    print("\n=== Filtering Merged Dataset ===")
    successful = ds.filter(lambda x: x["assessment"] == "success")
    print(f"Successful assessments: {len(successful)}")
    # NOTE(review): assumes 'original_alt_text_assessment' is always a
    # numeric string — int() raises if a record lacks a parsable value.
    high_rated = ds.filter(lambda x: int(x["original_alt_text_assessment"]) >= 4)
    print(f"High-rated (>=4): {len(high_rated)}")
    # Example 6: Save merged dataset (Arrow format, images included)
    print("\n=== Saving Merged Dataset ===")
    save_dataset(ds, "alt_text_merged_dataset")
    # Example 7: Load dataset back from disk
    print("\n=== Loading Dataset ===")
    loaded_ds = load_dataset_from_disk("alt_text_merged_dataset")
    print(f"Loaded {len(loaded_ds)} entries")
    if args.push_to_hub:
        # Push to Hugging Face Hub (optional)
        push_to_hub_example(repo_id=args.repo_id, token=args.token)  # see push_to_hub_example elsewhere in this module

View File

@ -5,4 +5,5 @@ transformers==4.57.1
numpy==2.2.6
matplotlib==3.10.7
scikit-learn==1.7.2
sentence-transformers==5.1.2
sentence-transformers==5.1.2
datasets==4.4.1