diff --git a/UI/.env b/UI/.env
index 1b0e7f3..10f0839 100644
--- a/UI/.env
+++ b/UI/.env
@@ -1,4 +1,5 @@
DB_PATH=persistence/wcag_validator_ui.db
WCAG_REST_SERVER_URL=http://localhost:8000
URL_LIST_old=["http://www.amazon.it","https://web.archive.org/web/20230630235957/http://www.amazon.com/", "https://web.archive.org/web/20251130033532/https://www.ebay.com/"]
-URL_LIST=["https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://bestbuy.com","https://macys.com","https://homedepot.com","https://costco.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.ansa.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
\ No newline at end of file
+URL_LIST_old=["https://www.amazon.com/s?k=magllioni&crid=CGD2UWO33O58&sprefix=magllioni%2Caps%2C209&ref=nb_sb_noss","https://web.archive.org/web/20251011214807/https://www.ilfattoquotidiano.it/","https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
+URL_LIST=["https://giove.isti.cnr.it/users/manca/eBay.html","http://www.amazon.it"]
\ No newline at end of file
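
Note on the config format: `URL_LIST` is stored as a JSON-style array, so consuming code can parse it with `json.loads` after reading the variable. A minimal sketch, assuming python-dotenv loads the file (the loading path shown here is illustrative, not the app's actual startup code):

```python
# Minimal sketch: read the .env entries above and parse URL_LIST as JSON.
import json
import os

from dotenv import load_dotenv  # assumes python-dotenv is installed

load_dotenv("UI/.env")

db_path = os.getenv("DB_PATH")
rest_server_url = os.getenv("WCAG_REST_SERVER_URL")
url_list = json.loads(os.getenv("URL_LIST", "[]"))  # JSON-style array in the file

print(db_path, rest_server_url, url_list)
```
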
diff --git a/UI/wcag_validator_ui.py b/UI/wcag_validator_ui.py
index edfb8f5..7cfacba 100644
--- a/UI/wcag_validator_ui.py
+++ b/UI/wcag_validator_ui.py
@@ -31,6 +31,45 @@ import sqlite3
WCAG_VALIDATOR_RESTSERVER_HEADERS = [("Content-Type", "application/json")]
+def process_dataframe(db_path, url, updated_df, user_state={}):
+
+ print("Processing dataframe to adjust columns...")
+ column_rating_name = "User Assessment for LLM Proposal"
+
+ # Get the assessment column
+ try:
+ updated_df[column_rating_name] = updated_df[column_rating_name].astype(int)
+ except ValueError:
+ return "Error: User Assessment for LLM Proposal must be an integer"
+
+ if (updated_df[column_rating_name] < 1).any() or (
+ updated_df[column_rating_name] > 5
+ ).any():
+ return "Error: User Assessment for LLM Proposal must be between 1 and 5"
+
+ dataframe_json = updated_df.to_json(orient="records")
+ connection_db = sqlite3.connect(db_path)
+ json_user_str = json.dumps({"username": user_state["username"]}, ensure_ascii=False)
+ try:
+        # insert only after validation so the stored timestamp matches the actual save
+ db_persistence_insert(
+ connection_db=connection_db,
+ insert_type="wcag_user_llm_alttext_assessments",
+ page_url=url,
+ user=json_user_str,
+ llm_model="",
+            json_in_str=dataframe_json,  # TODO: refine the stored payload structure
+ json_out_str="done via UI",
+ table="wcag_user_assessments",
+ )
+    except Exception as e:
+        print("Error inserting user assessment into database:", str(e))
+        return "Error: could not save user assessment (" + str(e) + ")"
+    finally:
+        if connection_db:
+            connection_db.close()
+    return "User assessment saved successfully!"
+
+
def load_images_from_json(json_input):
"""Extract URLs and alt text from JSON and create HTML gallery"""
try:
@@ -40,7 +79,7 @@ def load_images_from_json(json_input):
return "No images found in JSON", ""
images = data["images"]
- info_text = f"Found {len(images)} image(s)\n"
+ info_text = f"Found {len(images)} image(s)"
print(f"Found {len(data['images'])} image(s)")
# Create HTML gallery with checkboxes and assessment forms
@@ -58,14 +97,14 @@ def load_images_from_json(json_input):
padding: 10px;
background: white;
}
- .image-card:has(input:checked) {
+ .image-card:has(input[type="checkbox"]:checked) {
border-color: #2196F3;
background: #a7c1c1;
}
.image-card img {
width: 100%;
height: 200px;
- object-fit: cover;
+ object-fit: scale-down;
border-radius: 4px;
}
.image-info {
@@ -93,7 +132,7 @@ def load_images_from_json(json_input):
display: none;
margin-top: 15px;
padding: 10px;
- background: #f0f7ff;
+ background: #7896b9;
border-radius: 4px;
border: 1px solid #2196F3;
}
@@ -109,18 +148,22 @@ def load_images_from_json(json_input):
margin-bottom: 5px;
font-size: 13px;
}
- .range-container {
+
+ .radio-container {
+ display: flex;
+ gap: 15px;
+ align-items: center;
+ }
+
+ .radio-option {
display: flex;
align-items: center;
- gap: 10px;
+ gap: 5px;
+ cursor: pointer;
}
- .range-container input[type="range"] {
- flex: 1;
- }
- .range-value {
- font-weight: bold;
- min-width: 20px;
- text-align: center;
+
+ .radio-label {
+ font-weight: 500;
}
textarea {
width: 100%;
@@ -166,12 +209,28 @@ def load_images_from_json(json_input):
@@ -226,7 +285,7 @@ def load_llm_assessment_from_json(json_input):
{
"Original Alt Text": alt_text_original,
"LLM Assessment": original_alt_text_assessment,
- "Proposed Alt Text": new_alt_text,
+ "LLM Proposed Alt Text": new_alt_text,
}
)
@@ -257,7 +316,7 @@ def make_alttext_llm_assessment_api_call(
if not selected_images or len(selected_images) == 0:
info_text = "No images selected"
print(info_text)
- return pd.DataFrame()
+ return "LLM assessment not started", pd.DataFrame()
# prepare data for insertion
json_in_str = {}
@@ -267,6 +326,7 @@ def make_alttext_llm_assessment_api_call(
user_assessments = []
user_new_alt_texts = []
selected_image_id = []
+ user_assessments_llm_proposal = []
for img in selected_images:
selected_urls.append(img["image_url"])
selected_alt_text_original.append(img["original_alt_text"])
@@ -275,6 +335,7 @@ def make_alttext_llm_assessment_api_call(
selected_image_id.append(
int(img["image_index"]) + 1
) # add the id selected (+1 for index alignment)
+ user_assessments_llm_proposal.append(3) # default value for now
json_in_str["images_urls"] = selected_urls
json_in_str["images_alt_text_original"] = selected_alt_text_original
json_out_str["user_assessments"] = user_assessments
@@ -302,9 +363,17 @@ def make_alttext_llm_assessment_api_call(
)
# return response
info_dataframe = load_llm_assessment_from_json(response)
+
+    # add the UI ids and other fields to the api response
info_dataframe.insert(
0, "Image #", selected_image_id
) # add the UI ids from to api response
+ info_dataframe.insert(2, "User Assessment", user_assessments)
+
+ info_dataframe.insert(3, "User Proposed Alt Text", user_new_alt_texts)
+ info_dataframe["User Assessment for LLM Proposal"] = (
+ user_assessments_llm_proposal
+ )
except Exception as e:
return {"error": str(e)}
@@ -326,7 +395,7 @@ def make_alttext_llm_assessment_api_call(
finally:
if connection_db:
connection_db.close()
- return info_dataframe
+ return "LLM assessment completed", info_dataframe
def make_image_extraction_api_call(
@@ -449,9 +518,10 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
images_number = gr.Slider(
5,
100,
- value=30,
+ value=50,
step=5,
label="Max number of images to retrieve",
+ visible=False,
)
with gr.Column():
@@ -459,39 +529,54 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
"Extract Images & Alt Texts", variant="primary"
)
alttext_api_call_btn = gr.Button(
- "Alt Text LLM Assessment",
+ "Start LLM Assessment",
variant="secondary",
interactive=False,
)
+ image_info_output = gr.Textbox(
+ label="Activity tracking", lines=1
+ )
- with gr.Row():
-
- image_info_output = gr.Textbox(label="Managed Images", lines=5)
+ with gr.Row(visible=False) as alttext_results_row:
# Use DataFrame for tabular output
alttext_info_output = gr.DataFrame(
headers=[
"Image #",
"Original Alt Text",
+ "User Assessment",
+ "User Proposed Alt Text",
"LLM Assessment",
- "Proposed Alt Text",
+ "LLM Proposed Alt Text",
+ "User Assessment for LLM Proposal",
],
label="LLM Assessment Results",
wrap=True, # Wrap text in cells
- interactive=False,
+ interactive=True,
+ scale=7,
)
+ with gr.Column():
+ save_user_assessment_btn = gr.Button(
+ "Save Your Assessment",
+ variant="secondary",
+ interactive=True,
+ scale=1,
+ )
+ gr.Markdown(
+ "ℹ Info: to assess the LLM output, only the values for the 'User Assessment for LLM Proposal' column need to be changed."
+ )
with gr.Row():
gallery_html = gr.HTML(label="Image Gallery")
image_extraction_api_call_btn.click(
- fn=lambda: ("", "", pd.DataFrame(), gr.Button(interactive=False)),
+ fn=lambda: ("", "", gr.update(visible=False), gr.Button(interactive=False)),
inputs=[],
outputs=[
image_info_output,
gallery_html,
- alttext_info_output,
+ alttext_results_row,
alttext_api_call_btn,
],
).then(
@@ -515,7 +600,7 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
wcag_rest_server_url_state,
user_state,
],
- outputs=[alttext_info_output],
+ outputs=[image_info_output, alttext_info_output],
js="""
(url_input,gallery_html) => {
const checkboxes = document.querySelectorAll('.image-checkbox:checked');
@@ -533,7 +618,8 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
const index = checkbox.dataset.index;
const imageUrl = checkbox.dataset.imgurl;
const originalAlt = document.querySelector('.original-alt[data-index="' + index + '"]').value;
- const assessment = document.querySelector('.assessment-range[data-index="' + index + '"]').value;
+ const assessment = document.querySelector('input[name="assessment-' + index + '"]:checked').value;
+ console.log("assessment:",assessment)
const newAltText = document.querySelector('.new-alt-text[data-index="' + index + '"]').value;
selectedData.push({
@@ -548,6 +634,16 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
return [url_input,JSON.stringify(selectedData)];
}
""",
+ ).then(
+ fn=lambda: gr.update(visible=True),
+ inputs=[],
+ outputs=[alttext_results_row],
+ )
+
+ save_user_assessment_btn.click(
+ fn=process_dataframe,
+ inputs=[db_path_state, url_input, alttext_info_output, user_state],
+ outputs=[image_info_output],
)
# placed here at the end to give full contents visibility to events
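
For context, the new `process_dataframe` handler rejects out-of-range ratings before touching the database. A standalone sketch of just that validation step, with illustrative sample data and `db_persistence_insert` omitted:

```python
# Sketch of the rating validation performed by process_dataframe.
# The sample frame is illustrative; the column name matches the UI code.
import pandas as pd

column_rating_name = "User Assessment for LLM Proposal"
df = pd.DataFrame({column_rating_name: ["4", "5", "2"]})

try:
    df[column_rating_name] = df[column_rating_name].astype(int)
except ValueError:
    print("Error: User Assessment for LLM Proposal must be an integer")
else:
    if (df[column_rating_name] < 1).any() or (df[column_rating_name] > 5).any():
        print("Error: User Assessment for LLM Proposal must be between 1 and 5")
    else:
        # This JSON string is what the handler persists via db_persistence_insert.
        print(df.to_json(orient="records"))
```
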
diff --git a/dependences/image_extractor.py b/dependences/image_extractor.py
index 357e41f..eaff27f 100644
--- a/dependences/image_extractor.py
+++ b/dependences/image_extractor.py
@@ -55,7 +55,6 @@ class ImageExtractor:
# Also check query parameters (e.g., format=jpeg)
return any(fmt in img_url.lower() for fmt in self.SUPPORTED_FORMATS)
-
async def _download_image(self, image_url, output_dir="images") -> None:
# Parse the URL to get the path without query parameters
@@ -79,7 +78,7 @@ class ImageExtractor:
# Sanitize image name (remove special characters, limit length)
image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
- image_name = image_name[:200] # Limit filename length
+ image_name = image_name[:50] # Limit filename length
# If name is empty after sanitization, create a hash-based name
if not image_name:
@@ -88,13 +87,15 @@ class ImageExtractor:
image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
# Download the image
- print("getting image:", image_url)
+ print("getting image url:", image_url)
+ print("getting image name:", image_name)
response = requests.get(image_url, timeout=10)
response.raise_for_status()
try:
# Save the image
output_path = os.path.join(output_dir, f"{image_name}.{ext}")
+ print("saving image to:", output_path)
with open(output_path, "wb") as f:
f.write(response.content)
print(f"Saved: {output_path}")
@@ -292,43 +293,36 @@ class ImageExtractor:
error_msg = f"Error extracting context: {str(e)}"
return error_msg, error_msg, error_msg
- async def _get_page_metadata(self, page) -> Dict[str, Optional[str]]:
- """Extract page metadata including title, description, and keywords."""
- metadata = {
- "title": await page.title(),
- "description": None,
- "keywords": None,
- "headings": [],
- }
+ async def _get_page_metadata(self, page):
+ """Extract page metadata in one fast evaluate call. Batch DOM extraction inside one evaluate()."""
+ return await page.evaluate(
+ """
+ () => {
+ const metadata = {
+ title: document.title || null,
+ description: null,
+ keywords: null,
+ headings: []
+ };
- # Extract meta description
- try:
- description = await page.locator('meta[name="description"]').get_attribute(
- "content"
- )
- metadata["description"] = description
- except:
- pass
+ const desc = document.querySelector('meta[name="description"]');
+ const keys = document.querySelector('meta[name="keywords"]');
+ metadata.description = desc?.content || null;
+ metadata.keywords = keys?.content || null;
- # Extract meta keywords
- try:
- keywords = await page.locator('meta[name="keywords"]').get_attribute(
- "content"
- )
- metadata["keywords"] = keywords
- except:
- pass
+ // Collect all headings h1–h6
+ const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
+ metadata.headings = Array.from(allHeadings)
+ .map(h => ({
+ level: parseInt(h.tagName.substring(1), 10),
+ text: h.textContent.trim()
+ }))
+ .filter(h => h.text.length > 0);
- # Extract all headings (h1-h6)
- for level in range(1, 7):
- headings = await page.locator(f"h{level}").all_text_contents()
- for heading in headings:
- if heading.strip():
- metadata["headings"].append(
- {"level": level, "text": heading.strip()}
- )
-
- return metadata
+ return metadata;
+ }
+ """
+ )
async def extract_images(
self, extract_context=True, specific_images_urls=[]
@@ -344,15 +338,18 @@ class ImageExtractor:
page = await browser.new_page()
try:
- #await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads
- # The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
+            # await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and you need to ensure everything loads
+ # The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
# ---alternative method2: use if there is total awareness of the page's loading pattern and want faster, more reliable execution
- await page.goto(self.url, timeout=50000, wait_until="load")# deafult timeout=30000, 30sec
+ await page.goto(
+ self.url, timeout=50000, wait_until="load"
+            )  # default timeout=30000 (30 s)
# Wait for page to load completely
await page.wait_for_timeout(2000) # Wait for dynamic content
# -----
if extract_context:
+ print("Getting page metadata...")
# Get page metadata once
page_metadata = await self._get_page_metadata(page)
page_title = page_metadata["title"]
@@ -367,15 +364,41 @@ class ImageExtractor:
if len(specific_images_urls) == 0:
# Find all img elements
- print("Extracting all images from the page",self.url)
- img_elements = await page.locator("img").all()
+ print("Extracting all images from the page", self.url)
+ # img_elements = await page.locator("img").all()
else:
print(
"Extracting specific images from the page:",
self.url,
specific_images_urls,
)
- img_elements = []
+ # img_elements = await page.locator("img").all()
+
+ """ # method 3: optimized approach
+ # Get all src attributes in one go
+ all_img_elements = await page.locator("img").all()
+ all_srcs = await page.locator("img").evaluate_all(
+ "elements => elements.map(el => el.src || '')"
+ )
+
+ # Filter with the pre-fetched src values
+ img_elements = [
+ elem for elem, src in zip(all_img_elements, all_srcs)
+ if src in specific_images_urls
+ ]
+ """
+
+ """ #method 2: single pass to find matching images
+ for img_element in all_img_elements: #This is more efficient than making separate locator queries for each specific URL and avoids timeout issues.
+ try:
+ src = await img_element.get_attribute("src")
+ print("found image src:", src)
+ if src in specific_images_urls:
+ img_elements.append(img_element)
+ except Exception as e:
+ print(f"Error getting src attribute from image: {str(e)}")"""
+
+ """ # method 1: separate locator queries for each specific URL
for url in specific_images_urls:
try:
img_element = await page.locator(
@@ -384,8 +407,11 @@ class ImageExtractor:
if img_element:
img_elements.append(img_element)
except Exception as e:
- print(f"Error locating image with src {url}: {str(e)}")
+ print(f"Error locating image with src {url}: {str(e)}")"""
+ img_elements = await page.locator(
+ "img"
+            ).all()  # unified approach: start with all images, filter against the specific list later
image_source_list = [] # avoid multiple check for the same image url
images_data = []
@@ -404,6 +430,12 @@ class ImageExtractor:
if not src:
print("image has no src attribute. Skipped.")
continue
+ if (
+ src not in specific_images_urls
+ and len(specific_images_urls) > 0
+ ):
+ # print("image src",src,"not in the specific images list. Skipped.")
+ continue
if src not in image_source_list:
image_source_list.append(src)
@@ -434,6 +466,7 @@ class ImageExtractor:
alt_text = await img.get_attribute("alt") or ""
if extract_context:
+ print("Extracting context for image:", img_url)
# Get surrounding HTML context (full, immediate, and nearby)
html_context, immediate_context, nearby_text = (
await self._get_element_context(page, img)
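
The `_get_page_metadata` rewrite replaces several Playwright round trips (title, two meta lookups, six heading queries) with one `page.evaluate()` call. A minimal standalone sketch of the same pattern, with an illustrative target URL:

```python
# Sketch of the single-evaluate metadata pattern used in _get_page_metadata.
# Requires: pip install playwright && playwright install chromium
import asyncio

from playwright.async_api import async_playwright


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto("https://example.com", wait_until="load")  # illustrative URL
        # One browser round trip: title, meta description, headings.
        metadata = await page.evaluate(
            """() => ({
                title: document.title || null,
                description: document.querySelector('meta[name="description"]')?.content || null,
                headings: Array.from(document.querySelectorAll('h1,h2,h3,h4,h5,h6'))
                    .map(h => h.textContent.trim()).filter(t => t.length > 0)
            })"""
        )
        print(metadata)
        await browser.close()


asyncio.run(main())
```
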
diff --git a/dependences/mllm_management.py b/dependences/mllm_management.py
index 1aa4f2f..5aec00b 100644
--- a/dependences/mllm_management.py
+++ b/dependences/mllm_management.py
@@ -76,39 +76,7 @@ class MLLMManager:
return payload
def get_alt_text_system_prompt(self):
- system_prompt_old = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
- images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
- the same information as the image, and should be able to substitute for the non-text content. The text alternative would
- be brief but as informative as possible.
-
- Follow these instructions carefully:
- 1. You will be provided as input with the following:
- - The image found on the webpage.
- - The associated alternative text. When the alt-text is empty or absent, you will be explicitly informed.
- - The surrounding context of the image.
- - The page title, headings and the content of the “keywords” and “description” tag, if found.
-
- 2. Determine the function and purpose of the image by analyzing these elements. Take into account the purpose and function
- of the associated image by considering the page context. Check also if the image is, or is associated with, a link or a button,
- and consider this in your judgement. If the image contains text use that as part of the context.
-
- 3. Provide a final assessment based on the following:
- - 'success' if you can assess with 'sufficient certainty' the alt-text is appropriate in relation to the image purpose,
- - 'failure' if you can assess with 'sufficient certainty' that the alt-text is NOT appropriate,
- - 'warning' if you cannot determine with 'sufficient certainty'.
- where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80
-
- 4. The original alt-text assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only.
-
- 5. Provide a brief reasoning for your judgment. If the image contains text, write it verbatim. Your response should be in English.
-
- 6. Keep your response within 150 words.
-
- 7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
-
- 8. Here is the JSON format the results must have:
- {"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
-
+
# https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
@@ -122,7 +90,7 @@ class MLLMManager:
What purpose does it fulfill?
If I could not use the image content, what words would I use to convey the same function and/or information?
- When image content contains words that are important to understanding the content, the alt text should include those words
+ When image content contains words that are important to understanding the content, the alt text should include those words.
Follow these instructions carefully:
1. You will be provided as input with the following:
@@ -147,7 +115,7 @@ class MLLMManager:
6. Keep your response within 150 words.
- 7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
+ 7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. Use the same language as the original alt-text.
8. Here is the JSON format the results must have:
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
@@ -181,7 +149,7 @@ class MLLMManager:
print("Using end_point:", self.end_point)
alt_text_system_prompt = self.get_alt_text_system_prompt()
- print("alt_text_system_prompt:", alt_text_system_prompt)
+ #print("alt_text_system_prompt:", alt_text_system_prompt)
mllm_responses = []
for img_info in images:
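
The system prompt pins the model to a fixed JSON schema, so a reply can be consumed with plain `json.loads`. A hedged sketch of that consumption step (the sample reply is fabricated for illustration; real model output may also need guards against malformed JSON):

```python
# Sketch: parse a reply in the JSON format the system prompt requests.
import json

reply = (
    '{"Original alt-text assessment": "2", "Assessment": "failure", '
    '"EvaluationResult": "The alt text does not describe the product image.", '
    '"New alt-text": "Red ceramic coffee mug on a wooden table"}'
)  # fabricated example, not real model output

parsed = json.loads(reply)
assessment = parsed["Assessment"]  # "success" / "failure" / "warning"
rating = int(parsed["Original alt-text assessment"])  # 1..5 scale from the prompt
print(assessment, rating, parsed["New alt-text"])
```
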
diff --git a/restserver/routers/routes_extract_images.py b/restserver/routers/routes_extract_images.py
index e173667..914948b 100644
--- a/restserver/routers/routes_extract_images.py
+++ b/restserver/routers/routes_extract_images.py
@@ -46,6 +46,7 @@ class ExtractImagesRoutes:
self, request: Request, data: ExtractImages
) -> JSONResponse:
"""Return the alt text validation assessment based on WCAG guidelines"""
+ print("Received extract images request.")
try:
json_content = json.loads(data.model_dump_json())
diff --git a/restserver/routers/routes_wcag_alttext.py b/restserver/routers/routes_wcag_alttext.py
index 15afa9e..ac8633f 100644
--- a/restserver/routers/routes_wcag_alttext.py
+++ b/restserver/routers/routes_wcag_alttext.py
@@ -53,6 +53,7 @@ class WCAGAltTextValuationRoutes:
) -> JSONResponse:
"""Return the alt text validation assessment based on WCAG guidelines"""
try:
+ print("Received wcag alttext validation request.")
json_content = json.loads(data.model_dump_json())
mllm_model_id = self.mllm_settings["mllm_model_id"]
@@ -67,7 +68,12 @@ class WCAGAltTextValuationRoutes:
.replace(":", "")
.replace("//", "_")
.replace("/", "_")
+ .replace("%2", "_")
+ .replace("?", "_")
+ .replace("=", "_")
+ .replace("&", "_")
)
+        url_path = url_path[:50]  # limit length
now = datetime.now(timezone.utc)
now_str = now.strftime("%Y_%m_%d-%H_%M_%S")
folder_str = mllm_model_id.replace(":", "-") + "_" + now_str
@@ -93,7 +99,7 @@ class WCAGAltTextValuationRoutes:
# Extract images
logging.info(f"Extracting images from: {json_content['page_url']}")
images = await image_extractor.extract_images(
- specific_images_urls=json_content["specific_images_urls"]
+            specific_images_urls=json_content["specific_images_urls"], extract_context=True
)
# MLLM settings
mllm_end_point = self.mllm_settings["mllm_end_point"]
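
The extended replace chain plus the new 50-character cap turns arbitrary page URLs (including query strings) into filesystem-safe folder names. A quick illustration of the same logic on an arbitrary sample URL:

```python
# Illustration of the url_path sanitization above (sample URL is arbitrary).
page_url = "https://www.example.com/search?q=mugs&page=2"

url_path = (
    page_url.replace(".", "_")
    .replace(":", "")
    .replace("//", "_")
    .replace("/", "_")
    .replace("%2", "_")
    .replace("?", "_")
    .replace("=", "_")
    .replace("&", "_")
)
url_path = url_path[:50]  # limit length, as in the route
print(url_path)  # https_www_example_com_search_q_mugs_page_2
```
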
diff --git a/scripts/build_dataset_from_folder.py b/scripts/build_dataset_from_folder.py
new file mode 100644
index 0000000..4839c8b
--- /dev/null
+++ b/scripts/build_dataset_from_folder.py
@@ -0,0 +1,541 @@
+# to launch: python build_dataset_from_folder.py --ref_path "" --push_to_hub --repo_id "nicolaleo/LLM-alt-text-assessment" --token ""
+
+from datasets import Dataset, DatasetDict
+import datasets
+import json
+from pathlib import Path
+from PIL import Image
+import hashlib
+import urllib.parse
+import argparse
+
+
+'''
+# Dataset metadata
+_DESCRIPTION = """\
+Dataset for image alt-text assessment and improvement using MLLM responses.
+Contains images, original alt-texts, quality assessments, and improved versions.
+"""
+
+_CITATION = """\
+@misc{alt_text_assessment,
+ title={Alt-Text Assessment Dataset},
+ year={2024}
+}
+"""
+
+
+class AltTextDataset(datasets.GeneratorBasedBuilder):
+ """Dataset for alt-text assessment with images and MLLM responses."""
+
+ VERSION = datasets.Version("1.0.0")
+
+ def _info(self):
+ return datasets.DatasetInfo(
+ description=_DESCRIPTION,
+ features=datasets.Features({
+ "image": datasets.Image(),
+ "image_url": datasets.Value("string"),
+ "alt_text": datasets.Value("string"),
+ "original_alt_text_assessment": datasets.Value("string"),
+ "assessment": datasets.Value("string"),
+ "evaluation_result": datasets.Value("string"),
+ "new_alt_text": datasets.Value("string"),
+ #"source_folder": datasets.Value("string"),
+ }),
+ citation=_CITATION,
+ )
+
+ def _split_generators(self, dl_manager):
+ """Define data splits."""
+ return [
+ datasets.SplitGenerator(
+ name=datasets.Split.TRAIN,
+ gen_kwargs={
+ "json_filepath": "data.json",
+ "images_dir": "images"
+ },
+ ),
+ ]
+
+ def _generate_examples(self, json_filepath, images_dir):
+ """Generate examples from JSON file and image directory."""
+ with open(json_filepath, encoding="utf-8") as f:
+ data = json.load(f)
+
+ images_path = Path(images_dir)
+
+ for idx, entry in enumerate(data):
+ image_url = entry["image_url"]
+ image_filename = url_to_filename(image_url)
+ image_path = images_path / image_filename
+
+ # Load image if exists, otherwise None
+ image = str(image_path) if image_path.exists() else None
+
+ yield idx, {
+ "image": image,
+ "image_url": image_url,
+ "alt_text": entry["alt_text"],
+ "original_alt_text_assessment": entry["mllm_response"]["original_alt_text_assessment"],
+ "assessment": entry["mllm_response"]["assessment"],
+ "evaluation_result": entry["mllm_response"]["evaluation_result"],
+ "new_alt_text": entry["mllm_response"]["new_alt_text"],
+ }
+
+'''
+# ============================================================================
+# SIMPLE USAGE FUNCTIONS
+# ============================================================================
+
+
+def url_to_filename(image_url):  # mirrors the save step in the image_extractor dependency
+ """
+    Convert an image URL to a sanitized filename, mirroring image_extractor's save logic.
+
+ Args:
+ image_url: The image URL
+
+ Returns:
+ Sanitized filename with extension
+ """
+
+ # Parse the URL to get the path without query parameters
+ parsed_url = urllib.parse.urlparse(image_url)
+ url_path = parsed_url.path
+
+ # Get the filename from the path
+ filename = url_path.split("/")[-1]
+ print(f"Original filename: '{filename}'")
+
+ # Split filename and extension
+ if "." in filename:
+ image_name, ext = filename.rsplit(".", 1)
+ ext = ext.lower()
+ else:
+ image_name = filename
+ ext = "jpg"
+
+ # Validate extension
+ if ext not in ["jpg", "jpeg", "png", "gif", "webp"]:
+ ext = "jpg"
+
+ # Sanitize image name (remove special characters, limit length)
+ image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
+
+ image_name = image_name[:50] # Limit filename length
+
+ # If name is empty after sanitization, create a hash-based name
+ if not image_name:
+ image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
+
+ return f"{image_name}.{ext}"
+
+
+def push_to_hub_example(dataset_path="alt_text_merged_dataset", repo_id="", token=None):
+ """
+ Example of how to push dataset to Hugging Face Hub.
+ You need to authenticate first!
+ """
+ from huggingface_hub import login
+
+ print("\n=== Pushing Dataset to Hugging Face Hub ===")
+ # Method 1: Login interactively (will prompt for token)
+ # login()
+
+ # Method 2: Login with token directly
+ login(token=token)
+
+ # Method 3: Set token as environment variable
+ # export HF_TOKEN="hf_YourTokenHere"
+ # Then login() will use it automatically
+
+ # Load your dataset
+ ds = load_dataset_from_disk(dataset_path)
+
+ # Combine into DatasetDict
+ ds = DatasetDict(
+ {
+ "train": ds,
+ # #"test": test_dataset
+ }
+ )
+
+ # Push to hub (creates repo if it doesn't exist)
+ ds.push_to_hub( # Automatically converts to Parquet when uploading to Hub
+ repo_id, # Replace with your username
+ private=False, # Set True for private dataset
+ )
+
+ print("Dataset pushed successfully!")
+ print(f"View at: https://huggingface.co/datasets/{repo_id}")
+
+
+def create_dataset_from_json(json_filepath, json_filepath_images, images_dir="images"):
+ """
+ Create a Hugging Face Dataset from JSON file with local images.
+
+ Args:
+        json_filepath: Path to the MLLM assessments JSON file
+        json_filepath_images: Path to the extracted-images JSON file (provides page_url and html_context)
+        images_dir: Directory containing the images (default: "images")
+
+ Returns:
+ datasets.Dataset object with images loaded
+ """
+ with open(json_filepath, "r", encoding="utf-8") as f:
+ data = json.load(f)
+
+ with open(json_filepath_images, "r", encoding="utf-8") as f:
+ data_images = json.load(f)
+
+ images_path = Path(images_dir)
+
+ # Flatten the nested structure and load images
+ flattened_data = {
+ "image": [],
+ "image_url": [],
+ "alt_text": [],
+ "original_alt_text_assessment": [],
+ "assessment": [],
+ "evaluation_result": [],
+ "new_alt_text": [],
+ "page_url": [],
+ "html_context": [],
+ }
+
+ count_entry = 0
+ for entry in data:
+ if (
+ entry["mllm_response"]["original_alt_text_assessment"] is None
+ ): # important! skip entries with no MLLM response. not usable data
+ print(
+ f"Skipping entry with image URL: {entry['image_url']} due to missing MLLM response"
+ )
+ count_entry += 1
+ continue # Skip entries with no MLLM response
+ image_url = entry["image_url"]
+ image_filename = url_to_filename(image_url)
+ image_path = images_path / image_filename
+
+ # Load image if it exists
+ if image_path.exists():
+ img = Image.open(image_path)
+ flattened_data["image"].append(img)
+ else:
+ print(f"Warning: Image not found: {image_path}")
+ flattened_data["image"].append(None)
+
+ flattened_data["image_url"].append(image_url)
+ flattened_data["alt_text"].append(entry["alt_text"])
+ flattened_data["original_alt_text_assessment"].append(
+ str(entry["mllm_response"]["original_alt_text_assessment"])
+ )
+ flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
+ flattened_data["evaluation_result"].append(
+ entry["mllm_response"]["evaluation_result"]
+ )
+ flattened_data["new_alt_text"].append(entry["mllm_response"]["new_alt_text"])
+ flattened_data["page_url"].append(data_images[count_entry]["page_url"])
+ flattened_data["html_context"].append(data_images[count_entry]["html_context"])
+
+ count_entry += 1
+
+ print(f"Total valid entries loaded: {len(flattened_data['image_url'])}")
+ return datasets.Dataset.from_dict(flattened_data)
+
+
+def create_dataset_from_folders(
+ ref_path,
+ json_filename="mllm_alttext_assessments.json",
+ json_filename_images="extracted_images.json",
+ images_dirname="images",
+):
+ """
+ Create a merged dataset from multiple folders under ref_path.
+ Each folder should contain a JSON file and an images subdirectory.
+
+ Args:
+ ref_path: Root path containing multiple folders
+        json_filename: Name of the assessments JSON file in each folder (default: "mllm_alttext_assessments.json")
+        json_filename_images: Name of the extracted-images JSON file in each folder (default: "extracted_images.json")
+        images_dirname: Name of images subdirectory (default: "images")
+
+ Returns:
+ datasets.Dataset object with all entries merged
+ """
+ ref_path = Path(ref_path)
+ all_datasets = []
+
+ # Find all subdirectories containing the JSON file
+ folders_processed = 0
+
+ for folder in ref_path.iterdir():
+ if not folder.is_dir():
+ continue
+
+ json_path = folder / json_filename
+ json_path_images = folder / json_filename_images
+ images_path = folder / images_dirname
+
+ # Check if both JSON and images directory exist
+ if not json_path.exists():
+ print(f"Skipping {folder.name}: no {json_filename} found")
+ continue
+
+ if not json_path_images.exists():
+ print(f"Skipping {folder.name}: no {json_filename_images} found")
+ continue
+
+ if not images_path.exists():
+ print(f"Warning: {folder.name}: images directory not found")
+ # continue
+ # Continue anyway, images might be optional (from urls only)
+
+ print(f"Processing folder: {folder.name}")
+
+ try:
+ # Create dataset for this folder
+ ds = create_dataset_from_json(
+ str(json_path), str(json_path_images), str(images_path)
+ )
+ all_datasets.append(ds)
+
+ folders_processed += 1
+ print(f" -> Loaded {len(ds)} entries")
+ except Exception as e:
+ print(f"Error processing {folder.name}: {e}")
+ continue
+
+ if not all_datasets:
+ raise ValueError(f"No valid folders found in {ref_path}")
+
+ # Merge all datasets
+ print(f"\n=== Merging {folders_processed} folders ===")
+ merged_dataset = datasets.concatenate_datasets(all_datasets)
+ print(f"Total entries: {len(merged_dataset)}")
+
+ return merged_dataset
+
+
+def verify_images(json_filepath, images_dir="images"):
+ """
+ Verify that all images referenced in JSON exist in the images directory.
+
+ Args:
+ json_filepath: Path to JSON file
+ images_dir: Directory containing images
+
+ Returns:
+ Dict with 'found', 'missing', and 'details' keys
+ """
+ with open(json_filepath, "r", encoding="utf-8") as f:
+ data = json.load(f)
+
+ images_path = Path(images_dir)
+
+ found = []
+ missing = []
+
+ for entry in data:
+ image_url = entry["image_url"]
+ image_filename = url_to_filename(image_url)
+ image_path = images_path / image_filename
+ print(
+ "image_url:",
+ image_url,
+ "image_filename:",
+ image_filename,
+ "image_path:",
+ image_path,
+ )
+
+ if image_path.exists():
+ found.append(
+ {"url": image_url, "filename": image_filename, "path": str(image_path)}
+ )
+ else:
+ missing.append(
+ {
+ "url": image_url,
+ "filename": image_filename,
+ "expected_path": str(image_path),
+ }
+ )
+
+ return {
+ "found": len(found),
+ "missing": len(missing),
+ "total": len(data),
+ "details": {"found_images": found, "missing_images": missing},
+ }
+
+
+def verify_images_in_folders(
+ ref_path, json_filename="mllm_alttext_assessments.json", images_dirname="images"
+):
+ """
+ Verify images across all folders under ref_path.
+
+ Args:
+ ref_path: Root path containing multiple folders
+ json_filename: Name of JSON file in each folder
+ images_dirname: Name of images subdirectory
+
+ Returns:
+ Dict with aggregated verification results
+ """
+ ref_path = Path(ref_path)
+ total_found = 0
+ total_missing = 0
+ total_entries = 0
+ folder_results = {}
+
+ for folder in ref_path.iterdir():
+ if not folder.is_dir():
+ continue
+
+ json_path = folder / json_filename
+ images_path = folder / images_dirname
+
+ if not json_path.exists():
+ continue
+
+ print(f"Verifying folder: {folder.name}")
+
+ try:
+ verification = verify_images(str(json_path), str(images_path))
+ folder_results[folder.name] = verification
+
+ total_found += verification["found"]
+ total_missing += verification["missing"]
+ total_entries += verification["total"]
+
+ print(f" Found: {verification['found']}/{verification['total']}")
+
+ except Exception as e:
+ print(f" Error: {e}")
+ continue
+
+ return {
+ "found": total_found,
+ "missing": total_missing,
+ "total": total_entries,
+ "folders": folder_results,
+ }
+
+
+def save_dataset(dataset, output_path):
+ """Save dataset in Arrow format (includes images)."""
+ dataset.save_to_disk(output_path)
+ # print(f"Dataset saved to {output_path}")
+
+ # Or save as JSON
+ # dataset.to_json(f"{output_path}/data.json")
+
+ # Or save as CSV
+ # dataset.to_csv(f"{output_path}/data.csv")
+
+ # Or save as Parquet
+ # dataset.to_parquet(f"{output_path}/data.parquet")
+
+
+def load_dataset_from_disk(dataset_path):
+ """Load a previously saved dataset."""
+ return datasets.load_from_disk(dataset_path)
+
+
+# ============================================================================
+# EXAMPLE USAGE
+# ============================================================================
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument(
+ "--ref_path",
+ type=str,
+ help=("Root path containing multiple folders"),
+ default="",
+ )
+
+ parser.add_argument(
+ "--push_to_hub",
+ action="store_true",
+ default=False,
+ help=("If True push the merged dataset to Hugging Face Hub"),
+ )
+ parser.add_argument(
+ "--token",
+ type=str,
+ help=("Hugging Face authentication token"),
+ default="",
+ )
+ parser.add_argument(
+ "--repo_id",
+ type=str,
+ help=("Hugging Face repository ID"),
+ default="nicolaleo/LLM-alt-text-assessment",
+ )
+ args = parser.parse_args()
+
+ # Example 1: Verify images across all folders
+ print("=== Verifying Images in All Folders ===")
+ verification = verify_images_in_folders(args.ref_path)
+ print("\n######## Verifier output ################################")
+ print(f"Total Found: {verification['found']}/{verification['total']}")
+ print(f"Total Missing: {verification['missing']}/{verification['total']}")
+ print("########################################")
+
+ # Show per-folder breakdown
+ print("\n=== Per-Folder Breakdown ===")
+ for folder_name, results in verification["folders"].items():
+ print(f"{folder_name}: {results['found']}/{results['total']} images found")
+
+ # Example 2: Create merged dataset from all folders
+ print("\n=== Creating Merged Dataset ===")
+ ds = create_dataset_from_folders(args.ref_path)
+ print("\n######## Merged Dataset output ################################")
+ print(f"Final dataset size: {len(ds)} entries")
+ print("########################################")
+
+ # Example 3: Analyze the merged dataset
+ print("\n=== Dataset Analysis ===")
+ print(ds)
+
+    # Example 4: Access images and data
+ print("\n=== First Example ===")
+ first_example = ds[0]
+ print(f"Image URL: {first_example['image_url']}")
+ print(f"Alt text: {first_example['alt_text']}")
+ print(f"Assessment: {first_example['assessment']}")
+ print(f"New alt text: {first_example['new_alt_text']}")
+ print(f"Image loaded: {first_example['image'] is not None}")
+
+ if first_example["image"] is not None:
+ img = first_example["image"]
+ print(f"Image size: {img.size}")
+ # img.show() # Uncomment to display image
+
+    # Example 5: Filter and work with merged data
+ print("\n=== Filtering Merged Dataset ===")
+ successful = ds.filter(lambda x: x["assessment"] == "success")
+ print(f"Successful assessments: {len(successful)}")
+
+ high_rated = ds.filter(lambda x: int(x["original_alt_text_assessment"]) >= 4)
+ print(f"High-rated (>=4): {len(high_rated)}")
+
+    # Example 6: Save merged dataset
+ print("\n=== Saving Merged Dataset ===")
+ save_dataset(ds, "alt_text_merged_dataset")
+
+    # Example 7: Load dataset
+ print("\n=== Loading Dataset ===")
+ loaded_ds = load_dataset_from_disk("alt_text_merged_dataset")
+ print(f"Loaded {len(loaded_ds)} entries")
+
+ if args.push_to_hub:
+ # Push to Hugging Face Hub (optional)
+        push_to_hub_example(repo_id=args.repo_id, token=args.token)  # see function definition above for details
\ No newline at end of file
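
Since the dataset builder locates each downloaded image by reconstructing the extractor's filename, `url_to_filename` is the piece that must stay in sync with `image_extractor.py`. A quick behavioral check, assuming the script's dependencies (datasets, Pillow) are installed so the module imports cleanly; the sample URL is illustrative:

```python
# Quick check of url_to_filename from build_dataset_from_folder.py.
from build_dataset_from_folder import url_to_filename

url = "https://cdn.example.com/img/products/mug-01.PNG?w=400&fmt=jpeg"
# Query string dropped, extension lowercased, name sanitized.
print(url_to_filename(url))  # -> mug-01.png
```
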
diff --git a/scripts/requirements_extra.txt b/scripts/requirements_extra.txt
index 574a36a..d27bdd4 100644
--- a/scripts/requirements_extra.txt
+++ b/scripts/requirements_extra.txt
@@ -5,4 +5,5 @@ transformers==4.57.1
numpy==2.2.6
matplotlib==3.10.7
scikit-learn==1.7.2
-sentence-transformers==5.1.2
\ No newline at end of file
+sentence-transformers==5.1.2
+datasets==4.4.1
\ No newline at end of file