upgrade e costruzione datasetHF
This commit is contained in:
parent
cde7259ed7
commit
85c03b3a1a
3
UI/.env
3
UI/.env
|
|
@ -1,4 +1,5 @@
|
|||
DB_PATH=persistence/wcag_validator_ui.db
|
||||
WCAG_REST_SERVER_URL=http://localhost:8000
|
||||
URL_LIST_old=["http://www.amazon.it","https://web.archive.org/web/20230630235957/http://www.amazon.com/", "https://web.archive.org/web/20251130033532/https://www.ebay.com/"]
|
||||
URL_LIST=["https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://bestbuy.com","https://macys.com","https://homedepot.com","https://costco.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.ansa.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
|
||||
URL_LIST_old=["https://www.amazon.com/s?k=magllioni&crid=CGD2UWO33O58&sprefix=magllioni%2Caps%2C209&ref=nb_sb_noss","https://web.archive.org/web/20251011214807/https://www.ilfattoquotidiano.it/","https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
|
||||
URL_LIST=["https://giove.isti.cnr.it/users/manca/eBay.html","http://www.amazon.it"]
|
||||
|
|
@ -31,6 +31,45 @@ import sqlite3
|
|||
WCAG_VALIDATOR_RESTSERVER_HEADERS = [("Content-Type", "application/json")]
|
||||
|
||||
|
||||
def process_dataframe(db_path, url, updated_df, user_state=None):
    """Validate the user's ratings of the LLM proposals and persist them.

    The "User Assessment for LLM Proposal" column of ``updated_df`` must hold
    integer values between 1 and 5. When validation passes, the whole table is
    serialised to JSON (one record per row) and stored through
    ``db_persistence_insert`` in the ``wcag_user_assessments`` table.

    Args:
        db_path: Path of the SQLite database file.
        url: URL of the page the assessed images belong to.
        updated_df: pandas DataFrame edited by the user in the UI.
        user_state: Optional dict describing the current user; only its
            "username" entry is read. A missing dict or missing key is stored
            as a null username instead of raising.

    Returns:
        A status message for the UI: a success message, or a string starting
        with "Error" when validation or the database insert fails.
    """
    from contextlib import closing

    print("Processing dataframe to adjust columns...")
    column_rating_name = "User Assessment for LLM Proposal"

    if column_rating_name not in updated_df.columns:
        return "Error: User Assessment for LLM Proposal column is missing"

    # Get the assessment column; non-numeric text raises ValueError, while
    # None/object cells raise TypeError.
    try:
        ratings = updated_df[column_rating_name].astype(int)
    except (ValueError, TypeError):
        return "Error: User Assessment for LLM Proposal must be an integer"

    if ((ratings < 1) | (ratings > 5)).any():
        return "Error: User Assessment for LLM Proposal must be between 1 and 5"

    # Build the payload on a copy so the caller's dataframe is not mutated.
    validated_df = updated_df.assign(**{column_rating_name: ratings})
    dataframe_json = validated_df.to_json(orient="records")

    # Prepared before the connection is opened so a malformed user_state can
    # never leave a connection dangling.
    username = (user_state or {}).get("username")
    json_user_str = json.dumps({"username": username}, ensure_ascii=False)

    try:
        with closing(sqlite3.connect(db_path)) as connection_db:
            # insert after everything to keep datetime aligned
            db_persistence_insert(
                connection_db=connection_db,
                insert_type="wcag_user_llm_alttext_assessments",
                page_url=url,
                user=json_user_str,
                llm_model="",
                json_in_str=dataframe_json,  # to improve
                json_out_str="done via UI",
                table="wcag_user_assessments",
            )
    except Exception as e:
        print("Error inserting user assessment into database:", str(e))
        # Report the failure to the UI instead of claiming success.
        return f"Error: user assessment could not be saved ({e})"

    return "User assessment saved successfully!"
|
||||
|
||||
|
||||
def load_images_from_json(json_input):
|
||||
"""Extract URLs and alt text from JSON and create HTML gallery"""
|
||||
try:
|
||||
|
|
@ -40,7 +79,7 @@ def load_images_from_json(json_input):
|
|||
return "No images found in JSON", ""
|
||||
|
||||
images = data["images"]
|
||||
info_text = f"Found {len(images)} image(s)\n"
|
||||
info_text = f"Found {len(images)} image(s)"
|
||||
print(f"Found {len(data['images'])} image(s)")
|
||||
|
||||
# Create HTML gallery with checkboxes and assessment forms
|
||||
|
|
@ -58,14 +97,14 @@ def load_images_from_json(json_input):
|
|||
padding: 10px;
|
||||
background: white;
|
||||
}
|
||||
.image-card:has(input:checked) {
|
||||
.image-card:has(input[type="checkbox"]:checked) {
|
||||
border-color: #2196F3;
|
||||
background: #a7c1c1;
|
||||
}
|
||||
.image-card img {
|
||||
width: 100%;
|
||||
height: 200px;
|
||||
object-fit: cover;
|
||||
object-fit: scale-down;
|
||||
border-radius: 4px;
|
||||
}
|
||||
.image-info {
|
||||
|
|
@ -93,7 +132,7 @@ def load_images_from_json(json_input):
|
|||
display: none;
|
||||
margin-top: 15px;
|
||||
padding: 10px;
|
||||
background: #f0f7ff;
|
||||
background: #7896b9;
|
||||
border-radius: 4px;
|
||||
border: 1px solid #2196F3;
|
||||
}
|
||||
|
|
@ -109,18 +148,22 @@ def load_images_from_json(json_input):
|
|||
margin-bottom: 5px;
|
||||
font-size: 13px;
|
||||
}
|
||||
.range-container {
|
||||
|
||||
.radio-container {
|
||||
display: flex;
|
||||
gap: 15px;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.radio-option {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
gap: 5px;
|
||||
cursor: pointer;
|
||||
}
|
||||
.range-container input[type="range"] {
|
||||
flex: 1;
|
||||
}
|
||||
.range-value {
|
||||
font-weight: bold;
|
||||
min-width: 20px;
|
||||
text-align: center;
|
||||
|
||||
.radio-label {
|
||||
font-weight: 500;
|
||||
}
|
||||
textarea {
|
||||
width: 100%;
|
||||
|
|
@ -166,12 +209,28 @@ def load_images_from_json(json_input):
|
|||
<div id="panel-{idx}" class="assessment-panel">
|
||||
<div class="form-group">
|
||||
<label>Rate current alt-text:</label>
|
||||
<div class="range-container">
|
||||
<input type="range" min="1" max="5" value="3"
|
||||
class="assessment-range" data-index="{idx}"
|
||||
oninput="document.getElementById('range-value-{idx}').textContent = this.value">
|
||||
<span id="range-value-{idx}" class="range-value">3</span>
|
||||
</div>
|
||||
<div class="radio-container">
|
||||
<label class="radio-option">
|
||||
<input type="radio" name="assessment-{idx}" value="1" data-index="{idx}">
|
||||
<span class="radio-label">1</span>
|
||||
</label>
|
||||
<label class="radio-option">
|
||||
<input type="radio" name="assessment-{idx}" value="2" data-index="{idx}">
|
||||
<span class="radio-label">2</span>
|
||||
</label>
|
||||
<label class="radio-option">
|
||||
<input type="radio" name="assessment-{idx}" value="3" data-index="{idx}" checked>
|
||||
<span class="radio-label">3</span>
|
||||
</label>
|
||||
<label class="radio-option">
|
||||
<input type="radio" name="assessment-{idx}" value="4" data-index="{idx}">
|
||||
<span class="radio-label">4</span>
|
||||
</label>
|
||||
<label class="radio-option">
|
||||
<input type="radio" name="assessment-{idx}" value="5" data-index="{idx}">
|
||||
<span class="radio-label">5</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label>New alt-text:</label>
|
||||
|
|
@ -226,7 +285,7 @@ def load_llm_assessment_from_json(json_input):
|
|||
{
|
||||
"Original Alt Text": alt_text_original,
|
||||
"LLM Assessment": original_alt_text_assessment,
|
||||
"Proposed Alt Text": new_alt_text,
|
||||
"LLM Proposed Alt Text": new_alt_text,
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -257,7 +316,7 @@ def make_alttext_llm_assessment_api_call(
|
|||
if not selected_images or len(selected_images) == 0:
|
||||
info_text = "No images selected"
|
||||
print(info_text)
|
||||
return pd.DataFrame()
|
||||
return "LLM assessment not started", pd.DataFrame()
|
||||
|
||||
# prepare data for insertion
|
||||
json_in_str = {}
|
||||
|
|
@ -267,6 +326,7 @@ def make_alttext_llm_assessment_api_call(
|
|||
user_assessments = []
|
||||
user_new_alt_texts = []
|
||||
selected_image_id = []
|
||||
user_assessments_llm_proposal = []
|
||||
for img in selected_images:
|
||||
selected_urls.append(img["image_url"])
|
||||
selected_alt_text_original.append(img["original_alt_text"])
|
||||
|
|
@ -275,6 +335,7 @@ def make_alttext_llm_assessment_api_call(
|
|||
selected_image_id.append(
|
||||
int(img["image_index"]) + 1
|
||||
) # add the id selected (+1 for index alignment)
|
||||
user_assessments_llm_proposal.append(3) # default value for now
|
||||
json_in_str["images_urls"] = selected_urls
|
||||
json_in_str["images_alt_text_original"] = selected_alt_text_original
|
||||
json_out_str["user_assessments"] = user_assessments
|
||||
|
|
@ -302,9 +363,17 @@ def make_alttext_llm_assessment_api_call(
|
|||
)
|
||||
# return response
|
||||
info_dataframe = load_llm_assessment_from_json(response)
|
||||
|
||||
# add the UI ids and other fields to to api response
|
||||
info_dataframe.insert(
|
||||
0, "Image #", selected_image_id
|
||||
) # add the UI ids from to api response
|
||||
info_dataframe.insert(2, "User Assessment", user_assessments)
|
||||
|
||||
info_dataframe.insert(3, "User Proposed Alt Text", user_new_alt_texts)
|
||||
info_dataframe["User Assessment for LLM Proposal"] = (
|
||||
user_assessments_llm_proposal
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
return {"error": str(e)}
|
||||
|
|
@ -326,7 +395,7 @@ def make_alttext_llm_assessment_api_call(
|
|||
finally:
|
||||
if connection_db:
|
||||
connection_db.close()
|
||||
return info_dataframe
|
||||
return "LLM assessment completed", info_dataframe
|
||||
|
||||
|
||||
def make_image_extraction_api_call(
|
||||
|
|
@ -449,9 +518,10 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
|||
images_number = gr.Slider(
|
||||
5,
|
||||
100,
|
||||
value=30,
|
||||
value=50,
|
||||
step=5,
|
||||
label="Max number of images to retrieve",
|
||||
visible=False,
|
||||
)
|
||||
with gr.Column():
|
||||
|
||||
|
|
@ -459,39 +529,54 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
|||
"Extract Images & Alt Texts", variant="primary"
|
||||
)
|
||||
alttext_api_call_btn = gr.Button(
|
||||
"Alt Text LLM Assessment",
|
||||
"Start LLM Assessment",
|
||||
variant="secondary",
|
||||
interactive=False,
|
||||
)
|
||||
image_info_output = gr.Textbox(
|
||||
label="Activity tracking", lines=1
|
||||
)
|
||||
|
||||
with gr.Row():
|
||||
|
||||
image_info_output = gr.Textbox(label="Managed Images", lines=5)
|
||||
with gr.Row(visible=False) as alttext_results_row:
|
||||
|
||||
# Use DataFrame for tabular output
|
||||
alttext_info_output = gr.DataFrame(
|
||||
headers=[
|
||||
"Image #",
|
||||
"Original Alt Text",
|
||||
"User Assessment",
|
||||
"User Proposed Alt Text",
|
||||
"LLM Assessment",
|
||||
"Proposed Alt Text",
|
||||
"LLM Proposed Alt Text",
|
||||
"User Assessment for LLM Proposal",
|
||||
],
|
||||
label="LLM Assessment Results",
|
||||
wrap=True, # Wrap text in cells
|
||||
interactive=False,
|
||||
interactive=True,
|
||||
scale=7,
|
||||
)
|
||||
with gr.Column():
|
||||
save_user_assessment_btn = gr.Button(
|
||||
"Save Your Assessment",
|
||||
variant="secondary",
|
||||
interactive=True,
|
||||
scale=1,
|
||||
)
|
||||
gr.Markdown(
|
||||
"ℹ Info: to assess the LLM output, only the values for the 'User Assessment for LLM Proposal' column need to be changed."
|
||||
)
|
||||
|
||||
with gr.Row():
|
||||
|
||||
gallery_html = gr.HTML(label="Image Gallery")
|
||||
|
||||
image_extraction_api_call_btn.click(
|
||||
fn=lambda: ("", "", pd.DataFrame(), gr.Button(interactive=False)),
|
||||
fn=lambda: ("", "", gr.update(visible=False), gr.Button(interactive=False)),
|
||||
inputs=[],
|
||||
outputs=[
|
||||
image_info_output,
|
||||
gallery_html,
|
||||
alttext_info_output,
|
||||
alttext_results_row,
|
||||
alttext_api_call_btn,
|
||||
],
|
||||
).then(
|
||||
|
|
@ -515,7 +600,7 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
|||
wcag_rest_server_url_state,
|
||||
user_state,
|
||||
],
|
||||
outputs=[alttext_info_output],
|
||||
outputs=[image_info_output, alttext_info_output],
|
||||
js="""
|
||||
(url_input,gallery_html) => {
|
||||
const checkboxes = document.querySelectorAll('.image-checkbox:checked');
|
||||
|
|
@ -533,7 +618,8 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
|||
const index = checkbox.dataset.index;
|
||||
const imageUrl = checkbox.dataset.imgurl;
|
||||
const originalAlt = document.querySelector('.original-alt[data-index="' + index + '"]').value;
|
||||
const assessment = document.querySelector('.assessment-range[data-index="' + index + '"]').value;
|
||||
const assessment = document.querySelector('input[name="assessment-' + index + '"]:checked').value;
|
||||
console.log("assessment:",assessment)
|
||||
const newAltText = document.querySelector('.new-alt-text[data-index="' + index + '"]').value;
|
||||
|
||||
selectedData.push({
|
||||
|
|
@ -548,6 +634,16 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
|||
return [url_input,JSON.stringify(selectedData)];
|
||||
}
|
||||
""",
|
||||
).then(
|
||||
fn=lambda: gr.update(visible=True),
|
||||
inputs=[],
|
||||
outputs=[alttext_results_row],
|
||||
)
|
||||
|
||||
save_user_assessment_btn.click(
|
||||
fn=process_dataframe,
|
||||
inputs=[db_path_state, url_input, alttext_info_output, user_state],
|
||||
outputs=[image_info_output],
|
||||
)
|
||||
|
||||
# placed here at the end to give full contents visibility to events
|
||||
|
|
|
|||
|
|
@ -55,7 +55,6 @@ class ImageExtractor:
|
|||
# Also check query parameters (e.g., format=jpeg)
|
||||
return any(fmt in img_url.lower() for fmt in self.SUPPORTED_FORMATS)
|
||||
|
||||
|
||||
async def _download_image(self, image_url, output_dir="images") -> None:
|
||||
|
||||
# Parse the URL to get the path without query parameters
|
||||
|
|
@ -79,7 +78,7 @@ class ImageExtractor:
|
|||
|
||||
# Sanitize image name (remove special characters, limit length)
|
||||
image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
|
||||
image_name = image_name[:200] # Limit filename length
|
||||
image_name = image_name[:50] # Limit filename length
|
||||
|
||||
# If name is empty after sanitization, create a hash-based name
|
||||
if not image_name:
|
||||
|
|
@ -88,13 +87,15 @@ class ImageExtractor:
|
|||
image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
|
||||
|
||||
# Download the image
|
||||
print("getting image:", image_url)
|
||||
print("getting image url:", image_url)
|
||||
print("getting image name:", image_name)
|
||||
response = requests.get(image_url, timeout=10)
|
||||
response.raise_for_status()
|
||||
|
||||
try:
|
||||
# Save the image
|
||||
output_path = os.path.join(output_dir, f"{image_name}.{ext}")
|
||||
print("saving image to:", output_path)
|
||||
with open(output_path, "wb") as f:
|
||||
f.write(response.content)
|
||||
print(f"Saved: {output_path}")
|
||||
|
|
@ -292,43 +293,36 @@ class ImageExtractor:
|
|||
error_msg = f"Error extracting context: {str(e)}"
|
||||
return error_msg, error_msg, error_msg
|
||||
|
||||
async def _get_page_metadata(self, page) -> Dict[str, Optional[str]]:
|
||||
"""Extract page metadata including title, description, and keywords."""
|
||||
metadata = {
|
||||
"title": await page.title(),
|
||||
"description": None,
|
||||
"keywords": None,
|
||||
"headings": [],
|
||||
}
|
||||
async def _get_page_metadata(self, page):
|
||||
"""Extract page metadata in one fast evaluate call. Batch DOM extraction inside one evaluate()."""
|
||||
return await page.evaluate(
|
||||
"""
|
||||
() => {
|
||||
const metadata = {
|
||||
title: document.title || null,
|
||||
description: null,
|
||||
keywords: null,
|
||||
headings: []
|
||||
};
|
||||
|
||||
# Extract meta description
|
||||
try:
|
||||
description = await page.locator('meta[name="description"]').get_attribute(
|
||||
"content"
|
||||
)
|
||||
metadata["description"] = description
|
||||
except:
|
||||
pass
|
||||
const desc = document.querySelector('meta[name="description"]');
|
||||
const keys = document.querySelector('meta[name="keywords"]');
|
||||
metadata.description = desc?.content || null;
|
||||
metadata.keywords = keys?.content || null;
|
||||
|
||||
# Extract meta keywords
|
||||
try:
|
||||
keywords = await page.locator('meta[name="keywords"]').get_attribute(
|
||||
"content"
|
||||
)
|
||||
metadata["keywords"] = keywords
|
||||
except:
|
||||
pass
|
||||
// Collect all headings h1–h6
|
||||
const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
|
||||
metadata.headings = Array.from(allHeadings)
|
||||
.map(h => ({
|
||||
level: parseInt(h.tagName.substring(1), 10),
|
||||
text: h.textContent.trim()
|
||||
}))
|
||||
.filter(h => h.text.length > 0);
|
||||
|
||||
# Extract all headings (h1-h6)
|
||||
for level in range(1, 7):
|
||||
headings = await page.locator(f"h{level}").all_text_contents()
|
||||
for heading in headings:
|
||||
if heading.strip():
|
||||
metadata["headings"].append(
|
||||
{"level": level, "text": heading.strip()}
|
||||
)
|
||||
|
||||
return metadata
|
||||
return metadata;
|
||||
}
|
||||
"""
|
||||
)
|
||||
|
||||
async def extract_images(
|
||||
self, extract_context=True, specific_images_urls=[]
|
||||
|
|
@ -344,15 +338,18 @@ class ImageExtractor:
|
|||
page = await browser.new_page()
|
||||
|
||||
try:
|
||||
#await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads
|
||||
# The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
|
||||
# await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads
|
||||
# The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
|
||||
# ---alternative method2: use if there is total awareness of the page's loading pattern and want faster, more reliable execution
|
||||
await page.goto(self.url, timeout=50000, wait_until="load")# deafult timeout=30000, 30sec
|
||||
await page.goto(
|
||||
self.url, timeout=50000, wait_until="load"
|
||||
) # deafult timeout=30000, 30sec
|
||||
# Wait for page to load completely
|
||||
await page.wait_for_timeout(2000) # Wait for dynamic content
|
||||
# -----
|
||||
|
||||
if extract_context:
|
||||
print("Getting page metadata...")
|
||||
# Get page metadata once
|
||||
page_metadata = await self._get_page_metadata(page)
|
||||
page_title = page_metadata["title"]
|
||||
|
|
@ -367,15 +364,41 @@ class ImageExtractor:
|
|||
|
||||
if len(specific_images_urls) == 0:
|
||||
# Find all img elements
|
||||
print("Extracting all images from the page",self.url)
|
||||
img_elements = await page.locator("img").all()
|
||||
print("Extracting all images from the page", self.url)
|
||||
# img_elements = await page.locator("img").all()
|
||||
else:
|
||||
print(
|
||||
"Extracting specific images from the page:",
|
||||
self.url,
|
||||
specific_images_urls,
|
||||
)
|
||||
img_elements = []
|
||||
# img_elements = await page.locator("img").all()
|
||||
|
||||
""" # method 3: optimized approach
|
||||
# Get all src attributes in one go
|
||||
all_img_elements = await page.locator("img").all()
|
||||
all_srcs = await page.locator("img").evaluate_all(
|
||||
"elements => elements.map(el => el.src || '')"
|
||||
)
|
||||
|
||||
# Filter with the pre-fetched src values
|
||||
img_elements = [
|
||||
elem for elem, src in zip(all_img_elements, all_srcs)
|
||||
if src in specific_images_urls
|
||||
]
|
||||
"""
|
||||
|
||||
""" #method 2: single pass to find matching images
|
||||
for img_element in all_img_elements: #This is more efficient than making separate locator queries for each specific URL and avoids timeout issues.
|
||||
try:
|
||||
src = await img_element.get_attribute("src")
|
||||
print("found image src:", src)
|
||||
if src in specific_images_urls:
|
||||
img_elements.append(img_element)
|
||||
except Exception as e:
|
||||
print(f"Error getting src attribute from image: {str(e)}")"""
|
||||
|
||||
""" # method 1: separate locator queries for each specific URL
|
||||
for url in specific_images_urls:
|
||||
try:
|
||||
img_element = await page.locator(
|
||||
|
|
@ -384,8 +407,11 @@ class ImageExtractor:
|
|||
if img_element:
|
||||
img_elements.append(img_element)
|
||||
except Exception as e:
|
||||
print(f"Error locating image with src {url}: {str(e)}")
|
||||
print(f"Error locating image with src {url}: {str(e)}")"""
|
||||
|
||||
img_elements = await page.locator(
|
||||
"img"
|
||||
).all() # unified approach to start with all images and filter later
|
||||
image_source_list = [] # avoid multiple check for the same image url
|
||||
images_data = []
|
||||
|
||||
|
|
@ -404,6 +430,12 @@ class ImageExtractor:
|
|||
if not src:
|
||||
print("image has no src attribute. Skipped.")
|
||||
continue
|
||||
if (
|
||||
src not in specific_images_urls
|
||||
and len(specific_images_urls) > 0
|
||||
):
|
||||
# print("image src",src,"not in the specific images list. Skipped.")
|
||||
continue
|
||||
|
||||
if src not in image_source_list:
|
||||
image_source_list.append(src)
|
||||
|
|
@ -434,6 +466,7 @@ class ImageExtractor:
|
|||
alt_text = await img.get_attribute("alt") or ""
|
||||
|
||||
if extract_context:
|
||||
print("Extracting context for image:", img_url)
|
||||
# Get surrounding HTML context (full, immediate, and nearby)
|
||||
html_context, immediate_context, nearby_text = (
|
||||
await self._get_element_context(page, img)
|
||||
|
|
|
|||
|
|
@ -76,39 +76,7 @@ class MLLMManager:
|
|||
return payload
|
||||
|
||||
def get_alt_text_system_prompt(self):
|
||||
system_prompt_old = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
|
||||
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
|
||||
the same information as the image, and should be able to substitute for the non-text content. The text alternative would
|
||||
be brief but as informative as possible.
|
||||
|
||||
Follow these instructions carefully:
|
||||
1. You will be provided as input with the following:
|
||||
- The image found on the webpage.
|
||||
- The associated alternative text. When the alt-text is empty or absent, you will be explicitly informed.
|
||||
- The surrounding context of the image.
|
||||
- The page title, headings and the content of the “keywords” and “description” <meta> tag, if found.
|
||||
|
||||
2. Determine the function and purpose of the image by analyzing these elements. Take into account the purpose and function
|
||||
of the associated image by considering the page context. Check also if the image is, or is associated with, a link or a button,
|
||||
and consider this in your judgement. If the image contains text use that as part of the context.
|
||||
|
||||
3. Provide a final assessment based on the following:
|
||||
- 'success' if you can assess with 'sufficient certainty' the alt-text is appropriate in relation to the image purpose,
|
||||
- 'failure' if you can assess with 'sufficient certainty' that the alt-text is NOT appropriate,
|
||||
- 'warning' if you cannot determine with 'sufficient certainty'.
|
||||
where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80
|
||||
|
||||
4. The original alt-text assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only.
|
||||
|
||||
5. Provide a brief reasoning for your judgment. If the image contains text, write it verbatim. Your response should be in English.
|
||||
|
||||
6. Keep your response within 150 words.
|
||||
|
||||
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
|
||||
|
||||
8. Here is the JSON format the results must have:
|
||||
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
|
||||
|
||||
|
||||
# https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
|
||||
system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
|
||||
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
|
||||
|
|
@ -122,7 +90,7 @@ class MLLMManager:
|
|||
What purpose does it fulfill?
|
||||
If I could not use the image content, what words would I use to convey the same function and/or information?
|
||||
|
||||
When image content contains words that are important to understanding the content, the alt text should include those words
|
||||
When image content contains words that are important to understanding the content, the alt text should include those words.
|
||||
|
||||
Follow these instructions carefully:
|
||||
1. You will be provided as input with the following:
|
||||
|
|
@ -147,7 +115,7 @@ class MLLMManager:
|
|||
|
||||
6. Keep your response within 150 words.
|
||||
|
||||
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
|
||||
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. Use the same language as the original alt-text.
|
||||
|
||||
8. Here is the JSON format the results must have:
|
||||
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
|
||||
|
|
@ -181,7 +149,7 @@ class MLLMManager:
|
|||
print("Using end_point:", self.end_point)
|
||||
|
||||
alt_text_system_prompt = self.get_alt_text_system_prompt()
|
||||
print("alt_text_system_prompt:", alt_text_system_prompt)
|
||||
#print("alt_text_system_prompt:", alt_text_system_prompt)
|
||||
|
||||
mllm_responses = []
|
||||
for img_info in images:
|
||||
|
|
|
|||
|
|
@ -46,6 +46,7 @@ class ExtractImagesRoutes:
|
|||
self, request: Request, data: ExtractImages
|
||||
) -> JSONResponse:
|
||||
"""Return the alt text validation assessment based on WCAG guidelines"""
|
||||
print("Received extract images request.")
|
||||
try:
|
||||
json_content = json.loads(data.model_dump_json())
|
||||
|
||||
|
|
|
|||
|
|
@ -53,6 +53,7 @@ class WCAGAltTextValuationRoutes:
|
|||
) -> JSONResponse:
|
||||
"""Return the alt text validation assessment based on WCAG guidelines"""
|
||||
try:
|
||||
print("Received wcag alttext validation request.")
|
||||
json_content = json.loads(data.model_dump_json())
|
||||
mllm_model_id = self.mllm_settings["mllm_model_id"]
|
||||
|
||||
|
|
@ -67,7 +68,12 @@ class WCAGAltTextValuationRoutes:
|
|||
.replace(":", "")
|
||||
.replace("//", "_")
|
||||
.replace("/", "_")
|
||||
.replace("%2", "_")
|
||||
.replace("?", "_")
|
||||
.replace("=", "_")
|
||||
.replace("&", "_")
|
||||
)
|
||||
url_path=url_path[:50] # limit length
|
||||
now = datetime.now(timezone.utc)
|
||||
now_str = now.strftime("%Y_%m_%d-%H_%M_%S")
|
||||
folder_str = mllm_model_id.replace(":", "-") + "_" + now_str
|
||||
|
|
@ -93,7 +99,7 @@ class WCAGAltTextValuationRoutes:
|
|||
# Extract images
|
||||
logging.info(f"Extracting images from: {json_content['page_url']}")
|
||||
images = await image_extractor.extract_images(
|
||||
specific_images_urls=json_content["specific_images_urls"]
|
||||
specific_images_urls=json_content["specific_images_urls"],extract_context=True
|
||||
)
|
||||
# MLLM settings
|
||||
mllm_end_point = self.mllm_settings["mllm_end_point"]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,541 @@
|
|||
# to launch: python build_dataset_from_folder.py --ref_path "" --push_to_hub --repo_id "nicolaleo/LLM-alt-text-assessment" --token ""
|
||||
|
||||
from datasets import Dataset, DatasetDict
|
||||
import datasets
|
||||
import json
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
import hashlib
|
||||
import urllib.parse
|
||||
import argparse
|
||||
|
||||
|
||||
'''
|
||||
# Dataset metadata
|
||||
_DESCRIPTION = """\
|
||||
Dataset for image alt-text assessment and improvement using MLLM responses.
|
||||
Contains images, original alt-texts, quality assessments, and improved versions.
|
||||
"""
|
||||
|
||||
_CITATION = """\
|
||||
@misc{alt_text_assessment,
|
||||
title={Alt-Text Assessment Dataset},
|
||||
year={2024}
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class AltTextDataset(datasets.GeneratorBasedBuilder):
|
||||
"""Dataset for alt-text assessment with images and MLLM responses."""
|
||||
|
||||
VERSION = datasets.Version("1.0.0")
|
||||
|
||||
def _info(self):
|
||||
return datasets.DatasetInfo(
|
||||
description=_DESCRIPTION,
|
||||
features=datasets.Features({
|
||||
"image": datasets.Image(),
|
||||
"image_url": datasets.Value("string"),
|
||||
"alt_text": datasets.Value("string"),
|
||||
"original_alt_text_assessment": datasets.Value("string"),
|
||||
"assessment": datasets.Value("string"),
|
||||
"evaluation_result": datasets.Value("string"),
|
||||
"new_alt_text": datasets.Value("string"),
|
||||
#"source_folder": datasets.Value("string"),
|
||||
}),
|
||||
citation=_CITATION,
|
||||
)
|
||||
|
||||
def _split_generators(self, dl_manager):
|
||||
"""Define data splits."""
|
||||
return [
|
||||
datasets.SplitGenerator(
|
||||
name=datasets.Split.TRAIN,
|
||||
gen_kwargs={
|
||||
"json_filepath": "data.json",
|
||||
"images_dir": "images"
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
def _generate_examples(self, json_filepath, images_dir):
|
||||
"""Generate examples from JSON file and image directory."""
|
||||
with open(json_filepath, encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
|
||||
images_path = Path(images_dir)
|
||||
|
||||
for idx, entry in enumerate(data):
|
||||
image_url = entry["image_url"]
|
||||
image_filename = url_to_filename(image_url)
|
||||
image_path = images_path / image_filename
|
||||
|
||||
# Load image if exists, otherwise None
|
||||
image = str(image_path) if image_path.exists() else None
|
||||
|
||||
yield idx, {
|
||||
"image": image,
|
||||
"image_url": image_url,
|
||||
"alt_text": entry["alt_text"],
|
||||
"original_alt_text_assessment": entry["mllm_response"]["original_alt_text_assessment"],
|
||||
"assessment": entry["mllm_response"]["assessment"],
|
||||
"evaluation_result": entry["mllm_response"]["evaluation_result"],
|
||||
"new_alt_text": entry["mllm_response"]["new_alt_text"],
|
||||
}
|
||||
|
||||
'''
|
||||
# ============================================================================
|
||||
# SIMPLE USAGE FUNCTIONS
|
||||
# ============================================================================
|
||||
|
||||
|
||||
def url_to_filename(image_url):  # keep in sync with the image extractor's file naming
    """
    Derive the sanitized local file name for an image URL.

    Only the last path segment of the URL is used (query string and fragment
    are ignored). The name keeps alphanumerics, "-" and "_", is cut to 50
    characters, and falls back to a 16-character MD5 prefix of the full URL
    when nothing is left. Unknown or missing extensions become "jpg".

    Args:
        image_url: The image URL

    Returns:
        Sanitized filename with extension
    """
    # Last segment of the URL path, without query parameters
    filename = urllib.parse.urlparse(image_url).path.split("/")[-1]
    print(f"Original filename: '{filename}'")

    # Separate the stem from the extension at the last dot, if there is one
    stem, dot, suffix = filename.rpartition(".")
    if dot:
        image_name, ext = stem, suffix.lower()
    else:
        image_name, ext = filename, "jpg"

    # Anything outside the known image formats is stored as jpg
    if ext not in ["jpg", "jpeg", "png", "gif", "webp"]:
        ext = "jpg"

    # Drop special characters, then cap the length of the name
    kept_chars = [c for c in image_name if c.isalnum() or c in ("-", "_")]
    image_name = "".join(kept_chars)[:50]

    # Nothing usable left: derive a stable name from the URL itself
    if not image_name:
        image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]

    return f"{image_name}.{ext}"
|
||||
|
||||
|
||||
def push_to_hub_example(dataset_path="alt_text_merged_dataset", repo_id="", token=None):
    """
    Push the dataset stored at ``dataset_path`` to the Hugging Face Hub.

    The dataset is loaded with ``load_dataset_from_disk``, wrapped in a
    ``DatasetDict`` as its "train" split and uploaded as a public dataset.

    Args:
        dataset_path: Location of the dataset saved on disk.
        repo_id: Target Hub repository, e.g. "username/dataset-name".
        token: Hugging Face access token passed to ``login``.
            NOTE(review): with ``token=None`` the login call is expected to
            fall back to an interactive prompt / stored credentials - confirm.

    Raises:
        ValueError: If ``repo_id`` is empty.
    """
    # Fail fast: without a target repository the upload can only fail at the
    # very end, after logging in and loading the whole dataset.
    if not repo_id:
        raise ValueError("repo_id must be a non-empty Hub repository id")

    from huggingface_hub import login

    print("\n=== Pushing Dataset to Hugging Face Hub ===")

    # Authenticate with the supplied token. Alternatives: call login() with no
    # arguments for an interactive prompt, or export HF_TOKEN beforehand.
    login(token=token)

    # Load the dataset from disk
    ds = load_dataset_from_disk(dataset_path)

    # Combine into DatasetDict with a single "train" split
    ds = DatasetDict(
        {
            "train": ds,
        }
    )

    # Push to hub (creates repo if it doesn't exist); the data is converted to
    # Parquet automatically on upload.
    ds.push_to_hub(
        repo_id,
        private=False,  # Set True for private dataset
    )

    print("Dataset pushed successfully!")
    print(f"View at: https://huggingface.co/datasets/{repo_id}")
|
||||
|
||||
|
||||
def create_dataset_from_json(json_filepath, json_filepath_images, images_dir="images"):
    """
    Build a Hugging Face Dataset from an assessments JSON file and local images.

    Entries whose MLLM response has no "original_alt_text_assessment" are
    skipped. For every kept entry the image file name is derived from the
    image URL with url_to_filename and opened from images_dir when present
    (otherwise None is stored). "page_url" and "html_context" are read from
    the second JSON file at the same list index as the entry, so both files
    are expected to be index-aligned.

    Args:
        json_filepath: Path to the JSON file with the MLLM assessments
        json_filepath_images: Path to the JSON file with the extracted image
            records (provides "page_url" and "html_context")
        images_dir: Directory containing the images (default: "images")

    Returns:
        datasets.Dataset object with images loaded
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    with open(json_filepath_images, "r", encoding="utf-8") as f:
        data_images = json.load(f)

    images_root = Path(images_dir)

    # One list per output column; the order here is the column order
    column_names = (
        "image",
        "image_url",
        "alt_text",
        "original_alt_text_assessment",
        "assessment",
        "evaluation_result",
        "new_alt_text",
        "page_url",
        "html_context",
    )
    flattened_data = {column: [] for column in column_names}

    # The index is kept for skipped entries too, so that it keeps pointing at
    # the matching record of data_images.
    for position, entry in enumerate(data):
        response = entry["mllm_response"]

        # important! entries without an MLLM response are not usable data
        if response["original_alt_text_assessment"] is None:
            print(
                f"Skipping entry with image URL: {entry['image_url']} due to missing MLLM response"
            )
            continue

        image_url = entry["image_url"]
        image_path = images_root / url_to_filename(image_url)

        # Load image if it exists, otherwise keep a None placeholder
        if not image_path.exists():
            print(f"Warning: Image not found: {image_path}")
            flattened_data["image"].append(None)
        else:
            flattened_data["image"].append(Image.open(image_path))

        flattened_data["image_url"].append(image_url)
        flattened_data["alt_text"].append(entry["alt_text"])
        flattened_data["original_alt_text_assessment"].append(
            str(response["original_alt_text_assessment"])
        )
        flattened_data["assessment"].append(response["assessment"])
        flattened_data["evaluation_result"].append(response["evaluation_result"])
        flattened_data["new_alt_text"].append(response["new_alt_text"])
        flattened_data["page_url"].append(data_images[position]["page_url"])
        flattened_data["html_context"].append(data_images[position]["html_context"])

    print(f"Total valid entries loaded: {len(flattened_data['image_url'])}")
    return datasets.Dataset.from_dict(flattened_data)
|
||||
|
||||
|
||||
def create_dataset_from_folders(
    ref_path,
    json_filename="mllm_alttext_assessments.json",
    json_filename_images="extracted_images.json",
    images_dirname="images",
):
    """
    Create a merged dataset from multiple folders under ref_path.

    Each folder should contain the two JSON files and an images subdirectory.
    Folders missing either JSON file are skipped; a missing images directory
    only produces a warning. Folders are visited in sorted order so that the
    row order of the merged dataset is reproducible.

    Args:
        ref_path: Root path containing multiple folders
        json_filename: Name of the assessments JSON file in each folder
            (default: "mllm_alttext_assessments.json")
        json_filename_images: Name of the extracted-images JSON file in each
            folder (default: "extracted_images.json")
        images_dirname: Name of images subdirectory (default: "images")

    Returns:
        datasets.Dataset object with all entries merged

    Raises:
        ValueError: If no folder under ref_path could be loaded.
    """
    ref_path = Path(ref_path)
    all_datasets = []

    # iterdir() yields entries in arbitrary order; sort for a stable merge order
    for folder in sorted(ref_path.iterdir()):
        if not folder.is_dir():
            continue

        json_path = folder / json_filename
        json_path_images = folder / json_filename_images
        images_path = folder / images_dirname

        # Both JSON files are mandatory
        if not json_path.exists():
            print(f"Skipping {folder.name}: no {json_filename} found")
            continue

        if not json_path_images.exists():
            print(f"Skipping {folder.name}: no {json_filename_images} found")
            continue

        if not images_path.exists():
            # Continue anyway, images might be optional (from urls only)
            print(f"Warning: {folder.name}: images directory not found")

        print(f"Processing folder: {folder.name}")

        # Best effort: a folder that fails to load is reported and skipped so
        # that one bad folder does not abort the whole merge.
        try:
            ds = create_dataset_from_json(
                str(json_path), str(json_path_images), str(images_path)
            )
        except Exception as e:
            print(f"Error processing {folder.name}: {e}")
            continue

        all_datasets.append(ds)
        print(f"  -> Loaded {len(ds)} entries")

    if not all_datasets:
        raise ValueError(f"No valid folders found in {ref_path}")

    # Merge all datasets
    print(f"\n=== Merging {len(all_datasets)} folders ===")
    merged_dataset = datasets.concatenate_datasets(all_datasets)
    print(f"Total entries: {len(merged_dataset)}")

    return merged_dataset
|
||||
|
||||
|
||||
def verify_images(json_filepath, images_dir="images"):
    """
    Check which images referenced in a JSON file are present on disk.

    For every entry the expected file name is derived from its "image_url"
    with url_to_filename and looked up inside images_dir. Each lookup is
    printed for inspection.

    Args:
        json_filepath: Path to JSON file
        images_dir: Directory containing images

    Returns:
        Dict with the counters 'found', 'missing' and 'total', plus
        'details' holding the 'found_images' and 'missing_images' lists
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        entries = json.load(f)

    images_root = Path(images_dir)
    present = []
    absent = []

    for entry in entries:
        image_url = entry["image_url"]
        image_filename = url_to_filename(image_url)
        image_path = images_root / image_filename
        print(
            "image_url:",
            image_url,
            "image_filename:",
            image_filename,
            "image_path:",
            image_path,
        )

        record = {"url": image_url, "filename": image_filename}
        if not image_path.exists():
            record["expected_path"] = str(image_path)
            absent.append(record)
            continue

        record["path"] = str(image_path)
        present.append(record)

    details = {"found_images": present, "missing_images": absent}
    return {
        "found": len(present),
        "missing": len(absent),
        "total": len(entries),
        "details": details,
    }
|
||||
|
||||
|
||||
def verify_images_in_folders(
    ref_path, json_filename="mllm_alttext_assessments.json", images_dirname="images"
):
    """
    Verify images across all folders under ref_path.

    Every sub-folder containing json_filename is checked with verify_images.
    Folders are visited in sorted order so that the printed report and the
    order of the 'folders' mapping are stable between runs. A folder whose
    verification raises is reported on stdout and left out of the result.

    Args:
        ref_path: Root path containing multiple folders
        json_filename: Name of JSON file in each folder
        images_dirname: Name of images subdirectory

    Returns:
        Dict with aggregated verification results: the counters 'found',
        'missing' and 'total', plus 'folders' mapping each folder name to
        its own verify_images result
    """
    ref_path = Path(ref_path)
    total_found = 0
    total_missing = 0
    total_entries = 0
    folder_results = {}

    # iterdir() yields entries in arbitrary order; sort for a stable report
    for folder in sorted(ref_path.iterdir()):
        if not folder.is_dir():
            continue

        json_path = folder / json_filename
        images_path = folder / images_dirname

        if not json_path.exists():
            continue

        print(f"Verifying folder: {folder.name}")

        # Best effort: one unreadable folder must not abort the whole check
        try:
            verification = verify_images(str(json_path), str(images_path))
        except Exception as e:
            print(f"  Error: {e}")
            continue

        folder_results[folder.name] = verification
        total_found += verification["found"]
        total_missing += verification["missing"]
        total_entries += verification["total"]

        print(f"  Found: {verification['found']}/{verification['total']}")

    return {
        "found": total_found,
        "missing": total_missing,
        "total": total_entries,
        "folders": folder_results,
    }
|
||||
|
||||
|
||||
def save_dataset(dataset, output_path):
    """
    Write the dataset to output_path through its save_to_disk method
    (Arrow format, images included).

    Args:
        dataset: Dataset object to persist
        output_path: Destination directory
    """
    dataset.save_to_disk(output_path)
    # print(f"Dataset saved to {output_path}")

    # Other export formats, kept for reference:
    #   JSON:    dataset.to_json(f"{output_path}/data.json")
    #   CSV:     dataset.to_csv(f"{output_path}/data.csv")
    #   Parquet: dataset.to_parquet(f"{output_path}/data.parquet")
|
||||
|
||||
|
||||
def load_dataset_from_disk(dataset_path):
    """
    Read back a dataset that was previously saved on disk.

    Args:
        dataset_path: Directory the dataset was saved to

    Returns:
        The object returned by datasets.load_from_disk
    """
    loaded = datasets.load_from_disk(dataset_path)
    return loaded
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# EXAMPLE USAGE
|
||||
# ============================================================================
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: verify the images of every folder under
    # --ref_path, build and inspect the merged dataset, save it to disk and
    # optionally push it to the Hugging Face Hub.

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--ref_path",
        type=str,
        help=("Root path containing multiple folders"),
        default="",
    )

    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        default=False,
        help=("If True push the merged dataset to Hugging Face Hub"),
    )
    parser.add_argument(
        "--token",
        type=str,
        help=("Hugging Face authentication token"),
        default="",
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        help=("Hugging Face repository ID"),
        default="nicolaleo/LLM-alt-text-assessment",
    )
    args = parser.parse_args()

    # Example 1: Verify images across all folders
    print("=== Verifying Images in All Folders ===")
    verification = verify_images_in_folders(args.ref_path)
    print("\n######## Verifier output ################################")
    print(f"Total Found: {verification['found']}/{verification['total']}")
    print(f"Total Missing: {verification['missing']}/{verification['total']}")
    print("########################################")

    # Show per-folder breakdown
    print("\n=== Per-Folder Breakdown ===")
    for folder_name, results in verification["folders"].items():
        print(f"{folder_name}: {results['found']}/{results['total']} images found")

    # Example 2: Create merged dataset from all folders
    print("\n=== Creating Merged Dataset ===")
    ds = create_dataset_from_folders(args.ref_path)
    print("\n######## Merged Dataset output ################################")
    print(f"Final dataset size: {len(ds)} entries")
    print("########################################")

    # Example 3: Analyze the merged dataset
    print("\n=== Dataset Analysis ===")
    print(ds)

    # Example 4: Access images and data
    print("\n=== First Example ===")
    # The merged dataset can be empty when every entry was skipped for a
    # missing MLLM response; indexing it would raise IndexError.
    if len(ds) > 0:
        first_example = ds[0]
        print(f"Image URL: {first_example['image_url']}")
        print(f"Alt text: {first_example['alt_text']}")
        print(f"Assessment: {first_example['assessment']}")
        print(f"New alt text: {first_example['new_alt_text']}")
        print(f"Image loaded: {first_example['image'] is not None}")

        if first_example["image"] is not None:
            img = first_example["image"]
            print(f"Image size: {img.size}")
            # img.show()  # Uncomment to display image
    else:
        print("Dataset is empty: no example to show")

    # Example 5: Filter and work with merged data
    print("\n=== Filtering Merged Dataset ===")
    successful = ds.filter(lambda x: x["assessment"] == "success")
    print(f"Successful assessments: {len(successful)}")

    def _is_high_rated(example):
        # Ratings are stored as strings; a value that is not an integer is
        # counted as not high-rated instead of aborting the whole run.
        try:
            return int(example["original_alt_text_assessment"]) >= 4
        except (TypeError, ValueError):
            return False

    high_rated = ds.filter(_is_high_rated)
    print(f"High-rated (>=4): {len(high_rated)}")

    # Example 6: Save merged dataset
    print("\n=== Saving Merged Dataset ===")
    save_dataset(ds, "alt_text_merged_dataset")

    # Example 7: Load dataset
    print("\n=== Loading Dataset ===")
    loaded_ds = load_dataset_from_disk("alt_text_merged_dataset")
    print(f"Loaded {len(loaded_ds)} entries")

    if args.push_to_hub:
        # Push to Hugging Face Hub (optional); see push_to_hub_example above.
        # An empty --token is passed as None so it is not treated as a real
        # (invalid) token.
        push_to_hub_example(repo_id=args.repo_id, token=args.token or None)
|
||||
|
|
@ -5,4 +5,5 @@ transformers==4.57.1
|
|||
numpy==2.2.6
|
||||
matplotlib==3.10.7
|
||||
scikit-learn==1.7.2
|
||||
sentence-transformers==5.1.2
|
||||
sentence-transformers==5.1.2
|
||||
datasets==4.4.1
|
||||
Loading…
Reference in New Issue