upgrade e costruzione datasetHF
This commit is contained in:
parent
cde7259ed7
commit
85c03b3a1a
3
UI/.env
3
UI/.env
|
|
@ -1,4 +1,5 @@
|
||||||
DB_PATH=persistence/wcag_validator_ui.db
|
DB_PATH=persistence/wcag_validator_ui.db
|
||||||
WCAG_REST_SERVER_URL=http://localhost:8000
|
WCAG_REST_SERVER_URL=http://localhost:8000
|
||||||
URL_LIST_old=["http://www.amazon.it","https://web.archive.org/web/20230630235957/http://www.amazon.com/", "https://web.archive.org/web/20251130033532/https://www.ebay.com/"]
|
URL_LIST_old=["http://www.amazon.it","https://web.archive.org/web/20230630235957/http://www.amazon.com/", "https://web.archive.org/web/20251130033532/https://www.ebay.com/"]
|
||||||
URL_LIST=["https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://bestbuy.com","https://macys.com","https://homedepot.com","https://costco.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.ansa.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
|
URL_LIST_old=["https://www.amazon.com/s?k=magllioni&crid=CGD2UWO33O58&sprefix=magllioni%2Caps%2C209&ref=nb_sb_noss","https://web.archive.org/web/20251011214807/https://www.ilfattoquotidiano.it/","https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
|
||||||
|
URL_LIST=["https://giove.isti.cnr.it/users/manca/eBay.html","http://www.amazon.it"]
|
||||||
|
|
@ -31,6 +31,45 @@ import sqlite3
|
||||||
WCAG_VALIDATOR_RESTSERVER_HEADERS = [("Content-Type", "application/json")]
|
WCAG_VALIDATOR_RESTSERVER_HEADERS = [("Content-Type", "application/json")]
|
||||||
|
|
||||||
|
|
||||||
|
def process_dataframe(db_path, url, updated_df, user_state={}):
|
||||||
|
|
||||||
|
print("Processing dataframe to adjust columns...")
|
||||||
|
column_rating_name = "User Assessment for LLM Proposal"
|
||||||
|
|
||||||
|
# Get the assessment column
|
||||||
|
try:
|
||||||
|
updated_df[column_rating_name] = updated_df[column_rating_name].astype(int)
|
||||||
|
except ValueError:
|
||||||
|
return "Error: User Assessment for LLM Proposal must be an integer"
|
||||||
|
|
||||||
|
if (updated_df[column_rating_name] < 1).any() or (
|
||||||
|
updated_df[column_rating_name] > 5
|
||||||
|
).any():
|
||||||
|
return "Error: User Assessment for LLM Proposal must be between 1 and 5"
|
||||||
|
|
||||||
|
dataframe_json = updated_df.to_json(orient="records")
|
||||||
|
connection_db = sqlite3.connect(db_path)
|
||||||
|
json_user_str = json.dumps({"username": user_state["username"]}, ensure_ascii=False)
|
||||||
|
try:
|
||||||
|
# insert after everything to keep datetime aligned
|
||||||
|
db_persistence_insert(
|
||||||
|
connection_db=connection_db,
|
||||||
|
insert_type="wcag_user_llm_alttext_assessments",
|
||||||
|
page_url=url,
|
||||||
|
user=json_user_str,
|
||||||
|
llm_model="",
|
||||||
|
json_in_str=dataframe_json, # to improve
|
||||||
|
json_out_str="done via UI",
|
||||||
|
table="wcag_user_assessments",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
print("Error inserting user assessment into database:", str(e))
|
||||||
|
finally:
|
||||||
|
if connection_db:
|
||||||
|
connection_db.close()
|
||||||
|
return "User assessment saved successfully!"
|
||||||
|
|
||||||
|
|
||||||
def load_images_from_json(json_input):
|
def load_images_from_json(json_input):
|
||||||
"""Extract URLs and alt text from JSON and create HTML gallery"""
|
"""Extract URLs and alt text from JSON and create HTML gallery"""
|
||||||
try:
|
try:
|
||||||
|
|
@ -40,7 +79,7 @@ def load_images_from_json(json_input):
|
||||||
return "No images found in JSON", ""
|
return "No images found in JSON", ""
|
||||||
|
|
||||||
images = data["images"]
|
images = data["images"]
|
||||||
info_text = f"Found {len(images)} image(s)\n"
|
info_text = f"Found {len(images)} image(s)"
|
||||||
print(f"Found {len(data['images'])} image(s)")
|
print(f"Found {len(data['images'])} image(s)")
|
||||||
|
|
||||||
# Create HTML gallery with checkboxes and assessment forms
|
# Create HTML gallery with checkboxes and assessment forms
|
||||||
|
|
@ -58,14 +97,14 @@ def load_images_from_json(json_input):
|
||||||
padding: 10px;
|
padding: 10px;
|
||||||
background: white;
|
background: white;
|
||||||
}
|
}
|
||||||
.image-card:has(input:checked) {
|
.image-card:has(input[type="checkbox"]:checked) {
|
||||||
border-color: #2196F3;
|
border-color: #2196F3;
|
||||||
background: #a7c1c1;
|
background: #a7c1c1;
|
||||||
}
|
}
|
||||||
.image-card img {
|
.image-card img {
|
||||||
width: 100%;
|
width: 100%;
|
||||||
height: 200px;
|
height: 200px;
|
||||||
object-fit: cover;
|
object-fit: scale-down;
|
||||||
border-radius: 4px;
|
border-radius: 4px;
|
||||||
}
|
}
|
||||||
.image-info {
|
.image-info {
|
||||||
|
|
@ -93,7 +132,7 @@ def load_images_from_json(json_input):
|
||||||
display: none;
|
display: none;
|
||||||
margin-top: 15px;
|
margin-top: 15px;
|
||||||
padding: 10px;
|
padding: 10px;
|
||||||
background: #f0f7ff;
|
background: #7896b9;
|
||||||
border-radius: 4px;
|
border-radius: 4px;
|
||||||
border: 1px solid #2196F3;
|
border: 1px solid #2196F3;
|
||||||
}
|
}
|
||||||
|
|
@ -109,18 +148,22 @@ def load_images_from_json(json_input):
|
||||||
margin-bottom: 5px;
|
margin-bottom: 5px;
|
||||||
font-size: 13px;
|
font-size: 13px;
|
||||||
}
|
}
|
||||||
.range-container {
|
|
||||||
|
.radio-container {
|
||||||
|
display: flex;
|
||||||
|
gap: 15px;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.radio-option {
|
||||||
display: flex;
|
display: flex;
|
||||||
align-items: center;
|
align-items: center;
|
||||||
gap: 10px;
|
gap: 5px;
|
||||||
|
cursor: pointer;
|
||||||
}
|
}
|
||||||
.range-container input[type="range"] {
|
|
||||||
flex: 1;
|
.radio-label {
|
||||||
}
|
font-weight: 500;
|
||||||
.range-value {
|
|
||||||
font-weight: bold;
|
|
||||||
min-width: 20px;
|
|
||||||
text-align: center;
|
|
||||||
}
|
}
|
||||||
textarea {
|
textarea {
|
||||||
width: 100%;
|
width: 100%;
|
||||||
|
|
@ -166,12 +209,28 @@ def load_images_from_json(json_input):
|
||||||
<div id="panel-{idx}" class="assessment-panel">
|
<div id="panel-{idx}" class="assessment-panel">
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label>Rate current alt-text:</label>
|
<label>Rate current alt-text:</label>
|
||||||
<div class="range-container">
|
<div class="radio-container">
|
||||||
<input type="range" min="1" max="5" value="3"
|
<label class="radio-option">
|
||||||
class="assessment-range" data-index="{idx}"
|
<input type="radio" name="assessment-{idx}" value="1" data-index="{idx}">
|
||||||
oninput="document.getElementById('range-value-{idx}').textContent = this.value">
|
<span class="radio-label">1</span>
|
||||||
<span id="range-value-{idx}" class="range-value">3</span>
|
</label>
|
||||||
</div>
|
<label class="radio-option">
|
||||||
|
<input type="radio" name="assessment-{idx}" value="2" data-index="{idx}">
|
||||||
|
<span class="radio-label">2</span>
|
||||||
|
</label>
|
||||||
|
<label class="radio-option">
|
||||||
|
<input type="radio" name="assessment-{idx}" value="3" data-index="{idx}" checked>
|
||||||
|
<span class="radio-label">3</span>
|
||||||
|
</label>
|
||||||
|
<label class="radio-option">
|
||||||
|
<input type="radio" name="assessment-{idx}" value="4" data-index="{idx}">
|
||||||
|
<span class="radio-label">4</span>
|
||||||
|
</label>
|
||||||
|
<label class="radio-option">
|
||||||
|
<input type="radio" name="assessment-{idx}" value="5" data-index="{idx}">
|
||||||
|
<span class="radio-label">5</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label>New alt-text:</label>
|
<label>New alt-text:</label>
|
||||||
|
|
@ -226,7 +285,7 @@ def load_llm_assessment_from_json(json_input):
|
||||||
{
|
{
|
||||||
"Original Alt Text": alt_text_original,
|
"Original Alt Text": alt_text_original,
|
||||||
"LLM Assessment": original_alt_text_assessment,
|
"LLM Assessment": original_alt_text_assessment,
|
||||||
"Proposed Alt Text": new_alt_text,
|
"LLM Proposed Alt Text": new_alt_text,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -257,7 +316,7 @@ def make_alttext_llm_assessment_api_call(
|
||||||
if not selected_images or len(selected_images) == 0:
|
if not selected_images or len(selected_images) == 0:
|
||||||
info_text = "No images selected"
|
info_text = "No images selected"
|
||||||
print(info_text)
|
print(info_text)
|
||||||
return pd.DataFrame()
|
return "LLM assessment not started", pd.DataFrame()
|
||||||
|
|
||||||
# prepare data for insertion
|
# prepare data for insertion
|
||||||
json_in_str = {}
|
json_in_str = {}
|
||||||
|
|
@ -267,6 +326,7 @@ def make_alttext_llm_assessment_api_call(
|
||||||
user_assessments = []
|
user_assessments = []
|
||||||
user_new_alt_texts = []
|
user_new_alt_texts = []
|
||||||
selected_image_id = []
|
selected_image_id = []
|
||||||
|
user_assessments_llm_proposal = []
|
||||||
for img in selected_images:
|
for img in selected_images:
|
||||||
selected_urls.append(img["image_url"])
|
selected_urls.append(img["image_url"])
|
||||||
selected_alt_text_original.append(img["original_alt_text"])
|
selected_alt_text_original.append(img["original_alt_text"])
|
||||||
|
|
@ -275,6 +335,7 @@ def make_alttext_llm_assessment_api_call(
|
||||||
selected_image_id.append(
|
selected_image_id.append(
|
||||||
int(img["image_index"]) + 1
|
int(img["image_index"]) + 1
|
||||||
) # add the id selected (+1 for index alignment)
|
) # add the id selected (+1 for index alignment)
|
||||||
|
user_assessments_llm_proposal.append(3) # default value for now
|
||||||
json_in_str["images_urls"] = selected_urls
|
json_in_str["images_urls"] = selected_urls
|
||||||
json_in_str["images_alt_text_original"] = selected_alt_text_original
|
json_in_str["images_alt_text_original"] = selected_alt_text_original
|
||||||
json_out_str["user_assessments"] = user_assessments
|
json_out_str["user_assessments"] = user_assessments
|
||||||
|
|
@ -302,9 +363,17 @@ def make_alttext_llm_assessment_api_call(
|
||||||
)
|
)
|
||||||
# return response
|
# return response
|
||||||
info_dataframe = load_llm_assessment_from_json(response)
|
info_dataframe = load_llm_assessment_from_json(response)
|
||||||
|
|
||||||
|
# add the UI ids and other fields to to api response
|
||||||
info_dataframe.insert(
|
info_dataframe.insert(
|
||||||
0, "Image #", selected_image_id
|
0, "Image #", selected_image_id
|
||||||
) # add the UI ids from to api response
|
) # add the UI ids from to api response
|
||||||
|
info_dataframe.insert(2, "User Assessment", user_assessments)
|
||||||
|
|
||||||
|
info_dataframe.insert(3, "User Proposed Alt Text", user_new_alt_texts)
|
||||||
|
info_dataframe["User Assessment for LLM Proposal"] = (
|
||||||
|
user_assessments_llm_proposal
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"error": str(e)}
|
return {"error": str(e)}
|
||||||
|
|
@ -326,7 +395,7 @@ def make_alttext_llm_assessment_api_call(
|
||||||
finally:
|
finally:
|
||||||
if connection_db:
|
if connection_db:
|
||||||
connection_db.close()
|
connection_db.close()
|
||||||
return info_dataframe
|
return "LLM assessment completed", info_dataframe
|
||||||
|
|
||||||
|
|
||||||
def make_image_extraction_api_call(
|
def make_image_extraction_api_call(
|
||||||
|
|
@ -449,9 +518,10 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
||||||
images_number = gr.Slider(
|
images_number = gr.Slider(
|
||||||
5,
|
5,
|
||||||
100,
|
100,
|
||||||
value=30,
|
value=50,
|
||||||
step=5,
|
step=5,
|
||||||
label="Max number of images to retrieve",
|
label="Max number of images to retrieve",
|
||||||
|
visible=False,
|
||||||
)
|
)
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
|
|
||||||
|
|
@ -459,39 +529,54 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
||||||
"Extract Images & Alt Texts", variant="primary"
|
"Extract Images & Alt Texts", variant="primary"
|
||||||
)
|
)
|
||||||
alttext_api_call_btn = gr.Button(
|
alttext_api_call_btn = gr.Button(
|
||||||
"Alt Text LLM Assessment",
|
"Start LLM Assessment",
|
||||||
variant="secondary",
|
variant="secondary",
|
||||||
interactive=False,
|
interactive=False,
|
||||||
)
|
)
|
||||||
|
image_info_output = gr.Textbox(
|
||||||
|
label="Activity tracking", lines=1
|
||||||
|
)
|
||||||
|
|
||||||
with gr.Row():
|
with gr.Row(visible=False) as alttext_results_row:
|
||||||
|
|
||||||
image_info_output = gr.Textbox(label="Managed Images", lines=5)
|
|
||||||
|
|
||||||
# Use DataFrame for tabular output
|
# Use DataFrame for tabular output
|
||||||
alttext_info_output = gr.DataFrame(
|
alttext_info_output = gr.DataFrame(
|
||||||
headers=[
|
headers=[
|
||||||
"Image #",
|
"Image #",
|
||||||
"Original Alt Text",
|
"Original Alt Text",
|
||||||
|
"User Assessment",
|
||||||
|
"User Proposed Alt Text",
|
||||||
"LLM Assessment",
|
"LLM Assessment",
|
||||||
"Proposed Alt Text",
|
"LLM Proposed Alt Text",
|
||||||
|
"User Assessment for LLM Proposal",
|
||||||
],
|
],
|
||||||
label="LLM Assessment Results",
|
label="LLM Assessment Results",
|
||||||
wrap=True, # Wrap text in cells
|
wrap=True, # Wrap text in cells
|
||||||
interactive=False,
|
interactive=True,
|
||||||
|
scale=7,
|
||||||
)
|
)
|
||||||
|
with gr.Column():
|
||||||
|
save_user_assessment_btn = gr.Button(
|
||||||
|
"Save Your Assessment",
|
||||||
|
variant="secondary",
|
||||||
|
interactive=True,
|
||||||
|
scale=1,
|
||||||
|
)
|
||||||
|
gr.Markdown(
|
||||||
|
"ℹ Info: to assess the LLM output, only the values for the 'User Assessment for LLM Proposal' column need to be changed."
|
||||||
|
)
|
||||||
|
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
|
|
||||||
gallery_html = gr.HTML(label="Image Gallery")
|
gallery_html = gr.HTML(label="Image Gallery")
|
||||||
|
|
||||||
image_extraction_api_call_btn.click(
|
image_extraction_api_call_btn.click(
|
||||||
fn=lambda: ("", "", pd.DataFrame(), gr.Button(interactive=False)),
|
fn=lambda: ("", "", gr.update(visible=False), gr.Button(interactive=False)),
|
||||||
inputs=[],
|
inputs=[],
|
||||||
outputs=[
|
outputs=[
|
||||||
image_info_output,
|
image_info_output,
|
||||||
gallery_html,
|
gallery_html,
|
||||||
alttext_info_output,
|
alttext_results_row,
|
||||||
alttext_api_call_btn,
|
alttext_api_call_btn,
|
||||||
],
|
],
|
||||||
).then(
|
).then(
|
||||||
|
|
@ -515,7 +600,7 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
||||||
wcag_rest_server_url_state,
|
wcag_rest_server_url_state,
|
||||||
user_state,
|
user_state,
|
||||||
],
|
],
|
||||||
outputs=[alttext_info_output],
|
outputs=[image_info_output, alttext_info_output],
|
||||||
js="""
|
js="""
|
||||||
(url_input,gallery_html) => {
|
(url_input,gallery_html) => {
|
||||||
const checkboxes = document.querySelectorAll('.image-checkbox:checked');
|
const checkboxes = document.querySelectorAll('.image-checkbox:checked');
|
||||||
|
|
@ -533,7 +618,8 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
||||||
const index = checkbox.dataset.index;
|
const index = checkbox.dataset.index;
|
||||||
const imageUrl = checkbox.dataset.imgurl;
|
const imageUrl = checkbox.dataset.imgurl;
|
||||||
const originalAlt = document.querySelector('.original-alt[data-index="' + index + '"]').value;
|
const originalAlt = document.querySelector('.original-alt[data-index="' + index + '"]').value;
|
||||||
const assessment = document.querySelector('.assessment-range[data-index="' + index + '"]').value;
|
const assessment = document.querySelector('input[name="assessment-' + index + '"]:checked').value;
|
||||||
|
console.log("assessment:",assessment)
|
||||||
const newAltText = document.querySelector('.new-alt-text[data-index="' + index + '"]').value;
|
const newAltText = document.querySelector('.new-alt-text[data-index="' + index + '"]').value;
|
||||||
|
|
||||||
selectedData.push({
|
selectedData.push({
|
||||||
|
|
@ -548,6 +634,16 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
||||||
return [url_input,JSON.stringify(selectedData)];
|
return [url_input,JSON.stringify(selectedData)];
|
||||||
}
|
}
|
||||||
""",
|
""",
|
||||||
|
).then(
|
||||||
|
fn=lambda: gr.update(visible=True),
|
||||||
|
inputs=[],
|
||||||
|
outputs=[alttext_results_row],
|
||||||
|
)
|
||||||
|
|
||||||
|
save_user_assessment_btn.click(
|
||||||
|
fn=process_dataframe,
|
||||||
|
inputs=[db_path_state, url_input, alttext_info_output, user_state],
|
||||||
|
outputs=[image_info_output],
|
||||||
)
|
)
|
||||||
|
|
||||||
# placed here at the end to give full contents visibility to events
|
# placed here at the end to give full contents visibility to events
|
||||||
|
|
|
||||||
|
|
@ -55,7 +55,6 @@ class ImageExtractor:
|
||||||
# Also check query parameters (e.g., format=jpeg)
|
# Also check query parameters (e.g., format=jpeg)
|
||||||
return any(fmt in img_url.lower() for fmt in self.SUPPORTED_FORMATS)
|
return any(fmt in img_url.lower() for fmt in self.SUPPORTED_FORMATS)
|
||||||
|
|
||||||
|
|
||||||
async def _download_image(self, image_url, output_dir="images") -> None:
|
async def _download_image(self, image_url, output_dir="images") -> None:
|
||||||
|
|
||||||
# Parse the URL to get the path without query parameters
|
# Parse the URL to get the path without query parameters
|
||||||
|
|
@ -79,7 +78,7 @@ class ImageExtractor:
|
||||||
|
|
||||||
# Sanitize image name (remove special characters, limit length)
|
# Sanitize image name (remove special characters, limit length)
|
||||||
image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
|
image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
|
||||||
image_name = image_name[:200] # Limit filename length
|
image_name = image_name[:50] # Limit filename length
|
||||||
|
|
||||||
# If name is empty after sanitization, create a hash-based name
|
# If name is empty after sanitization, create a hash-based name
|
||||||
if not image_name:
|
if not image_name:
|
||||||
|
|
@ -88,13 +87,15 @@ class ImageExtractor:
|
||||||
image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
|
image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
|
||||||
|
|
||||||
# Download the image
|
# Download the image
|
||||||
print("getting image:", image_url)
|
print("getting image url:", image_url)
|
||||||
|
print("getting image name:", image_name)
|
||||||
response = requests.get(image_url, timeout=10)
|
response = requests.get(image_url, timeout=10)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Save the image
|
# Save the image
|
||||||
output_path = os.path.join(output_dir, f"{image_name}.{ext}")
|
output_path = os.path.join(output_dir, f"{image_name}.{ext}")
|
||||||
|
print("saving image to:", output_path)
|
||||||
with open(output_path, "wb") as f:
|
with open(output_path, "wb") as f:
|
||||||
f.write(response.content)
|
f.write(response.content)
|
||||||
print(f"Saved: {output_path}")
|
print(f"Saved: {output_path}")
|
||||||
|
|
@ -292,43 +293,36 @@ class ImageExtractor:
|
||||||
error_msg = f"Error extracting context: {str(e)}"
|
error_msg = f"Error extracting context: {str(e)}"
|
||||||
return error_msg, error_msg, error_msg
|
return error_msg, error_msg, error_msg
|
||||||
|
|
||||||
async def _get_page_metadata(self, page) -> Dict[str, Optional[str]]:
|
async def _get_page_metadata(self, page):
|
||||||
"""Extract page metadata including title, description, and keywords."""
|
"""Extract page metadata in one fast evaluate call. Batch DOM extraction inside one evaluate()."""
|
||||||
metadata = {
|
return await page.evaluate(
|
||||||
"title": await page.title(),
|
"""
|
||||||
"description": None,
|
() => {
|
||||||
"keywords": None,
|
const metadata = {
|
||||||
"headings": [],
|
title: document.title || null,
|
||||||
}
|
description: null,
|
||||||
|
keywords: null,
|
||||||
|
headings: []
|
||||||
|
};
|
||||||
|
|
||||||
# Extract meta description
|
const desc = document.querySelector('meta[name="description"]');
|
||||||
try:
|
const keys = document.querySelector('meta[name="keywords"]');
|
||||||
description = await page.locator('meta[name="description"]').get_attribute(
|
metadata.description = desc?.content || null;
|
||||||
"content"
|
metadata.keywords = keys?.content || null;
|
||||||
)
|
|
||||||
metadata["description"] = description
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Extract meta keywords
|
// Collect all headings h1–h6
|
||||||
try:
|
const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
|
||||||
keywords = await page.locator('meta[name="keywords"]').get_attribute(
|
metadata.headings = Array.from(allHeadings)
|
||||||
"content"
|
.map(h => ({
|
||||||
)
|
level: parseInt(h.tagName.substring(1), 10),
|
||||||
metadata["keywords"] = keywords
|
text: h.textContent.trim()
|
||||||
except:
|
}))
|
||||||
pass
|
.filter(h => h.text.length > 0);
|
||||||
|
|
||||||
# Extract all headings (h1-h6)
|
return metadata;
|
||||||
for level in range(1, 7):
|
}
|
||||||
headings = await page.locator(f"h{level}").all_text_contents()
|
"""
|
||||||
for heading in headings:
|
)
|
||||||
if heading.strip():
|
|
||||||
metadata["headings"].append(
|
|
||||||
{"level": level, "text": heading.strip()}
|
|
||||||
)
|
|
||||||
|
|
||||||
return metadata
|
|
||||||
|
|
||||||
async def extract_images(
|
async def extract_images(
|
||||||
self, extract_context=True, specific_images_urls=[]
|
self, extract_context=True, specific_images_urls=[]
|
||||||
|
|
@ -344,15 +338,18 @@ class ImageExtractor:
|
||||||
page = await browser.new_page()
|
page = await browser.new_page()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
#await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads
|
# await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads
|
||||||
# The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
|
# The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
|
||||||
# ---alternative method2: use if there is total awareness of the page's loading pattern and want faster, more reliable execution
|
# ---alternative method2: use if there is total awareness of the page's loading pattern and want faster, more reliable execution
|
||||||
await page.goto(self.url, timeout=50000, wait_until="load")# deafult timeout=30000, 30sec
|
await page.goto(
|
||||||
|
self.url, timeout=50000, wait_until="load"
|
||||||
|
) # deafult timeout=30000, 30sec
|
||||||
# Wait for page to load completely
|
# Wait for page to load completely
|
||||||
await page.wait_for_timeout(2000) # Wait for dynamic content
|
await page.wait_for_timeout(2000) # Wait for dynamic content
|
||||||
# -----
|
# -----
|
||||||
|
|
||||||
if extract_context:
|
if extract_context:
|
||||||
|
print("Getting page metadata...")
|
||||||
# Get page metadata once
|
# Get page metadata once
|
||||||
page_metadata = await self._get_page_metadata(page)
|
page_metadata = await self._get_page_metadata(page)
|
||||||
page_title = page_metadata["title"]
|
page_title = page_metadata["title"]
|
||||||
|
|
@ -367,15 +364,41 @@ class ImageExtractor:
|
||||||
|
|
||||||
if len(specific_images_urls) == 0:
|
if len(specific_images_urls) == 0:
|
||||||
# Find all img elements
|
# Find all img elements
|
||||||
print("Extracting all images from the page",self.url)
|
print("Extracting all images from the page", self.url)
|
||||||
img_elements = await page.locator("img").all()
|
# img_elements = await page.locator("img").all()
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
"Extracting specific images from the page:",
|
"Extracting specific images from the page:",
|
||||||
self.url,
|
self.url,
|
||||||
specific_images_urls,
|
specific_images_urls,
|
||||||
)
|
)
|
||||||
img_elements = []
|
# img_elements = await page.locator("img").all()
|
||||||
|
|
||||||
|
""" # method 3: optimized approach
|
||||||
|
# Get all src attributes in one go
|
||||||
|
all_img_elements = await page.locator("img").all()
|
||||||
|
all_srcs = await page.locator("img").evaluate_all(
|
||||||
|
"elements => elements.map(el => el.src || '')"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Filter with the pre-fetched src values
|
||||||
|
img_elements = [
|
||||||
|
elem for elem, src in zip(all_img_elements, all_srcs)
|
||||||
|
if src in specific_images_urls
|
||||||
|
]
|
||||||
|
"""
|
||||||
|
|
||||||
|
""" #method 2: single pass to find matching images
|
||||||
|
for img_element in all_img_elements: #This is more efficient than making separate locator queries for each specific URL and avoids timeout issues.
|
||||||
|
try:
|
||||||
|
src = await img_element.get_attribute("src")
|
||||||
|
print("found image src:", src)
|
||||||
|
if src in specific_images_urls:
|
||||||
|
img_elements.append(img_element)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error getting src attribute from image: {str(e)}")"""
|
||||||
|
|
||||||
|
""" # method 1: separate locator queries for each specific URL
|
||||||
for url in specific_images_urls:
|
for url in specific_images_urls:
|
||||||
try:
|
try:
|
||||||
img_element = await page.locator(
|
img_element = await page.locator(
|
||||||
|
|
@ -384,8 +407,11 @@ class ImageExtractor:
|
||||||
if img_element:
|
if img_element:
|
||||||
img_elements.append(img_element)
|
img_elements.append(img_element)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error locating image with src {url}: {str(e)}")
|
print(f"Error locating image with src {url}: {str(e)}")"""
|
||||||
|
|
||||||
|
img_elements = await page.locator(
|
||||||
|
"img"
|
||||||
|
).all() # unified approach to start with all images and filter later
|
||||||
image_source_list = [] # avoid multiple check for the same image url
|
image_source_list = [] # avoid multiple check for the same image url
|
||||||
images_data = []
|
images_data = []
|
||||||
|
|
||||||
|
|
@ -404,6 +430,12 @@ class ImageExtractor:
|
||||||
if not src:
|
if not src:
|
||||||
print("image has no src attribute. Skipped.")
|
print("image has no src attribute. Skipped.")
|
||||||
continue
|
continue
|
||||||
|
if (
|
||||||
|
src not in specific_images_urls
|
||||||
|
and len(specific_images_urls) > 0
|
||||||
|
):
|
||||||
|
# print("image src",src,"not in the specific images list. Skipped.")
|
||||||
|
continue
|
||||||
|
|
||||||
if src not in image_source_list:
|
if src not in image_source_list:
|
||||||
image_source_list.append(src)
|
image_source_list.append(src)
|
||||||
|
|
@ -434,6 +466,7 @@ class ImageExtractor:
|
||||||
alt_text = await img.get_attribute("alt") or ""
|
alt_text = await img.get_attribute("alt") or ""
|
||||||
|
|
||||||
if extract_context:
|
if extract_context:
|
||||||
|
print("Extracting context for image:", img_url)
|
||||||
# Get surrounding HTML context (full, immediate, and nearby)
|
# Get surrounding HTML context (full, immediate, and nearby)
|
||||||
html_context, immediate_context, nearby_text = (
|
html_context, immediate_context, nearby_text = (
|
||||||
await self._get_element_context(page, img)
|
await self._get_element_context(page, img)
|
||||||
|
|
|
||||||
|
|
@ -76,39 +76,7 @@ class MLLMManager:
|
||||||
return payload
|
return payload
|
||||||
|
|
||||||
def get_alt_text_system_prompt(self):
|
def get_alt_text_system_prompt(self):
|
||||||
system_prompt_old = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
|
|
||||||
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
|
|
||||||
the same information as the image, and should be able to substitute for the non-text content. The text alternative would
|
|
||||||
be brief but as informative as possible.
|
|
||||||
|
|
||||||
Follow these instructions carefully:
|
|
||||||
1. You will be provided as input with the following:
|
|
||||||
- The image found on the webpage.
|
|
||||||
- The associated alternative text. When the alt-text is empty or absent, you will be explicitly informed.
|
|
||||||
- The surrounding context of the image.
|
|
||||||
- The page title, headings and the content of the “keywords” and “description” <meta> tag, if found.
|
|
||||||
|
|
||||||
2. Determine the function and purpose of the image by analyzing these elements. Take into account the purpose and function
|
|
||||||
of the associated image by considering the page context. Check also if the image is, or is associated with, a link or a button,
|
|
||||||
and consider this in your judgement. If the image contains text use that as part of the context.
|
|
||||||
|
|
||||||
3. Provide a final assessment based on the following:
|
|
||||||
- 'success' if you can assess with 'sufficient certainty' the alt-text is appropriate in relation to the image purpose,
|
|
||||||
- 'failure' if you can assess with 'sufficient certainty' that the alt-text is NOT appropriate,
|
|
||||||
- 'warning' if you cannot determine with 'sufficient certainty'.
|
|
||||||
where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80
|
|
||||||
|
|
||||||
4. The original alt-text assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only.
|
|
||||||
|
|
||||||
5. Provide a brief reasoning for your judgment. If the image contains text, write it verbatim. Your response should be in English.
|
|
||||||
|
|
||||||
6. Keep your response within 150 words.
|
|
||||||
|
|
||||||
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
|
|
||||||
|
|
||||||
8. Here is the JSON format the results must have:
|
|
||||||
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
|
|
||||||
|
|
||||||
# https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
|
# https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
|
||||||
system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
|
system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
|
||||||
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
|
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
|
||||||
|
|
@ -122,7 +90,7 @@ class MLLMManager:
|
||||||
What purpose does it fulfill?
|
What purpose does it fulfill?
|
||||||
If I could not use the image content, what words would I use to convey the same function and/or information?
|
If I could not use the image content, what words would I use to convey the same function and/or information?
|
||||||
|
|
||||||
When image content contains words that are important to understanding the content, the alt text should include those words
|
When image content contains words that are important to understanding the content, the alt text should include those words.
|
||||||
|
|
||||||
Follow these instructions carefully:
|
Follow these instructions carefully:
|
||||||
1. You will be provided as input with the following:
|
1. You will be provided as input with the following:
|
||||||
|
|
@ -147,7 +115,7 @@ class MLLMManager:
|
||||||
|
|
||||||
6. Keep your response within 150 words.
|
6. Keep your response within 150 words.
|
||||||
|
|
||||||
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
|
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. Use the same language as the original alt-text.
|
||||||
|
|
||||||
8. Here is the JSON format the results must have:
|
8. Here is the JSON format the results must have:
|
||||||
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
|
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
|
||||||
|
|
@ -181,7 +149,7 @@ class MLLMManager:
|
||||||
print("Using end_point:", self.end_point)
|
print("Using end_point:", self.end_point)
|
||||||
|
|
||||||
alt_text_system_prompt = self.get_alt_text_system_prompt()
|
alt_text_system_prompt = self.get_alt_text_system_prompt()
|
||||||
print("alt_text_system_prompt:", alt_text_system_prompt)
|
#print("alt_text_system_prompt:", alt_text_system_prompt)
|
||||||
|
|
||||||
mllm_responses = []
|
mllm_responses = []
|
||||||
for img_info in images:
|
for img_info in images:
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,7 @@ class ExtractImagesRoutes:
|
||||||
self, request: Request, data: ExtractImages
|
self, request: Request, data: ExtractImages
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Return the alt text validation assessment based on WCAG guidelines"""
|
"""Return the alt text validation assessment based on WCAG guidelines"""
|
||||||
|
print("Received extract images request.")
|
||||||
try:
|
try:
|
||||||
json_content = json.loads(data.model_dump_json())
|
json_content = json.loads(data.model_dump_json())
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -53,6 +53,7 @@ class WCAGAltTextValuationRoutes:
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Return the alt text validation assessment based on WCAG guidelines"""
|
"""Return the alt text validation assessment based on WCAG guidelines"""
|
||||||
try:
|
try:
|
||||||
|
print("Received wcag alttext validation request.")
|
||||||
json_content = json.loads(data.model_dump_json())
|
json_content = json.loads(data.model_dump_json())
|
||||||
mllm_model_id = self.mllm_settings["mllm_model_id"]
|
mllm_model_id = self.mllm_settings["mllm_model_id"]
|
||||||
|
|
||||||
|
|
@ -67,7 +68,12 @@ class WCAGAltTextValuationRoutes:
|
||||||
.replace(":", "")
|
.replace(":", "")
|
||||||
.replace("//", "_")
|
.replace("//", "_")
|
||||||
.replace("/", "_")
|
.replace("/", "_")
|
||||||
|
.replace("%2", "_")
|
||||||
|
.replace("?", "_")
|
||||||
|
.replace("=", "_")
|
||||||
|
.replace("&", "_")
|
||||||
)
|
)
|
||||||
|
url_path=url_path[:50] # limit length
|
||||||
now = datetime.now(timezone.utc)
|
now = datetime.now(timezone.utc)
|
||||||
now_str = now.strftime("%Y_%m_%d-%H_%M_%S")
|
now_str = now.strftime("%Y_%m_%d-%H_%M_%S")
|
||||||
folder_str = mllm_model_id.replace(":", "-") + "_" + now_str
|
folder_str = mllm_model_id.replace(":", "-") + "_" + now_str
|
||||||
|
|
@ -93,7 +99,7 @@ class WCAGAltTextValuationRoutes:
|
||||||
# Extract images
|
# Extract images
|
||||||
logging.info(f"Extracting images from: {json_content['page_url']}")
|
logging.info(f"Extracting images from: {json_content['page_url']}")
|
||||||
images = await image_extractor.extract_images(
|
images = await image_extractor.extract_images(
|
||||||
specific_images_urls=json_content["specific_images_urls"]
|
specific_images_urls=json_content["specific_images_urls"],extract_context=True
|
||||||
)
|
)
|
||||||
# MLLM settings
|
# MLLM settings
|
||||||
mllm_end_point = self.mllm_settings["mllm_end_point"]
|
mllm_end_point = self.mllm_settings["mllm_end_point"]
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,541 @@
|
||||||
|
# to launch: python build_dataset_from_folder.py --ref_path "" --push_to_hub --repo_id "nicolaleo/LLM-alt-text-assessment" --token ""
|
||||||
|
|
||||||
|
from datasets import Dataset, DatasetDict
|
||||||
|
import datasets
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from PIL import Image
|
||||||
|
import hashlib
|
||||||
|
import urllib.parse
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# Dataset metadata
|
||||||
|
_DESCRIPTION = """\
|
||||||
|
Dataset for image alt-text assessment and improvement using MLLM responses.
|
||||||
|
Contains images, original alt-texts, quality assessments, and improved versions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_CITATION = """\
|
||||||
|
@misc{alt_text_assessment,
|
||||||
|
title={Alt-Text Assessment Dataset},
|
||||||
|
year={2024}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class AltTextDataset(datasets.GeneratorBasedBuilder):
|
||||||
|
"""Dataset for alt-text assessment with images and MLLM responses."""
|
||||||
|
|
||||||
|
VERSION = datasets.Version("1.0.0")
|
||||||
|
|
||||||
|
def _info(self):
|
||||||
|
return datasets.DatasetInfo(
|
||||||
|
description=_DESCRIPTION,
|
||||||
|
features=datasets.Features({
|
||||||
|
"image": datasets.Image(),
|
||||||
|
"image_url": datasets.Value("string"),
|
||||||
|
"alt_text": datasets.Value("string"),
|
||||||
|
"original_alt_text_assessment": datasets.Value("string"),
|
||||||
|
"assessment": datasets.Value("string"),
|
||||||
|
"evaluation_result": datasets.Value("string"),
|
||||||
|
"new_alt_text": datasets.Value("string"),
|
||||||
|
#"source_folder": datasets.Value("string"),
|
||||||
|
}),
|
||||||
|
citation=_CITATION,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _split_generators(self, dl_manager):
|
||||||
|
"""Define data splits."""
|
||||||
|
return [
|
||||||
|
datasets.SplitGenerator(
|
||||||
|
name=datasets.Split.TRAIN,
|
||||||
|
gen_kwargs={
|
||||||
|
"json_filepath": "data.json",
|
||||||
|
"images_dir": "images"
|
||||||
|
},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
def _generate_examples(self, json_filepath, images_dir):
|
||||||
|
"""Generate examples from JSON file and image directory."""
|
||||||
|
with open(json_filepath, encoding="utf-8") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
images_path = Path(images_dir)
|
||||||
|
|
||||||
|
for idx, entry in enumerate(data):
|
||||||
|
image_url = entry["image_url"]
|
||||||
|
image_filename = url_to_filename(image_url)
|
||||||
|
image_path = images_path / image_filename
|
||||||
|
|
||||||
|
# Load image if exists, otherwise None
|
||||||
|
image = str(image_path) if image_path.exists() else None
|
||||||
|
|
||||||
|
yield idx, {
|
||||||
|
"image": image,
|
||||||
|
"image_url": image_url,
|
||||||
|
"alt_text": entry["alt_text"],
|
||||||
|
"original_alt_text_assessment": entry["mllm_response"]["original_alt_text_assessment"],
|
||||||
|
"assessment": entry["mllm_response"]["assessment"],
|
||||||
|
"evaluation_result": entry["mllm_response"]["evaluation_result"],
|
||||||
|
"new_alt_text": entry["mllm_response"]["new_alt_text"],
|
||||||
|
}
|
||||||
|
|
||||||
|
'''
|
||||||
|
# ============================================================================
|
||||||
|
# SIMPLE USAGE FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def url_to_filename(image_url):  # must mirror the save step in the image_extractor dependence
    """
    Convert an image URL into the sanitized filename used when saving images.

    Mirrors the naming logic of the image extractor so that dataset entries can
    be matched back to the image files on disk.

    Args:
        image_url: The image URL.

    Returns:
        Sanitized filename with a whitelisted extension (jpg/jpeg/png/gif/webp;
        anything unrecognized falls back to "jpg").
    """
    # Parse the URL so query parameters are excluded from the filename.
    parsed_url = urllib.parse.urlparse(image_url)
    url_path = parsed_url.path

    # The last path segment is the candidate filename.
    filename = url_path.split("/")[-1]
    # Fix: the original f-string had no placeholder and printed a literal
    # instead of the extracted segment.
    print(f"Original filename: '{filename}'")

    # Split filename and extension; no dot means no extension.
    if "." in filename:
        image_name, ext = filename.rsplit(".", 1)
        ext = ext.lower()
    else:
        image_name = filename
        ext = "jpg"

    # Only keep known raster-image extensions; default everything else to jpg.
    if ext not in ["jpg", "jpeg", "png", "gif", "webp"]:
        ext = "jpg"

    # Keep only alphanumerics, dashes and underscores, and cap the length
    # so the result is filesystem-safe.
    image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))

    image_name = image_name[:50]  # Limit filename length

    # Empty after sanitization (e.g. URL ends in "/"): derive a stable,
    # deterministic hash-based name from the full URL.
    if not image_name:
        image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]

    return f"{image_name}.{ext}"
|
||||||
|
|
||||||
|
|
||||||
|
def push_to_hub_example(dataset_path="alt_text_merged_dataset", repo_id="", token=None):
    """
    Upload a previously saved merged dataset to the Hugging Face Hub.

    Args:
        dataset_path: Local directory written by ``save_dataset``.
        repo_id: Target Hub repository id, e.g. "user/dataset-name".
        token: Hugging Face access token; authentication is required.
    """
    from huggingface_hub import login

    print("\n=== Pushing Dataset to Hugging Face Hub ===")

    # Authenticate with the provided token. Alternatives: login() with no
    # arguments prompts interactively, or set the HF_TOKEN environment
    # variable and login() will pick it up automatically.
    login(token=token)

    # Reload the merged dataset and wrap it as a single "train" split.
    splits = DatasetDict({"train": load_dataset_from_disk(dataset_path)})

    # push_to_hub creates the repository if it does not exist and converts
    # the data to Parquet during upload.
    splits.push_to_hub(
        repo_id,
        private=False,  # flip to True for a private dataset
    )

    print("Dataset pushed successfully!")
    print(f"View at: https://huggingface.co/datasets/{repo_id}")
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataset_from_json(json_filepath, json_filepath_images, images_dir="images"):
    """
    Create a Hugging Face Dataset from JSON file with local images.

    Args:
        json_filepath: Path to JSON file with the MLLM assessment entries
        json_filepath_images: Path to the extracted-images JSON; its entries
            must be parallel (same order, same length) to json_filepath's,
            since they are matched by positional index below
        images_dir: Directory containing the images (default: "images")

    Returns:
        datasets.Dataset object with images loaded
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    with open(json_filepath_images, "r", encoding="utf-8") as f:
        data_images = json.load(f)

    images_path = Path(images_dir)

    # Flatten the nested structure and load images: one parallel list per
    # output column, consumed by Dataset.from_dict at the end.
    flattened_data = {
        "image": [],
        "image_url": [],
        "alt_text": [],
        "original_alt_text_assessment": [],
        "assessment": [],
        "evaluation_result": [],
        "new_alt_text": [],
        "page_url": [],
        "html_context": [],
    }

    # count_entry tracks the position in `data` so that skipped entries still
    # advance the index into the parallel `data_images` list.
    count_entry = 0
    for entry in data:
        if (
            entry["mllm_response"]["original_alt_text_assessment"] is None
        ):  # important! skip entries with no MLLM response. not usable data
            print(
                f"Skipping entry with image URL: {entry['image_url']} due to missing MLLM response"
            )
            count_entry += 1
            continue  # Skip entries with no MLLM response
        image_url = entry["image_url"]
        # Filename must match how the extractor saved the file on disk.
        image_filename = url_to_filename(image_url)
        image_path = images_path / image_filename

        # Load image if it exists; missing files become None cells rather
        # than dropping the whole entry.
        if image_path.exists():
            img = Image.open(image_path)
            flattened_data["image"].append(img)
        else:
            print(f"Warning: Image not found: {image_path}")
            flattened_data["image"].append(None)

        flattened_data["image_url"].append(image_url)
        flattened_data["alt_text"].append(entry["alt_text"])
        # Stringified because the raw value may not be a string.
        flattened_data["original_alt_text_assessment"].append(
            str(entry["mllm_response"]["original_alt_text_assessment"])
        )
        flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
        flattened_data["evaluation_result"].append(
            entry["mllm_response"]["evaluation_result"]
        )
        flattened_data["new_alt_text"].append(entry["mllm_response"]["new_alt_text"])
        # NOTE(review): positional join — assumes data_images[i] describes
        # data[i]; confirm both JSON files are written in the same order.
        flattened_data["page_url"].append(data_images[count_entry]["page_url"])
        flattened_data["html_context"].append(data_images[count_entry]["html_context"])

        count_entry += 1

    print(f"Total valid entries loaded: {len(flattened_data['image_url'])}")
    return datasets.Dataset.from_dict(flattened_data)
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataset_from_folders(
    ref_path,
    json_filename="mllm_alttext_assessments.json",
    json_filename_images="extracted_images.json",
    images_dirname="images",
):
    """
    Build one merged dataset from every result folder under ref_path.

    Each usable sub-folder must contain the assessments JSON and the
    extracted-images JSON; an images sub-directory is optional (URL-only runs).

    Args:
        ref_path: Root path containing multiple result folders.
        json_filename: Name of the assessments JSON in each folder.
        json_filename_images: Name of the extracted-images JSON in each folder.
        images_dirname: Name of the images sub-directory in each folder.

    Returns:
        datasets.Dataset with the entries of all folders concatenated.

    Raises:
        ValueError: If no folder could be loaded successfully.
    """
    root = Path(ref_path)
    loaded = []
    ok_count = 0

    for sub in root.iterdir():
        if not sub.is_dir():
            continue

        assessments_path = sub / json_filename
        extracted_path = sub / json_filename_images
        img_dir = sub / images_dirname

        # Both JSON files are mandatory for a folder to be usable.
        if not assessments_path.exists():
            print(f"Skipping {sub.name}: no {json_filename} found")
            continue

        if not extracted_path.exists():
            print(f"Skipping {sub.name}: no {json_filename_images} found")
            continue

        if not img_dir.exists():
            # Not fatal: images may be referenced by URL only.
            print(f"Warning: {sub.name}: images directory not found")

        print(f"Processing folder: {sub.name}")

        try:
            folder_ds = create_dataset_from_json(
                str(assessments_path), str(extracted_path), str(img_dir)
            )
            loaded.append(folder_ds)

            ok_count += 1
            print(f" -> Loaded {len(folder_ds)} entries")
        except Exception as e:
            print(f"Error processing {sub.name}: {e}")
            continue

    if not loaded:
        raise ValueError(f"No valid folders found in {root}")

    # Concatenate the per-folder datasets into one.
    print(f"\n=== Merging {ok_count} folders ===")
    merged = datasets.concatenate_datasets(loaded)
    print(f"Total entries: {len(merged)}")

    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def verify_images(json_filepath, images_dir="images"):
    """
    Check that every image referenced by the assessments JSON exists on disk.

    Args:
        json_filepath: Path to the assessments JSON file.
        images_dir: Directory expected to hold the downloaded images.

    Returns:
        Dict with 'found', 'missing' and 'total' counts plus per-image 'details'.
    """
    with open(json_filepath, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    base_dir = Path(images_dir)

    present = []
    absent = []

    for record in entries:
        url = record["image_url"]
        # Reproduce the extractor's naming scheme to locate the file.
        fname = url_to_filename(url)
        fpath = base_dir / fname
        print("image_url:", url, "image_filename:", fname, "image_path:", fpath)

        if fpath.exists():
            present.append({"url": url, "filename": fname, "path": str(fpath)})
        else:
            absent.append(
                {"url": url, "filename": fname, "expected_path": str(fpath)}
            )

    return {
        "found": len(present),
        "missing": len(absent),
        "total": len(entries),
        "details": {"found_images": present, "missing_images": absent},
    }
|
||||||
|
|
||||||
|
|
||||||
|
def verify_images_in_folders(
    ref_path, json_filename="mllm_alttext_assessments.json", images_dirname="images"
):
    """
    Run verify_images over every sub-folder of ref_path and aggregate counts.

    Args:
        ref_path: Root directory whose sub-folders each hold one crawl result.
        json_filename: Assessments JSON expected inside each sub-folder.
        images_dirname: Images sub-directory expected inside each sub-folder.

    Returns:
        Dict with overall 'found'/'missing'/'total' counts and per-folder
        results under 'folders'.
    """
    root = Path(ref_path)
    total_found = 0
    total_missing = 0
    total_entries = 0
    per_folder = {}

    for sub in root.iterdir():
        if not sub.is_dir():
            continue

        json_path = sub / json_filename
        images_path = sub / images_dirname

        # Folders without the assessments JSON are silently skipped.
        if not json_path.exists():
            continue

        print(f"Verifying folder: {sub.name}")

        try:
            outcome = verify_images(str(json_path), str(images_path))
            per_folder[sub.name] = outcome

            total_found += outcome["found"]
            total_missing += outcome["missing"]
            total_entries += outcome["total"]

            print(f" Found: {outcome['found']}/{outcome['total']}")
        except Exception as e:
            print(f" Error: {e}")
            continue

    return {
        "found": total_found,
        "missing": total_missing,
        "total": total_entries,
        "folders": per_folder,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def save_dataset(dataset, output_path):
    """Persist *dataset* to *output_path* in Arrow format (images included).

    Other export options exist but are not used here:
    ``dataset.to_json(...)``, ``dataset.to_csv(...)``,
    ``dataset.to_parquet(...)``.
    """
    dataset.save_to_disk(output_path)
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset_from_disk(dataset_path):
    """Reload a dataset previously written with ``save_dataset``."""
    reloaded = datasets.load_from_disk(dataset_path)
    return reloaded
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# EXAMPLE USAGE
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point: verify downloaded images, build the merged HF dataset,
    # run a few sanity checks, save it locally and optionally push to the Hub.

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--ref_path",
        type=str,
        help=("Root path containing multiple folders"),
        default="",
    )

    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        default=False,
        help=("If True push the merged dataset to Hugging Face Hub"),
    )
    parser.add_argument(
        "--token",
        type=str,
        help=("Hugging Face authentication token"),
        default="",
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        help=("Hugging Face repository ID"),
        default="nicolaleo/LLM-alt-text-assessment",
    )
    args = parser.parse_args()

    # Step 1: verify that every image referenced by the JSONs exists on disk.
    print("=== Verifying Images in All Folders ===")
    verification = verify_images_in_folders(args.ref_path)
    print("\n######## Verifier output ################################")
    print(f"Total Found: {verification['found']}/{verification['total']}")
    print(f"Total Missing: {verification['missing']}/{verification['total']}")
    print("########################################")

    # Per-folder breakdown of the verification results.
    print("\n=== Per-Folder Breakdown ===")
    for folder_name, results in verification["folders"].items():
        print(f"{folder_name}: {results['found']}/{results['total']} images found")

    # Step 2: build one merged dataset from all result folders.
    print("\n=== Creating Merged Dataset ===")
    ds = create_dataset_from_folders(args.ref_path)
    print("\n######## Merged Dataset output ################################")
    print(f"Final dataset size: {len(ds)} entries")
    print("########################################")

    # Step 3: print the dataset schema/summary.
    print("\n=== Dataset Analysis ===")
    print(ds)

    # Step 4: inspect the first example (image + metadata access).
    print("\n=== First Example ===")
    first_example = ds[0]
    print(f"Image URL: {first_example['image_url']}")
    print(f"Alt text: {first_example['alt_text']}")
    print(f"Assessment: {first_example['assessment']}")
    print(f"New alt text: {first_example['new_alt_text']}")
    print(f"Image loaded: {first_example['image'] is not None}")

    if first_example["image"] is not None:
        img = first_example["image"]
        print(f"Image size: {img.size}")
        # img.show()  # Uncomment to display image

    # Step 5: example filters over the merged data.
    print("\n=== Filtering Merged Dataset ===")
    successful = ds.filter(lambda x: x["assessment"] == "success")
    print(f"Successful assessments: {len(successful)}")

    # NOTE(review): assumes original_alt_text_assessment always parses as an
    # int — confirm against the MLLM response format.
    high_rated = ds.filter(lambda x: int(x["original_alt_text_assessment"]) >= 4)
    print(f"High-rated (>=4): {len(high_rated)}")

    # Step 6: save the merged dataset to disk (Arrow format).
    print("\n=== Saving Merged Dataset ===")
    save_dataset(ds, "alt_text_merged_dataset")

    # Step 7: reload it as a round-trip check.
    print("\n=== Loading Dataset ===")
    loaded_ds = load_dataset_from_disk("alt_text_merged_dataset")
    print(f"Loaded {len(loaded_ds)} entries")

    if args.push_to_hub:
        # Optionally publish the merged dataset to the Hugging Face Hub.
        push_to_hub_example(repo_id=args.repo_id, token=args.token)  # function below for details
|
||||||
|
|
@ -5,4 +5,5 @@ transformers==4.57.1
|
||||||
numpy==2.2.6
|
numpy==2.2.6
|
||||||
matplotlib==3.10.7
|
matplotlib==3.10.7
|
||||||
scikit-learn==1.7.2
|
scikit-learn==1.7.2
|
||||||
sentence-transformers==5.1.2
|
sentence-transformers==5.1.2
|
||||||
|
datasets==4.4.1
|
||||||
Loading…
Reference in New Issue