diff --git a/UI/.env b/UI/.env index 1b0e7f3..10f0839 100644 --- a/UI/.env +++ b/UI/.env @@ -1,4 +1,5 @@ DB_PATH=persistence/wcag_validator_ui.db WCAG_REST_SERVER_URL=http://localhost:8000 URL_LIST_old=["http://www.amazon.it","https://web.archive.org/web/20230630235957/http://www.amazon.com/", "https://web.archive.org/web/20251130033532/https://www.ebay.com/"] -URL_LIST=["https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://bestbuy.com","https://macys.com","https://homedepot.com","https://costco.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.ansa.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"] \ No newline at end of file +URL_LIST_old=["https://www.amazon.com/s?k=magllioni&crid=CGD2UWO33O58&sprefix=magllioni%2Caps%2C209&ref=nb_sb_noss","https://web.archive.org/web/20251011214807/https://www.ilfattoquotidiano.it/","https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"] +URL_LIST=["https://giove.isti.cnr.it/users/manca/eBay.html","http://www.amazon.it"] \ No newline at end of file diff --git a/UI/wcag_validator_ui.py b/UI/wcag_validator_ui.py index edfb8f5..7cfacba 100644 --- a/UI/wcag_validator_ui.py +++ b/UI/wcag_validator_ui.py @@ -31,6 +31,45 @@ import sqlite3 WCAG_VALIDATOR_RESTSERVER_HEADERS = [("Content-Type", "application/json")] +def process_dataframe(db_path, url, updated_df, user_state={}): + + print("Processing dataframe to adjust columns...") + column_rating_name = "User Assessment for LLM Proposal" + + # Get the assessment column + try: + updated_df[column_rating_name] = 
updated_df[column_rating_name].astype(int) + except ValueError: + return "Error: User Assessment for LLM Proposal must be an integer" + + if (updated_df[column_rating_name] < 1).any() or ( + updated_df[column_rating_name] > 5 + ).any(): + return "Error: User Assessment for LLM Proposal must be between 1 and 5" + + dataframe_json = updated_df.to_json(orient="records") + connection_db = sqlite3.connect(db_path) + json_user_str = json.dumps({"username": user_state["username"]}, ensure_ascii=False) + try: + # insert after everything to keep datetime aligned + db_persistence_insert( + connection_db=connection_db, + insert_type="wcag_user_llm_alttext_assessments", + page_url=url, + user=json_user_str, + llm_model="", + json_in_str=dataframe_json, # to improve + json_out_str="done via UI", + table="wcag_user_assessments", + ) + except Exception as e: + print("Error inserting user assessment into database:", str(e)) + finally: + if connection_db: + connection_db.close() + return "User assessment saved successfully!" 
+ + def load_images_from_json(json_input): """Extract URLs and alt text from JSON and create HTML gallery""" try: @@ -40,7 +79,7 @@ def load_images_from_json(json_input): return "No images found in JSON", "" images = data["images"] - info_text = f"Found {len(images)} image(s)\n" + info_text = f"Found {len(images)} image(s)" print(f"Found {len(data['images'])} image(s)") # Create HTML gallery with checkboxes and assessment forms @@ -58,14 +97,14 @@ def load_images_from_json(json_input): padding: 10px; background: white; } - .image-card:has(input:checked) { + .image-card:has(input[type="checkbox"]:checked) { border-color: #2196F3; background: #a7c1c1; } .image-card img { width: 100%; height: 200px; - object-fit: cover; + object-fit: scale-down; border-radius: 4px; } .image-info { @@ -93,7 +132,7 @@ def load_images_from_json(json_input): display: none; margin-top: 15px; padding: 10px; - background: #f0f7ff; + background: #7896b9; border-radius: 4px; border: 1px solid #2196F3; } @@ -109,18 +148,22 @@ def load_images_from_json(json_input): margin-bottom: 5px; font-size: 13px; } - .range-container { + + .radio-container { + display: flex; + gap: 15px; + align-items: center; + } + + .radio-option { display: flex; align-items: center; - gap: 10px; + gap: 5px; + cursor: pointer; } - .range-container input[type="range"] { - flex: 1; - } - .range-value { - font-weight: bold; - min-width: 20px; - text-align: center; + + .radio-label { + font-weight: 500; } textarea { width: 100%; @@ -166,12 +209,28 @@ def load_images_from_json(json_input):
-
- - 3 -
+
+ + + + + +
@@ -226,7 +285,7 @@ def load_llm_assessment_from_json(json_input): { "Original Alt Text": alt_text_original, "LLM Assessment": original_alt_text_assessment, - "Proposed Alt Text": new_alt_text, + "LLM Proposed Alt Text": new_alt_text, } ) @@ -257,7 +316,7 @@ def make_alttext_llm_assessment_api_call( if not selected_images or len(selected_images) == 0: info_text = "No images selected" print(info_text) - return pd.DataFrame() + return "LLM assessment not started", pd.DataFrame() # prepare data for insertion json_in_str = {} @@ -267,6 +326,7 @@ def make_alttext_llm_assessment_api_call( user_assessments = [] user_new_alt_texts = [] selected_image_id = [] + user_assessments_llm_proposal = [] for img in selected_images: selected_urls.append(img["image_url"]) selected_alt_text_original.append(img["original_alt_text"]) @@ -275,6 +335,7 @@ def make_alttext_llm_assessment_api_call( selected_image_id.append( int(img["image_index"]) + 1 ) # add the id selected (+1 for index alignment) + user_assessments_llm_proposal.append(3) # default value for now json_in_str["images_urls"] = selected_urls json_in_str["images_alt_text_original"] = selected_alt_text_original json_out_str["user_assessments"] = user_assessments @@ -302,9 +363,17 @@ def make_alttext_llm_assessment_api_call( ) # return response info_dataframe = load_llm_assessment_from_json(response) + + # add the UI ids and other fields to to api response info_dataframe.insert( 0, "Image #", selected_image_id ) # add the UI ids from to api response + info_dataframe.insert(2, "User Assessment", user_assessments) + + info_dataframe.insert(3, "User Proposed Alt Text", user_new_alt_texts) + info_dataframe["User Assessment for LLM Proposal"] = ( + user_assessments_llm_proposal + ) except Exception as e: return {"error": str(e)} @@ -326,7 +395,7 @@ def make_alttext_llm_assessment_api_call( finally: if connection_db: connection_db.close() - return info_dataframe + return "LLM assessment completed", info_dataframe def 
make_image_extraction_api_call( @@ -449,9 +518,10 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo: images_number = gr.Slider( 5, 100, - value=30, + value=50, step=5, label="Max number of images to retrieve", + visible=False, ) with gr.Column(): @@ -459,39 +529,54 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo: "Extract Images & Alt Texts", variant="primary" ) alttext_api_call_btn = gr.Button( - "Alt Text LLM Assessment", + "Start LLM Assessment", variant="secondary", interactive=False, ) + image_info_output = gr.Textbox( + label="Activity tracking", lines=1 + ) - with gr.Row(): - - image_info_output = gr.Textbox(label="Managed Images", lines=5) + with gr.Row(visible=False) as alttext_results_row: # Use DataFrame for tabular output alttext_info_output = gr.DataFrame( headers=[ "Image #", "Original Alt Text", + "User Assessment", + "User Proposed Alt Text", "LLM Assessment", - "Proposed Alt Text", + "LLM Proposed Alt Text", + "User Assessment for LLM Proposal", ], label="LLM Assessment Results", wrap=True, # Wrap text in cells - interactive=False, + interactive=True, + scale=7, ) + with gr.Column(): + save_user_assessment_btn = gr.Button( + "Save Your Assessment", + variant="secondary", + interactive=True, + scale=1, + ) + gr.Markdown( + "ℹ Info: to assess the LLM output, only the values ​​for the 'User Assessment for LLM Proposal' column need to be changed." 
+ ) with gr.Row(): gallery_html = gr.HTML(label="Image Gallery") image_extraction_api_call_btn.click( - fn=lambda: ("", "", pd.DataFrame(), gr.Button(interactive=False)), + fn=lambda: ("", "", gr.update(visible=False), gr.Button(interactive=False)), inputs=[], outputs=[ image_info_output, gallery_html, - alttext_info_output, + alttext_results_row, alttext_api_call_btn, ], ).then( @@ -515,7 +600,7 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo: wcag_rest_server_url_state, user_state, ], - outputs=[alttext_info_output], + outputs=[image_info_output, alttext_info_output], js=""" (url_input,gallery_html) => { const checkboxes = document.querySelectorAll('.image-checkbox:checked'); @@ -533,7 +618,8 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo: const index = checkbox.dataset.index; const imageUrl = checkbox.dataset.imgurl; const originalAlt = document.querySelector('.original-alt[data-index="' + index + '"]').value; - const assessment = document.querySelector('.assessment-range[data-index="' + index + '"]').value; + const assessment = document.querySelector('input[name="assessment-' + index + '"]:checked').value; + console.log("assessment:",assessment) const newAltText = document.querySelector('.new-alt-text[data-index="' + index + '"]').value; selectedData.push({ @@ -548,6 +634,16 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo: return [url_input,JSON.stringify(selectedData)]; } """, + ).then( + fn=lambda: gr.update(visible=True), + inputs=[], + outputs=[alttext_results_row], + ) + + save_user_assessment_btn.click( + fn=process_dataframe, + inputs=[db_path_state, url_input, alttext_info_output, user_state], + outputs=[image_info_output], ) # placed here at the end to give full contents visibility to events diff --git a/dependences/image_extractor.py b/dependences/image_extractor.py index 357e41f..eaff27f 100644 --- a/dependences/image_extractor.py +++ 
b/dependences/image_extractor.py @@ -55,7 +55,6 @@ class ImageExtractor: # Also check query parameters (e.g., format=jpeg) return any(fmt in img_url.lower() for fmt in self.SUPPORTED_FORMATS) - async def _download_image(self, image_url, output_dir="images") -> None: # Parse the URL to get the path without query parameters @@ -79,7 +78,7 @@ class ImageExtractor: # Sanitize image name (remove special characters, limit length) image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_")) - image_name = image_name[:200] # Limit filename length + image_name = image_name[:50] # Limit filename length # If name is empty after sanitization, create a hash-based name if not image_name: @@ -88,13 +87,15 @@ class ImageExtractor: image_name = hashlib.md5(image_url.encode()).hexdigest()[:16] # Download the image - print("getting image:", image_url) + print("getting image url:", image_url) + print("getting image name:", image_name) response = requests.get(image_url, timeout=10) response.raise_for_status() try: # Save the image output_path = os.path.join(output_dir, f"{image_name}.{ext}") + print("saving image to:", output_path) with open(output_path, "wb") as f: f.write(response.content) print(f"Saved: {output_path}") @@ -292,43 +293,36 @@ class ImageExtractor: error_msg = f"Error extracting context: {str(e)}" return error_msg, error_msg, error_msg - async def _get_page_metadata(self, page) -> Dict[str, Optional[str]]: - """Extract page metadata including title, description, and keywords.""" - metadata = { - "title": await page.title(), - "description": None, - "keywords": None, - "headings": [], - } + async def _get_page_metadata(self, page): + """Extract page metadata in one fast evaluate call. 
Batch DOM extraction inside one evaluate().""" + return await page.evaluate( + """ + () => { + const metadata = { + title: document.title || null, + description: null, + keywords: null, + headings: [] + }; - # Extract meta description - try: - description = await page.locator('meta[name="description"]').get_attribute( - "content" - ) - metadata["description"] = description - except: - pass + const desc = document.querySelector('meta[name="description"]'); + const keys = document.querySelector('meta[name="keywords"]'); + metadata.description = desc?.content || null; + metadata.keywords = keys?.content || null; - # Extract meta keywords - try: - keywords = await page.locator('meta[name="keywords"]').get_attribute( - "content" - ) - metadata["keywords"] = keywords - except: - pass + // Collect all headings h1–h6 + const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6'); + metadata.headings = Array.from(allHeadings) + .map(h => ({ + level: parseInt(h.tagName.substring(1), 10), + text: h.textContent.trim() + })) + .filter(h => h.text.length > 0); - # Extract all headings (h1-h6) - for level in range(1, 7): - headings = await page.locator(f"h{level}").all_text_contents() - for heading in headings: - if heading.strip(): - metadata["headings"].append( - {"level": level, "text": heading.strip()} - ) - - return metadata + return metadata; + } + """ + ) async def extract_images( self, extract_context=True, specific_images_urls=[] @@ -344,15 +338,18 @@ class ImageExtractor: page = await browser.new_page() try: - #await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads - # The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior. 
+ # await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads + # The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior. # ---alternative method2: use if there is total awareness of the page's loading pattern and want faster, more reliable execution - await page.goto(self.url, timeout=50000, wait_until="load")# deafult timeout=30000, 30sec + await page.goto( + self.url, timeout=50000, wait_until="load" + ) # deafult timeout=30000, 30sec # Wait for page to load completely await page.wait_for_timeout(2000) # Wait for dynamic content # ----- if extract_context: + print("Getting page metadata...") # Get page metadata once page_metadata = await self._get_page_metadata(page) page_title = page_metadata["title"] @@ -367,15 +364,41 @@ class ImageExtractor: if len(specific_images_urls) == 0: # Find all img elements - print("Extracting all images from the page",self.url) - img_elements = await page.locator("img").all() + print("Extracting all images from the page", self.url) + # img_elements = await page.locator("img").all() else: print( "Extracting specific images from the page:", self.url, specific_images_urls, ) - img_elements = [] + # img_elements = await page.locator("img").all() + + """ # method 3: optimized approach + # Get all src attributes in one go + all_img_elements = await page.locator("img").all() + all_srcs = await page.locator("img").evaluate_all( + "elements => elements.map(el => el.src || '')" + ) + + # Filter with the pre-fetched src values + img_elements = [ + elem for elem, src in zip(all_img_elements, all_srcs) + if src in specific_images_urls + ] + """ + + """ #method 2: single pass to find matching images + for img_element in all_img_elements: #This is more efficient than making separate locator queries for each specific URL and avoids timeout issues. 
+ try: + src = await img_element.get_attribute("src") + print("found image src:", src) + if src in specific_images_urls: + img_elements.append(img_element) + except Exception as e: + print(f"Error getting src attribute from image: {str(e)}")""" + + """ # method 1: separate locator queries for each specific URL for url in specific_images_urls: try: img_element = await page.locator( @@ -384,8 +407,11 @@ class ImageExtractor: if img_element: img_elements.append(img_element) except Exception as e: - print(f"Error locating image with src {url}: {str(e)}") + print(f"Error locating image with src {url}: {str(e)}")""" + img_elements = await page.locator( + "img" + ).all() # unified approach to start with all images and filter later image_source_list = [] # avoid multiple check for the same image url images_data = [] @@ -404,6 +430,12 @@ class ImageExtractor: if not src: print("image has no src attribute. Skipped.") continue + if ( + src not in specific_images_urls + and len(specific_images_urls) > 0 + ): + # print("image src",src,"not in the specific images list. Skipped.") + continue if src not in image_source_list: image_source_list.append(src) @@ -434,6 +466,7 @@ class ImageExtractor: alt_text = await img.get_attribute("alt") or "" if extract_context: + print("Extracting context for image:", img_url) # Get surrounding HTML context (full, immediate, and nearby) html_context, immediate_context, nearby_text = ( await self._get_element_context(page, img) diff --git a/dependences/mllm_management.py b/dependences/mllm_management.py index 1aa4f2f..5aec00b 100644 --- a/dependences/mllm_management.py +++ b/dependences/mllm_management.py @@ -76,39 +76,7 @@ class MLLMManager: return payload def get_alt_text_system_prompt(self): - system_prompt_old = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for - images on webpages are appropriate according to WCAG guidelines. 
The alt-text should serve the same purpose and present - the same information as the image, and should be able to substitute for the non-text content. The text alternative would - be brief but as informative as possible. - - Follow these instructions carefully: - 1. You will be provided as input with the following: - - The image found on the webpage. - - The associated alternative text. When the alt-text is empty or absent, you will be explicitly informed. - - The surrounding context of the image. - - The page title, headings and the content of the “keywords” and “description” tag, if found. - - 2. Determine the function and purpose of the image by analyzing these elements. Take into account the purpose and function - of the associated image by considering the page context. Check also if the image is, or is associated with, a link or a button, - and consider this in your judgement. If the image contains text use that as part of the context. - - 3. Provide a final assessment based on the following: - - 'success' if you can assess with 'sufficient certainty' the alt-text is appropriate in relation to the image purpose, - - 'failure' if you can assess with 'sufficient certainty' that the alt-text is NOT appropriate, - - 'warning' if you cannot determine with 'sufficient certainty'. - where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80 - - 4. The original alt-text assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only. - - 5. Provide a brief reasoning for your judgment. If the image contains text, write it verbatim. Your response should be in English. - - 6. Keep your response within 150 words. - - 7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. - - 8. 
Here is the JSON format the results must have: - {"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}""" - + # https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present @@ -122,7 +90,7 @@ class MLLMManager: What purpose does it fulfill? If I could not use the image content, what words would I use to convey the same function and/or information? - When image content contains words that are important to understanding the content, the alt text should include those words + When image content contains words that are important to understanding the content, the alt text should include those words. Follow these instructions carefully: 1. You will be provided as input with the following: @@ -147,7 +115,7 @@ class MLLMManager: 6. Keep your response within 150 words. - 7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. + 7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. Use the same language as the original alt-text. 8. 
Here is the JSON format the results must have: {"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}""" @@ -181,7 +149,7 @@ class MLLMManager: print("Using end_point:", self.end_point) alt_text_system_prompt = self.get_alt_text_system_prompt() - print("alt_text_system_prompt:", alt_text_system_prompt) + #print("alt_text_system_prompt:", alt_text_system_prompt) mllm_responses = [] for img_info in images: diff --git a/restserver/routers/routes_extract_images.py b/restserver/routers/routes_extract_images.py index e173667..914948b 100644 --- a/restserver/routers/routes_extract_images.py +++ b/restserver/routers/routes_extract_images.py @@ -46,6 +46,7 @@ class ExtractImagesRoutes: self, request: Request, data: ExtractImages ) -> JSONResponse: """Return the alt text validation assessment based on WCAG guidelines""" + print("Received extract images request.") try: json_content = json.loads(data.model_dump_json()) diff --git a/restserver/routers/routes_wcag_alttext.py b/restserver/routers/routes_wcag_alttext.py index 15afa9e..ac8633f 100644 --- a/restserver/routers/routes_wcag_alttext.py +++ b/restserver/routers/routes_wcag_alttext.py @@ -53,6 +53,7 @@ class WCAGAltTextValuationRoutes: ) -> JSONResponse: """Return the alt text validation assessment based on WCAG guidelines""" try: + print("Received wcag alttext validation request.") json_content = json.loads(data.model_dump_json()) mllm_model_id = self.mllm_settings["mllm_model_id"] @@ -67,7 +68,12 @@ class WCAGAltTextValuationRoutes: .replace(":", "") .replace("//", "_") .replace("/", "_") + .replace("%2", "_") + .replace("?", "_") + .replace("=", "_") + .replace("&", "_") ) + url_path=url_path[:50] # limit length now = datetime.now(timezone.utc) now_str = now.strftime("%Y_%m_%d-%H_%M_%S") folder_str = mllm_model_id.replace(":", "-") + "_" + now_str @@ -93,7 +99,7 @@ class 
WCAGAltTextValuationRoutes: # Extract images logging.info(f"Extracting images from: {json_content['page_url']}") images = await image_extractor.extract_images( - specific_images_urls=json_content["specific_images_urls"] + specific_images_urls=json_content["specific_images_urls"],extract_context=True ) # MLLM settings mllm_end_point = self.mllm_settings["mllm_end_point"] diff --git a/scripts/build_dataset_from_folder.py b/scripts/build_dataset_from_folder.py new file mode 100644 index 0000000..4839c8b --- /dev/null +++ b/scripts/build_dataset_from_folder.py @@ -0,0 +1,541 @@ +# to launch: python build_dataset_from_folder.py --ref_path "" --push_to_hub --repo_id "nicolaleo/LLM-alt-text-assessment" --token "" + +from datasets import Dataset, DatasetDict +import datasets +import json +from pathlib import Path +from PIL import Image +import hashlib +import urllib.parse +import argparse + + +''' +# Dataset metadata +_DESCRIPTION = """\ +Dataset for image alt-text assessment and improvement using MLLM responses. +Contains images, original alt-texts, quality assessments, and improved versions. 
+""" + +_CITATION = """\ +@misc{alt_text_assessment, + title={Alt-Text Assessment Dataset}, + year={2024} +} +""" + + + + + +class AltTextDataset(datasets.GeneratorBasedBuilder): + """Dataset for alt-text assessment with images and MLLM responses.""" + + VERSION = datasets.Version("1.0.0") + + def _info(self): + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=datasets.Features({ + "image": datasets.Image(), + "image_url": datasets.Value("string"), + "alt_text": datasets.Value("string"), + "original_alt_text_assessment": datasets.Value("string"), + "assessment": datasets.Value("string"), + "evaluation_result": datasets.Value("string"), + "new_alt_text": datasets.Value("string"), + #"source_folder": datasets.Value("string"), + }), + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Define data splits.""" + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "json_filepath": "data.json", + "images_dir": "images" + }, + ), + ] + + def _generate_examples(self, json_filepath, images_dir): + """Generate examples from JSON file and image directory.""" + with open(json_filepath, encoding="utf-8") as f: + data = json.load(f) + + images_path = Path(images_dir) + + for idx, entry in enumerate(data): + image_url = entry["image_url"] + image_filename = url_to_filename(image_url) + image_path = images_path / image_filename + + # Load image if exists, otherwise None + image = str(image_path) if image_path.exists() else None + + yield idx, { + "image": image, + "image_url": image_url, + "alt_text": entry["alt_text"], + "original_alt_text_assessment": entry["mllm_response"]["original_alt_text_assessment"], + "assessment": entry["mllm_response"]["assessment"], + "evaluation_result": entry["mllm_response"]["evaluation_result"], + "new_alt_text": entry["mllm_response"]["new_alt_text"], + } + +''' +# ============================================================================ +# SIMPLE USAGE FUNCTIONS +# 
============================================================================ + + +def url_to_filename(image_url): # save step as in the image_extractor dependence + """ + Convert image URL to sanitized filename following your exact logic. + + Args: + image_url: The image URL + + Returns: + Sanitized filename with extension + """ + + # Parse the URL to get the path without query parameters + parsed_url = urllib.parse.urlparse(image_url) + url_path = parsed_url.path + + # Get the filename from the path + filename = url_path.split("/")[-1] + print(f"Original filename: '{filename}'") + + # Split filename and extension + if "." in filename: + image_name, ext = filename.rsplit(".", 1) + ext = ext.lower() + else: + image_name = filename + ext = "jpg" + + # Validate extension + if ext not in ["jpg", "jpeg", "png", "gif", "webp"]: + ext = "jpg" + + # Sanitize image name (remove special characters, limit length) + image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_")) + + image_name = image_name[:50] # Limit filename length + + # If name is empty after sanitization, create a hash-based name + if not image_name: + image_name = hashlib.md5(image_url.encode()).hexdigest()[:16] + + return f"{image_name}.{ext}" + + +def push_to_hub_example(dataset_path="alt_text_merged_dataset", repo_id="",token=None): + """ + Example of how to push dataset to Hugging Face Hub. + You need to authenticate first! 
+ """ + from huggingface_hub import login + + print("\n=== Pushing Dataset to Hugging Face Hub ===") + # Method 1: Login interactively (will prompt for token) + # login() + + # Method 2: Login with token directly + login(token=token) + + # Method 3: Set token as environment variable + # export HF_TOKEN="hf_YourTokenHere" + # Then login() will use it automatically + + # Load your dataset + ds = load_dataset_from_disk(dataset_path) + + # Combine into DatasetDict + ds = DatasetDict( + { + "train": ds, + # #"test": test_dataset + } + ) + + # Push to hub (creates repo if it doesn't exist) + ds.push_to_hub( # Automatically converts to Parquet when uploading to Hub + repo_id, # Replace with your username + private=False, # Set True for private dataset + ) + + print("Dataset pushed successfully!") + print(f"View at: https://huggingface.co/datasets/{repo_id}") + + +def create_dataset_from_json(json_filepath, json_filepath_images, images_dir="images"): + """ + Create a Hugging Face Dataset from JSON file with local images. + + Args: + json_filepath: Path to JSON file with your data structure + images_dir: Directory containing the images (default: "images") + + Returns: + datasets.Dataset object with images loaded + """ + with open(json_filepath, "r", encoding="utf-8") as f: + data = json.load(f) + + with open(json_filepath_images, "r", encoding="utf-8") as f: + data_images = json.load(f) + + images_path = Path(images_dir) + + # Flatten the nested structure and load images + flattened_data = { + "image": [], + "image_url": [], + "alt_text": [], + "original_alt_text_assessment": [], + "assessment": [], + "evaluation_result": [], + "new_alt_text": [], + "page_url": [], + "html_context": [], + } + + count_entry = 0 + for entry in data: + if ( + entry["mllm_response"]["original_alt_text_assessment"] is None + ): # important! skip entries with no MLLM response. 
not usable data + print( + f"Skipping entry with image URL: {entry['image_url']} due to missing MLLM response" + ) + count_entry += 1 + continue # Skip entries with no MLLM response + image_url = entry["image_url"] + image_filename = url_to_filename(image_url) + image_path = images_path / image_filename + + # Load image if it exists + if image_path.exists(): + img = Image.open(image_path) + flattened_data["image"].append(img) + else: + print(f"Warning: Image not found: {image_path}") + flattened_data["image"].append(None) + + flattened_data["image_url"].append(image_url) + flattened_data["alt_text"].append(entry["alt_text"]) + flattened_data["original_alt_text_assessment"].append( + str(entry["mllm_response"]["original_alt_text_assessment"]) + ) + flattened_data["assessment"].append(entry["mllm_response"]["assessment"]) + flattened_data["evaluation_result"].append( + entry["mllm_response"]["evaluation_result"] + ) + flattened_data["new_alt_text"].append(entry["mllm_response"]["new_alt_text"]) + flattened_data["page_url"].append(data_images[count_entry]["page_url"]) + flattened_data["html_context"].append(data_images[count_entry]["html_context"]) + + count_entry += 1 + + print(f"Total valid entries loaded: {len(flattened_data['image_url'])}") + return datasets.Dataset.from_dict(flattened_data) + + +def create_dataset_from_folders( + ref_path, + json_filename="mllm_alttext_assessments.json", + json_filename_images="extracted_images.json", + images_dirname="images", +): + """ + Create a merged dataset from multiple folders under ref_path. + Each folder should contain a JSON file and an images subdirectory. 
+ + Args: + ref_path: Root path containing multiple folders + json_filename: Name of JSON file in each folder (default: "data.json") + images_dirname: Name of images subdirectory (default: "images") + + Returns: + datasets.Dataset object with all entries merged + """ + ref_path = Path(ref_path) + all_datasets = [] + + # Find all subdirectories containing the JSON file + folders_processed = 0 + + for folder in ref_path.iterdir(): + if not folder.is_dir(): + continue + + json_path = folder / json_filename + json_path_images = folder / json_filename_images + images_path = folder / images_dirname + + # Check if both JSON and images directory exist + if not json_path.exists(): + print(f"Skipping {folder.name}: no {json_filename} found") + continue + + if not json_path_images.exists(): + print(f"Skipping {folder.name}: no {json_filename_images} found") + continue + + if not images_path.exists(): + print(f"Warning: {folder.name}: images directory not found") + # continue + # Continue anyway, images might be optional (from urls only) + + print(f"Processing folder: {folder.name}") + + try: + # Create dataset for this folder + ds = create_dataset_from_json( + str(json_path), str(json_path_images), str(images_path) + ) + all_datasets.append(ds) + + folders_processed += 1 + print(f" -> Loaded {len(ds)} entries") + except Exception as e: + print(f"Error processing {folder.name}: {e}") + continue + + if not all_datasets: + raise ValueError(f"No valid folders found in {ref_path}") + + # Merge all datasets + print(f"\n=== Merging {folders_processed} folders ===") + merged_dataset = datasets.concatenate_datasets(all_datasets) + print(f"Total entries: {len(merged_dataset)}") + + return merged_dataset + + +def verify_images(json_filepath, images_dir="images"): + """ + Verify that all images referenced in JSON exist in the images directory. 
+ + Args: + json_filepath: Path to JSON file + images_dir: Directory containing images + + Returns: + Dict with 'found', 'missing', 'total', and 'details' keys + """ + with open(json_filepath, "r", encoding="utf-8") as f: + data = json.load(f) + + images_path = Path(images_dir) + + found = [] + missing = [] + + for entry in data: + image_url = entry["image_url"] + image_filename = url_to_filename(image_url) + image_path = images_path / image_filename + print( + "image_url:", + image_url, + "image_filename:", + image_filename, + "image_path:", + image_path, + ) + + if image_path.exists(): + found.append( + {"url": image_url, "filename": image_filename, "path": str(image_path)} + ) + else: + missing.append( + { + "url": image_url, + "filename": image_filename, + "expected_path": str(image_path), + } + ) + + return { + "found": len(found), + "missing": len(missing), + "total": len(data), + "details": {"found_images": found, "missing_images": missing}, + } + + +def verify_images_in_folders( + ref_path, json_filename="mllm_alttext_assessments.json", images_dirname="images" +): + """ + Verify images across all folders under ref_path. 
+ + Args: + ref_path: Root path containing multiple folders + json_filename: Name of JSON file in each folder + images_dirname: Name of images subdirectory + + Returns: + Dict with aggregated verification results + """ + ref_path = Path(ref_path) + total_found = 0 + total_missing = 0 + total_entries = 0 + folder_results = {} + + for folder in ref_path.iterdir(): + if not folder.is_dir(): + continue + + json_path = folder / json_filename + images_path = folder / images_dirname + + if not json_path.exists(): + continue + + print(f"Verifying folder: {folder.name}") + + try: + verification = verify_images(str(json_path), str(images_path)) + folder_results[folder.name] = verification + + total_found += verification["found"] + total_missing += verification["missing"] + total_entries += verification["total"] + + print(f" Found: {verification['found']}/{verification['total']}") + + except Exception as e: + print(f" Error: {e}") + continue + + return { + "found": total_found, + "missing": total_missing, + "total": total_entries, + "folders": folder_results, + } + + +def save_dataset(dataset, output_path): + """Save dataset in Arrow format (includes images).""" + dataset.save_to_disk(output_path) + # print(f"Dataset saved to {output_path}") + + # Or save as JSON + # dataset.to_json(f"{output_path}/data.json") + + # Or save as CSV + # dataset.to_csv(f"{output_path}/data.csv") + + # Or save as Parquet + # dataset.to_parquet(f"{output_path}/data.parquet") + + +def load_dataset_from_disk(dataset_path): + """Load a previously saved dataset.""" + return datasets.load_from_disk(dataset_path) + + +# ============================================================================ +# EXAMPLE USAGE +# ============================================================================ + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + + parser.add_argument( + "--ref_path", + type=str, + help=("Root path containing multiple folders"), + default="", + ) + + parser.add_argument( + 
"--push_to_hub", + action="store_true", + default=False, + help=("If True push the merged dataset to Hugging Face Hub"), + ) + parser.add_argument( + "--token", + type=str, + help=("Hugging Face authentication token"), + default="", + ) + parser.add_argument( + "--repo_id", + type=str, + help=("Hugging Face repository ID"), + default="nicolaleo/LLM-alt-text-assessment", + ) + args = parser.parse_args() + + # Example 1: Verify images across all folders + print("=== Verifying Images in All Folders ===") + verification = verify_images_in_folders(args.ref_path) + print("\n######## Verifier output ################################") + print(f"Total Found: {verification['found']}/{verification['total']}") + print(f"Total Missing: {verification['missing']}/{verification['total']}") + print("########################################") + + # Show per-folder breakdown + print("\n=== Per-Folder Breakdown ===") + for folder_name, results in verification["folders"].items(): + print(f"{folder_name}: {results['found']}/{results['total']} images found") + + # Example 2: Create merged dataset from all folders + print("\n=== Creating Merged Dataset ===") + ds = create_dataset_from_folders(args.ref_path) + print("\n######## Merged Dataset output ################################") + print(f"Final dataset size: {len(ds)} entries") + print("########################################") + + # Example 3: Analyze the merged dataset + print("\n=== Dataset Analysis ===") + print(ds) + + # Example 3: Access images and data + print("\n=== First Example ===") + first_example = ds[0] + print(f"Image URL: {first_example['image_url']}") + print(f"Alt text: {first_example['alt_text']}") + print(f"Assessment: {first_example['assessment']}") + print(f"New alt text: {first_example['new_alt_text']}") + print(f"Image loaded: {first_example['image'] is not None}") + + if first_example["image"] is not None: + img = first_example["image"] + print(f"Image size: {img.size}") + # img.show() # Uncomment to display 
image + + # Example 4: Filter and work with merged data + print("\n=== Filtering Merged Dataset ===") + successful = ds.filter(lambda x: x["assessment"] == "success") + print(f"Successful assessments: {len(successful)}") + + high_rated = ds.filter(lambda x: int(x["original_alt_text_assessment"]) >= 4) + print(f"High-rated (>=4): {len(high_rated)}") + + # Example 5: Save merged dataset + print("\n=== Saving Merged Dataset ===") + save_dataset(ds, "alt_text_merged_dataset") + + # Example 6: Load dataset + print("\n=== Loading Dataset ===") + loaded_ds = load_dataset_from_disk("alt_text_merged_dataset") + print(f"Loaded {len(loaded_ds)} entries") + + if args.push_to_hub: + # Push to Hugging Face Hub (optional) + push_to_hub_example(repo_id=args.repo_id, token=args.token) # see push_to_hub_example for details \ No newline at end of file diff --git a/scripts/requirements_extra.txt b/scripts/requirements_extra.txt index 574a36a..d27bdd4 100644 --- a/scripts/requirements_extra.txt +++ b/scripts/requirements_extra.txt @@ -5,4 +5,5 @@ transformers==4.57.1 numpy==2.2.6 matplotlib==3.10.7 scikit-learn==1.7.2 -sentence-transformers==5.1.2 \ No newline at end of file +sentence-transformers==5.1.2 +datasets==4.4.1 \ No newline at end of file