upgrade e costruzione datasetHF
This commit is contained in:
parent
cde7259ed7
commit
85c03b3a1a
3
UI/.env
3
UI/.env
|
|
@ -1,4 +1,5 @@
|
||||||
DB_PATH=persistence/wcag_validator_ui.db
|
DB_PATH=persistence/wcag_validator_ui.db
|
||||||
WCAG_REST_SERVER_URL=http://localhost:8000
|
WCAG_REST_SERVER_URL=http://localhost:8000
|
||||||
URL_LIST_old=["http://www.amazon.it","https://web.archive.org/web/20230630235957/http://www.amazon.com/", "https://web.archive.org/web/20251130033532/https://www.ebay.com/"]
|
URL_LIST_old=["http://www.amazon.it","https://web.archive.org/web/20230630235957/http://www.amazon.com/", "https://web.archive.org/web/20251130033532/https://www.ebay.com/"]
|
||||||
URL_LIST=["https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://bestbuy.com","https://macys.com","https://homedepot.com","https://costco.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.ansa.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
|
URL_LIST_old=["https://www.amazon.com/s?k=magllioni&crid=CGD2UWO33O58&sprefix=magllioni%2Caps%2C209&ref=nb_sb_noss","https://web.archive.org/web/20251011214807/https://www.ilfattoquotidiano.it/","https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
|
||||||
|
URL_LIST=["https://giove.isti.cnr.it/users/manca/eBay.html","http://www.amazon.it"]
|
||||||
|
|
@ -31,6 +31,45 @@ import sqlite3
|
||||||
WCAG_VALIDATOR_RESTSERVER_HEADERS = [("Content-Type", "application/json")]
|
WCAG_VALIDATOR_RESTSERVER_HEADERS = [("Content-Type", "application/json")]
|
||||||
|
|
||||||
|
|
||||||
|
def process_dataframe(db_path, url, updated_df, user_state={}):
|
||||||
|
|
||||||
|
print("Processing dataframe to adjust columns...")
|
||||||
|
column_rating_name = "User Assessment for LLM Proposal"
|
||||||
|
|
||||||
|
# Get the assessment column
|
||||||
|
try:
|
||||||
|
updated_df[column_rating_name] = updated_df[column_rating_name].astype(int)
|
||||||
|
except ValueError:
|
||||||
|
return "Error: User Assessment for LLM Proposal must be an integer"
|
||||||
|
|
||||||
|
if (updated_df[column_rating_name] < 1).any() or (
|
||||||
|
updated_df[column_rating_name] > 5
|
||||||
|
).any():
|
||||||
|
return "Error: User Assessment for LLM Proposal must be between 1 and 5"
|
||||||
|
|
||||||
|
dataframe_json = updated_df.to_json(orient="records")
|
||||||
|
connection_db = sqlite3.connect(db_path)
|
||||||
|
json_user_str = json.dumps({"username": user_state["username"]}, ensure_ascii=False)
|
||||||
|
try:
|
||||||
|
# insert after everything to keep datetime aligned
|
||||||
|
db_persistence_insert(
|
||||||
|
connection_db=connection_db,
|
||||||
|
insert_type="wcag_user_llm_alttext_assessments",
|
||||||
|
page_url=url,
|
||||||
|
user=json_user_str,
|
||||||
|
llm_model="",
|
||||||
|
json_in_str=dataframe_json, # to improve
|
||||||
|
json_out_str="done via UI",
|
||||||
|
table="wcag_user_assessments",
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
print("Error inserting user assessment into database:", str(e))
|
||||||
|
finally:
|
||||||
|
if connection_db:
|
||||||
|
connection_db.close()
|
||||||
|
return "User assessment saved successfully!"
|
||||||
|
|
||||||
|
|
||||||
def load_images_from_json(json_input):
|
def load_images_from_json(json_input):
|
||||||
"""Extract URLs and alt text from JSON and create HTML gallery"""
|
"""Extract URLs and alt text from JSON and create HTML gallery"""
|
||||||
try:
|
try:
|
||||||
|
|
@ -40,7 +79,7 @@ def load_images_from_json(json_input):
|
||||||
return "No images found in JSON", ""
|
return "No images found in JSON", ""
|
||||||
|
|
||||||
images = data["images"]
|
images = data["images"]
|
||||||
info_text = f"Found {len(images)} image(s)\n"
|
info_text = f"Found {len(images)} image(s)"
|
||||||
print(f"Found {len(data['images'])} image(s)")
|
print(f"Found {len(data['images'])} image(s)")
|
||||||
|
|
||||||
# Create HTML gallery with checkboxes and assessment forms
|
# Create HTML gallery with checkboxes and assessment forms
|
||||||
|
|
@ -58,14 +97,14 @@ def load_images_from_json(json_input):
|
||||||
padding: 10px;
|
padding: 10px;
|
||||||
background: white;
|
background: white;
|
||||||
}
|
}
|
||||||
.image-card:has(input:checked) {
|
.image-card:has(input[type="checkbox"]:checked) {
|
||||||
border-color: #2196F3;
|
border-color: #2196F3;
|
||||||
background: #a7c1c1;
|
background: #a7c1c1;
|
||||||
}
|
}
|
||||||
.image-card img {
|
.image-card img {
|
||||||
width: 100%;
|
width: 100%;
|
||||||
height: 200px;
|
height: 200px;
|
||||||
object-fit: cover;
|
object-fit: scale-down;
|
||||||
border-radius: 4px;
|
border-radius: 4px;
|
||||||
}
|
}
|
||||||
.image-info {
|
.image-info {
|
||||||
|
|
@ -93,7 +132,7 @@ def load_images_from_json(json_input):
|
||||||
display: none;
|
display: none;
|
||||||
margin-top: 15px;
|
margin-top: 15px;
|
||||||
padding: 10px;
|
padding: 10px;
|
||||||
background: #f0f7ff;
|
background: #7896b9;
|
||||||
border-radius: 4px;
|
border-radius: 4px;
|
||||||
border: 1px solid #2196F3;
|
border: 1px solid #2196F3;
|
||||||
}
|
}
|
||||||
|
|
@ -109,18 +148,22 @@ def load_images_from_json(json_input):
|
||||||
margin-bottom: 5px;
|
margin-bottom: 5px;
|
||||||
font-size: 13px;
|
font-size: 13px;
|
||||||
}
|
}
|
||||||
.range-container {
|
|
||||||
|
.radio-container {
|
||||||
|
display: flex;
|
||||||
|
gap: 15px;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.radio-option {
|
||||||
display: flex;
|
display: flex;
|
||||||
align-items: center;
|
align-items: center;
|
||||||
gap: 10px;
|
gap: 5px;
|
||||||
|
cursor: pointer;
|
||||||
}
|
}
|
||||||
.range-container input[type="range"] {
|
|
||||||
flex: 1;
|
.radio-label {
|
||||||
}
|
font-weight: 500;
|
||||||
.range-value {
|
|
||||||
font-weight: bold;
|
|
||||||
min-width: 20px;
|
|
||||||
text-align: center;
|
|
||||||
}
|
}
|
||||||
textarea {
|
textarea {
|
||||||
width: 100%;
|
width: 100%;
|
||||||
|
|
@ -166,12 +209,28 @@ def load_images_from_json(json_input):
|
||||||
<div id="panel-{idx}" class="assessment-panel">
|
<div id="panel-{idx}" class="assessment-panel">
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label>Rate current alt-text:</label>
|
<label>Rate current alt-text:</label>
|
||||||
<div class="range-container">
|
<div class="radio-container">
|
||||||
<input type="range" min="1" max="5" value="3"
|
<label class="radio-option">
|
||||||
class="assessment-range" data-index="{idx}"
|
<input type="radio" name="assessment-{idx}" value="1" data-index="{idx}">
|
||||||
oninput="document.getElementById('range-value-{idx}').textContent = this.value">
|
<span class="radio-label">1</span>
|
||||||
<span id="range-value-{idx}" class="range-value">3</span>
|
</label>
|
||||||
</div>
|
<label class="radio-option">
|
||||||
|
<input type="radio" name="assessment-{idx}" value="2" data-index="{idx}">
|
||||||
|
<span class="radio-label">2</span>
|
||||||
|
</label>
|
||||||
|
<label class="radio-option">
|
||||||
|
<input type="radio" name="assessment-{idx}" value="3" data-index="{idx}" checked>
|
||||||
|
<span class="radio-label">3</span>
|
||||||
|
</label>
|
||||||
|
<label class="radio-option">
|
||||||
|
<input type="radio" name="assessment-{idx}" value="4" data-index="{idx}">
|
||||||
|
<span class="radio-label">4</span>
|
||||||
|
</label>
|
||||||
|
<label class="radio-option">
|
||||||
|
<input type="radio" name="assessment-{idx}" value="5" data-index="{idx}">
|
||||||
|
<span class="radio-label">5</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label>New alt-text:</label>
|
<label>New alt-text:</label>
|
||||||
|
|
@ -226,7 +285,7 @@ def load_llm_assessment_from_json(json_input):
|
||||||
{
|
{
|
||||||
"Original Alt Text": alt_text_original,
|
"Original Alt Text": alt_text_original,
|
||||||
"LLM Assessment": original_alt_text_assessment,
|
"LLM Assessment": original_alt_text_assessment,
|
||||||
"Proposed Alt Text": new_alt_text,
|
"LLM Proposed Alt Text": new_alt_text,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -257,7 +316,7 @@ def make_alttext_llm_assessment_api_call(
|
||||||
if not selected_images or len(selected_images) == 0:
|
if not selected_images or len(selected_images) == 0:
|
||||||
info_text = "No images selected"
|
info_text = "No images selected"
|
||||||
print(info_text)
|
print(info_text)
|
||||||
return pd.DataFrame()
|
return "LLM assessment not started", pd.DataFrame()
|
||||||
|
|
||||||
# prepare data for insertion
|
# prepare data for insertion
|
||||||
json_in_str = {}
|
json_in_str = {}
|
||||||
|
|
@ -267,6 +326,7 @@ def make_alttext_llm_assessment_api_call(
|
||||||
user_assessments = []
|
user_assessments = []
|
||||||
user_new_alt_texts = []
|
user_new_alt_texts = []
|
||||||
selected_image_id = []
|
selected_image_id = []
|
||||||
|
user_assessments_llm_proposal = []
|
||||||
for img in selected_images:
|
for img in selected_images:
|
||||||
selected_urls.append(img["image_url"])
|
selected_urls.append(img["image_url"])
|
||||||
selected_alt_text_original.append(img["original_alt_text"])
|
selected_alt_text_original.append(img["original_alt_text"])
|
||||||
|
|
@ -275,6 +335,7 @@ def make_alttext_llm_assessment_api_call(
|
||||||
selected_image_id.append(
|
selected_image_id.append(
|
||||||
int(img["image_index"]) + 1
|
int(img["image_index"]) + 1
|
||||||
) # add the id selected (+1 for index alignment)
|
) # add the id selected (+1 for index alignment)
|
||||||
|
user_assessments_llm_proposal.append(3) # default value for now
|
||||||
json_in_str["images_urls"] = selected_urls
|
json_in_str["images_urls"] = selected_urls
|
||||||
json_in_str["images_alt_text_original"] = selected_alt_text_original
|
json_in_str["images_alt_text_original"] = selected_alt_text_original
|
||||||
json_out_str["user_assessments"] = user_assessments
|
json_out_str["user_assessments"] = user_assessments
|
||||||
|
|
@ -302,9 +363,17 @@ def make_alttext_llm_assessment_api_call(
|
||||||
)
|
)
|
||||||
# return response
|
# return response
|
||||||
info_dataframe = load_llm_assessment_from_json(response)
|
info_dataframe = load_llm_assessment_from_json(response)
|
||||||
|
|
||||||
|
# add the UI ids and other fields to to api response
|
||||||
info_dataframe.insert(
|
info_dataframe.insert(
|
||||||
0, "Image #", selected_image_id
|
0, "Image #", selected_image_id
|
||||||
) # add the UI ids from to api response
|
) # add the UI ids from to api response
|
||||||
|
info_dataframe.insert(2, "User Assessment", user_assessments)
|
||||||
|
|
||||||
|
info_dataframe.insert(3, "User Proposed Alt Text", user_new_alt_texts)
|
||||||
|
info_dataframe["User Assessment for LLM Proposal"] = (
|
||||||
|
user_assessments_llm_proposal
|
||||||
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {"error": str(e)}
|
return {"error": str(e)}
|
||||||
|
|
@ -326,7 +395,7 @@ def make_alttext_llm_assessment_api_call(
|
||||||
finally:
|
finally:
|
||||||
if connection_db:
|
if connection_db:
|
||||||
connection_db.close()
|
connection_db.close()
|
||||||
return info_dataframe
|
return "LLM assessment completed", info_dataframe
|
||||||
|
|
||||||
|
|
||||||
def make_image_extraction_api_call(
|
def make_image_extraction_api_call(
|
||||||
|
|
@ -449,9 +518,10 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
||||||
images_number = gr.Slider(
|
images_number = gr.Slider(
|
||||||
5,
|
5,
|
||||||
100,
|
100,
|
||||||
value=30,
|
value=50,
|
||||||
step=5,
|
step=5,
|
||||||
label="Max number of images to retrieve",
|
label="Max number of images to retrieve",
|
||||||
|
visible=False,
|
||||||
)
|
)
|
||||||
with gr.Column():
|
with gr.Column():
|
||||||
|
|
||||||
|
|
@ -459,39 +529,54 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
||||||
"Extract Images & Alt Texts", variant="primary"
|
"Extract Images & Alt Texts", variant="primary"
|
||||||
)
|
)
|
||||||
alttext_api_call_btn = gr.Button(
|
alttext_api_call_btn = gr.Button(
|
||||||
"Alt Text LLM Assessment",
|
"Start LLM Assessment",
|
||||||
variant="secondary",
|
variant="secondary",
|
||||||
interactive=False,
|
interactive=False,
|
||||||
)
|
)
|
||||||
|
image_info_output = gr.Textbox(
|
||||||
|
label="Activity tracking", lines=1
|
||||||
|
)
|
||||||
|
|
||||||
with gr.Row():
|
with gr.Row(visible=False) as alttext_results_row:
|
||||||
|
|
||||||
image_info_output = gr.Textbox(label="Managed Images", lines=5)
|
|
||||||
|
|
||||||
# Use DataFrame for tabular output
|
# Use DataFrame for tabular output
|
||||||
alttext_info_output = gr.DataFrame(
|
alttext_info_output = gr.DataFrame(
|
||||||
headers=[
|
headers=[
|
||||||
"Image #",
|
"Image #",
|
||||||
"Original Alt Text",
|
"Original Alt Text",
|
||||||
|
"User Assessment",
|
||||||
|
"User Proposed Alt Text",
|
||||||
"LLM Assessment",
|
"LLM Assessment",
|
||||||
"Proposed Alt Text",
|
"LLM Proposed Alt Text",
|
||||||
|
"User Assessment for LLM Proposal",
|
||||||
],
|
],
|
||||||
label="LLM Assessment Results",
|
label="LLM Assessment Results",
|
||||||
wrap=True, # Wrap text in cells
|
wrap=True, # Wrap text in cells
|
||||||
interactive=False,
|
interactive=True,
|
||||||
|
scale=7,
|
||||||
)
|
)
|
||||||
|
with gr.Column():
|
||||||
|
save_user_assessment_btn = gr.Button(
|
||||||
|
"Save Your Assessment",
|
||||||
|
variant="secondary",
|
||||||
|
interactive=True,
|
||||||
|
scale=1,
|
||||||
|
)
|
||||||
|
gr.Markdown(
|
||||||
|
"ℹ Info: to assess the LLM output, only the values for the 'User Assessment for LLM Proposal' column need to be changed."
|
||||||
|
)
|
||||||
|
|
||||||
with gr.Row():
|
with gr.Row():
|
||||||
|
|
||||||
gallery_html = gr.HTML(label="Image Gallery")
|
gallery_html = gr.HTML(label="Image Gallery")
|
||||||
|
|
||||||
image_extraction_api_call_btn.click(
|
image_extraction_api_call_btn.click(
|
||||||
fn=lambda: ("", "", pd.DataFrame(), gr.Button(interactive=False)),
|
fn=lambda: ("", "", gr.update(visible=False), gr.Button(interactive=False)),
|
||||||
inputs=[],
|
inputs=[],
|
||||||
outputs=[
|
outputs=[
|
||||||
image_info_output,
|
image_info_output,
|
||||||
gallery_html,
|
gallery_html,
|
||||||
alttext_info_output,
|
alttext_results_row,
|
||||||
alttext_api_call_btn,
|
alttext_api_call_btn,
|
||||||
],
|
],
|
||||||
).then(
|
).then(
|
||||||
|
|
@ -515,7 +600,7 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
||||||
wcag_rest_server_url_state,
|
wcag_rest_server_url_state,
|
||||||
user_state,
|
user_state,
|
||||||
],
|
],
|
||||||
outputs=[alttext_info_output],
|
outputs=[image_info_output, alttext_info_output],
|
||||||
js="""
|
js="""
|
||||||
(url_input,gallery_html) => {
|
(url_input,gallery_html) => {
|
||||||
const checkboxes = document.querySelectorAll('.image-checkbox:checked');
|
const checkboxes = document.querySelectorAll('.image-checkbox:checked');
|
||||||
|
|
@ -533,7 +618,8 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
||||||
const index = checkbox.dataset.index;
|
const index = checkbox.dataset.index;
|
||||||
const imageUrl = checkbox.dataset.imgurl;
|
const imageUrl = checkbox.dataset.imgurl;
|
||||||
const originalAlt = document.querySelector('.original-alt[data-index="' + index + '"]').value;
|
const originalAlt = document.querySelector('.original-alt[data-index="' + index + '"]').value;
|
||||||
const assessment = document.querySelector('.assessment-range[data-index="' + index + '"]').value;
|
const assessment = document.querySelector('input[name="assessment-' + index + '"]:checked').value;
|
||||||
|
console.log("assessment:",assessment)
|
||||||
const newAltText = document.querySelector('.new-alt-text[data-index="' + index + '"]').value;
|
const newAltText = document.querySelector('.new-alt-text[data-index="' + index + '"]').value;
|
||||||
|
|
||||||
selectedData.push({
|
selectedData.push({
|
||||||
|
|
@ -548,6 +634,16 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
|
||||||
return [url_input,JSON.stringify(selectedData)];
|
return [url_input,JSON.stringify(selectedData)];
|
||||||
}
|
}
|
||||||
""",
|
""",
|
||||||
|
).then(
|
||||||
|
fn=lambda: gr.update(visible=True),
|
||||||
|
inputs=[],
|
||||||
|
outputs=[alttext_results_row],
|
||||||
|
)
|
||||||
|
|
||||||
|
save_user_assessment_btn.click(
|
||||||
|
fn=process_dataframe,
|
||||||
|
inputs=[db_path_state, url_input, alttext_info_output, user_state],
|
||||||
|
outputs=[image_info_output],
|
||||||
)
|
)
|
||||||
|
|
||||||
# placed here at the end to give full contents visibility to events
|
# placed here at the end to give full contents visibility to events
|
||||||
|
|
|
||||||
|
|
@ -55,7 +55,6 @@ class ImageExtractor:
|
||||||
# Also check query parameters (e.g., format=jpeg)
|
# Also check query parameters (e.g., format=jpeg)
|
||||||
return any(fmt in img_url.lower() for fmt in self.SUPPORTED_FORMATS)
|
return any(fmt in img_url.lower() for fmt in self.SUPPORTED_FORMATS)
|
||||||
|
|
||||||
|
|
||||||
async def _download_image(self, image_url, output_dir="images") -> None:
|
async def _download_image(self, image_url, output_dir="images") -> None:
|
||||||
|
|
||||||
# Parse the URL to get the path without query parameters
|
# Parse the URL to get the path without query parameters
|
||||||
|
|
@ -79,7 +78,7 @@ class ImageExtractor:
|
||||||
|
|
||||||
# Sanitize image name (remove special characters, limit length)
|
# Sanitize image name (remove special characters, limit length)
|
||||||
image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
|
image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
|
||||||
image_name = image_name[:200] # Limit filename length
|
image_name = image_name[:50] # Limit filename length
|
||||||
|
|
||||||
# If name is empty after sanitization, create a hash-based name
|
# If name is empty after sanitization, create a hash-based name
|
||||||
if not image_name:
|
if not image_name:
|
||||||
|
|
@ -88,13 +87,15 @@ class ImageExtractor:
|
||||||
image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
|
image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
|
||||||
|
|
||||||
# Download the image
|
# Download the image
|
||||||
print("getting image:", image_url)
|
print("getting image url:", image_url)
|
||||||
|
print("getting image name:", image_name)
|
||||||
response = requests.get(image_url, timeout=10)
|
response = requests.get(image_url, timeout=10)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Save the image
|
# Save the image
|
||||||
output_path = os.path.join(output_dir, f"{image_name}.{ext}")
|
output_path = os.path.join(output_dir, f"{image_name}.{ext}")
|
||||||
|
print("saving image to:", output_path)
|
||||||
with open(output_path, "wb") as f:
|
with open(output_path, "wb") as f:
|
||||||
f.write(response.content)
|
f.write(response.content)
|
||||||
print(f"Saved: {output_path}")
|
print(f"Saved: {output_path}")
|
||||||
|
|
@ -292,43 +293,36 @@ class ImageExtractor:
|
||||||
error_msg = f"Error extracting context: {str(e)}"
|
error_msg = f"Error extracting context: {str(e)}"
|
||||||
return error_msg, error_msg, error_msg
|
return error_msg, error_msg, error_msg
|
||||||
|
|
||||||
async def _get_page_metadata(self, page) -> Dict[str, Optional[str]]:
|
async def _get_page_metadata(self, page):
|
||||||
"""Extract page metadata including title, description, and keywords."""
|
"""Extract page metadata in one fast evaluate call. Batch DOM extraction inside one evaluate()."""
|
||||||
metadata = {
|
return await page.evaluate(
|
||||||
"title": await page.title(),
|
"""
|
||||||
"description": None,
|
() => {
|
||||||
"keywords": None,
|
const metadata = {
|
||||||
"headings": [],
|
title: document.title || null,
|
||||||
}
|
description: null,
|
||||||
|
keywords: null,
|
||||||
|
headings: []
|
||||||
|
};
|
||||||
|
|
||||||
# Extract meta description
|
const desc = document.querySelector('meta[name="description"]');
|
||||||
try:
|
const keys = document.querySelector('meta[name="keywords"]');
|
||||||
description = await page.locator('meta[name="description"]').get_attribute(
|
metadata.description = desc?.content || null;
|
||||||
"content"
|
metadata.keywords = keys?.content || null;
|
||||||
)
|
|
||||||
metadata["description"] = description
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Extract meta keywords
|
// Collect all headings h1–h6
|
||||||
try:
|
const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
|
||||||
keywords = await page.locator('meta[name="keywords"]').get_attribute(
|
metadata.headings = Array.from(allHeadings)
|
||||||
"content"
|
.map(h => ({
|
||||||
)
|
level: parseInt(h.tagName.substring(1), 10),
|
||||||
metadata["keywords"] = keywords
|
text: h.textContent.trim()
|
||||||
except:
|
}))
|
||||||
pass
|
.filter(h => h.text.length > 0);
|
||||||
|
|
||||||
# Extract all headings (h1-h6)
|
return metadata;
|
||||||
for level in range(1, 7):
|
}
|
||||||
headings = await page.locator(f"h{level}").all_text_contents()
|
"""
|
||||||
for heading in headings:
|
)
|
||||||
if heading.strip():
|
|
||||||
metadata["headings"].append(
|
|
||||||
{"level": level, "text": heading.strip()}
|
|
||||||
)
|
|
||||||
|
|
||||||
return metadata
|
|
||||||
|
|
||||||
async def extract_images(
|
async def extract_images(
|
||||||
self, extract_context=True, specific_images_urls=[]
|
self, extract_context=True, specific_images_urls=[]
|
||||||
|
|
@ -344,15 +338,18 @@ class ImageExtractor:
|
||||||
page = await browser.new_page()
|
page = await browser.new_page()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
#await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads
|
# await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads
|
||||||
# The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
|
# The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
|
||||||
# ---alternative method2: use if there is total awareness of the page's loading pattern and want faster, more reliable execution
|
# ---alternative method2: use if there is total awareness of the page's loading pattern and want faster, more reliable execution
|
||||||
await page.goto(self.url, timeout=50000, wait_until="load")# deafult timeout=30000, 30sec
|
await page.goto(
|
||||||
|
self.url, timeout=50000, wait_until="load"
|
||||||
|
) # deafult timeout=30000, 30sec
|
||||||
# Wait for page to load completely
|
# Wait for page to load completely
|
||||||
await page.wait_for_timeout(2000) # Wait for dynamic content
|
await page.wait_for_timeout(2000) # Wait for dynamic content
|
||||||
# -----
|
# -----
|
||||||
|
|
||||||
if extract_context:
|
if extract_context:
|
||||||
|
print("Getting page metadata...")
|
||||||
# Get page metadata once
|
# Get page metadata once
|
||||||
page_metadata = await self._get_page_metadata(page)
|
page_metadata = await self._get_page_metadata(page)
|
||||||
page_title = page_metadata["title"]
|
page_title = page_metadata["title"]
|
||||||
|
|
@ -367,15 +364,41 @@ class ImageExtractor:
|
||||||
|
|
||||||
if len(specific_images_urls) == 0:
|
if len(specific_images_urls) == 0:
|
||||||
# Find all img elements
|
# Find all img elements
|
||||||
print("Extracting all images from the page",self.url)
|
print("Extracting all images from the page", self.url)
|
||||||
img_elements = await page.locator("img").all()
|
# img_elements = await page.locator("img").all()
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
"Extracting specific images from the page:",
|
"Extracting specific images from the page:",
|
||||||
self.url,
|
self.url,
|
||||||
specific_images_urls,
|
specific_images_urls,
|
||||||
)
|
)
|
||||||
img_elements = []
|
# img_elements = await page.locator("img").all()
|
||||||
|
|
||||||
|
""" # method 3: optimized approach
|
||||||
|
# Get all src attributes in one go
|
||||||
|
all_img_elements = await page.locator("img").all()
|
||||||
|
all_srcs = await page.locator("img").evaluate_all(
|
||||||
|
"elements => elements.map(el => el.src || '')"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Filter with the pre-fetched src values
|
||||||
|
img_elements = [
|
||||||
|
elem for elem, src in zip(all_img_elements, all_srcs)
|
||||||
|
if src in specific_images_urls
|
||||||
|
]
|
||||||
|
"""
|
||||||
|
|
||||||
|
""" #method 2: single pass to find matching images
|
||||||
|
for img_element in all_img_elements: #This is more efficient than making separate locator queries for each specific URL and avoids timeout issues.
|
||||||
|
try:
|
||||||
|
src = await img_element.get_attribute("src")
|
||||||
|
print("found image src:", src)
|
||||||
|
if src in specific_images_urls:
|
||||||
|
img_elements.append(img_element)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error getting src attribute from image: {str(e)}")"""
|
||||||
|
|
||||||
|
""" # method 1: separate locator queries for each specific URL
|
||||||
for url in specific_images_urls:
|
for url in specific_images_urls:
|
||||||
try:
|
try:
|
||||||
img_element = await page.locator(
|
img_element = await page.locator(
|
||||||
|
|
@ -384,8 +407,11 @@ class ImageExtractor:
|
||||||
if img_element:
|
if img_element:
|
||||||
img_elements.append(img_element)
|
img_elements.append(img_element)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error locating image with src {url}: {str(e)}")
|
print(f"Error locating image with src {url}: {str(e)}")"""
|
||||||
|
|
||||||
|
img_elements = await page.locator(
|
||||||
|
"img"
|
||||||
|
).all() # unified approach to start with all images and filter later
|
||||||
image_source_list = [] # avoid multiple check for the same image url
|
image_source_list = [] # avoid multiple check for the same image url
|
||||||
images_data = []
|
images_data = []
|
||||||
|
|
||||||
|
|
@ -404,6 +430,12 @@ class ImageExtractor:
|
||||||
if not src:
|
if not src:
|
||||||
print("image has no src attribute. Skipped.")
|
print("image has no src attribute. Skipped.")
|
||||||
continue
|
continue
|
||||||
|
if (
|
||||||
|
src not in specific_images_urls
|
||||||
|
and len(specific_images_urls) > 0
|
||||||
|
):
|
||||||
|
# print("image src",src,"not in the specific images list. Skipped.")
|
||||||
|
continue
|
||||||
|
|
||||||
if src not in image_source_list:
|
if src not in image_source_list:
|
||||||
image_source_list.append(src)
|
image_source_list.append(src)
|
||||||
|
|
@ -434,6 +466,7 @@ class ImageExtractor:
|
||||||
alt_text = await img.get_attribute("alt") or ""
|
alt_text = await img.get_attribute("alt") or ""
|
||||||
|
|
||||||
if extract_context:
|
if extract_context:
|
||||||
|
print("Extracting context for image:", img_url)
|
||||||
# Get surrounding HTML context (full, immediate, and nearby)
|
# Get surrounding HTML context (full, immediate, and nearby)
|
||||||
html_context, immediate_context, nearby_text = (
|
html_context, immediate_context, nearby_text = (
|
||||||
await self._get_element_context(page, img)
|
await self._get_element_context(page, img)
|
||||||
|
|
|
||||||
|
|
@ -76,39 +76,7 @@ class MLLMManager:
|
||||||
return payload
|
return payload
|
||||||
|
|
||||||
def get_alt_text_system_prompt(self):
|
def get_alt_text_system_prompt(self):
|
||||||
system_prompt_old = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
|
|
||||||
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
|
|
||||||
the same information as the image, and should be able to substitute for the non-text content. The text alternative would
|
|
||||||
be brief but as informative as possible.
|
|
||||||
|
|
||||||
Follow these instructions carefully:
|
|
||||||
1. You will be provided as input with the following:
|
|
||||||
- The image found on the webpage.
|
|
||||||
- The associated alternative text. When the alt-text is empty or absent, you will be explicitly informed.
|
|
||||||
- The surrounding context of the image.
|
|
||||||
- The page title, headings and the content of the “keywords” and “description” <meta> tag, if found.
|
|
||||||
|
|
||||||
2. Determine the function and purpose of the image by analyzing these elements. Take into account the purpose and function
|
|
||||||
of the associated image by considering the page context. Check also if the image is, or is associated with, a link or a button,
|
|
||||||
and consider this in your judgement. If the image contains text use that as part of the context.
|
|
||||||
|
|
||||||
3. Provide a final assessment based on the following:
|
|
||||||
- 'success' if you can assess with 'sufficient certainty' the alt-text is appropriate in relation to the image purpose,
|
|
||||||
- 'failure' if you can assess with 'sufficient certainty' that the alt-text is NOT appropriate,
|
|
||||||
- 'warning' if you cannot determine with 'sufficient certainty'.
|
|
||||||
where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80
|
|
||||||
|
|
||||||
4. The original alt-text assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only.
|
|
||||||
|
|
||||||
5. Provide a brief reasoning for your judgment. If the image contains text, write it verbatim. Your response should be in English.
|
|
||||||
|
|
||||||
6. Keep your response within 150 words.
|
|
||||||
|
|
||||||
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
|
|
||||||
|
|
||||||
8. Here is the JSON format the results must have:
|
|
||||||
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
|
|
||||||
|
|
||||||
# https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
|
# https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
|
||||||
system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
|
system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
|
||||||
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
|
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
|
||||||
|
|
@ -122,7 +90,7 @@ class MLLMManager:
|
||||||
What purpose does it fulfill?
|
What purpose does it fulfill?
|
||||||
If I could not use the image content, what words would I use to convey the same function and/or information?
|
If I could not use the image content, what words would I use to convey the same function and/or information?
|
||||||
|
|
||||||
When image content contains words that are important to understanding the content, the alt text should include those words
|
When image content contains words that are important to understanding the content, the alt text should include those words.
|
||||||
|
|
||||||
Follow these instructions carefully:
|
Follow these instructions carefully:
|
||||||
1. You will be provided as input with the following:
|
1. You will be provided as input with the following:
|
||||||
|
|
@ -147,7 +115,7 @@ class MLLMManager:
|
||||||
|
|
||||||
6. Keep your response within 150 words.
|
6. Keep your response within 150 words.
|
||||||
|
|
||||||
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
|
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. Use the same language as the original alt-text.
|
||||||
|
|
||||||
8. Here is the JSON format the results must have:
|
8. Here is the JSON format the results must have:
|
||||||
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
|
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
|
||||||
|
|
@ -181,7 +149,7 @@ class MLLMManager:
|
||||||
print("Using end_point:", self.end_point)
|
print("Using end_point:", self.end_point)
|
||||||
|
|
||||||
alt_text_system_prompt = self.get_alt_text_system_prompt()
|
alt_text_system_prompt = self.get_alt_text_system_prompt()
|
||||||
print("alt_text_system_prompt:", alt_text_system_prompt)
|
#print("alt_text_system_prompt:", alt_text_system_prompt)
|
||||||
|
|
||||||
mllm_responses = []
|
mllm_responses = []
|
||||||
for img_info in images:
|
for img_info in images:
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,7 @@ class ExtractImagesRoutes:
|
||||||
self, request: Request, data: ExtractImages
|
self, request: Request, data: ExtractImages
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Return the alt text validation assessment based on WCAG guidelines"""
|
"""Return the alt text validation assessment based on WCAG guidelines"""
|
||||||
|
print("Received extract images request.")
|
||||||
try:
|
try:
|
||||||
json_content = json.loads(data.model_dump_json())
|
json_content = json.loads(data.model_dump_json())
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -53,6 +53,7 @@ class WCAGAltTextValuationRoutes:
|
||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Return the alt text validation assessment based on WCAG guidelines"""
|
"""Return the alt text validation assessment based on WCAG guidelines"""
|
||||||
try:
|
try:
|
||||||
|
print("Received wcag alttext validation request.")
|
||||||
json_content = json.loads(data.model_dump_json())
|
json_content = json.loads(data.model_dump_json())
|
||||||
mllm_model_id = self.mllm_settings["mllm_model_id"]
|
mllm_model_id = self.mllm_settings["mllm_model_id"]
|
||||||
|
|
||||||
|
|
@ -67,7 +68,12 @@ class WCAGAltTextValuationRoutes:
|
||||||
.replace(":", "")
|
.replace(":", "")
|
||||||
.replace("//", "_")
|
.replace("//", "_")
|
||||||
.replace("/", "_")
|
.replace("/", "_")
|
||||||
|
.replace("%2", "_")
|
||||||
|
.replace("?", "_")
|
||||||
|
.replace("=", "_")
|
||||||
|
.replace("&", "_")
|
||||||
)
|
)
|
||||||
|
url_path=url_path[:50] # limit length
|
||||||
now = datetime.now(timezone.utc)
|
now = datetime.now(timezone.utc)
|
||||||
now_str = now.strftime("%Y_%m_%d-%H_%M_%S")
|
now_str = now.strftime("%Y_%m_%d-%H_%M_%S")
|
||||||
folder_str = mllm_model_id.replace(":", "-") + "_" + now_str
|
folder_str = mllm_model_id.replace(":", "-") + "_" + now_str
|
||||||
|
|
@ -93,7 +99,7 @@ class WCAGAltTextValuationRoutes:
|
||||||
# Extract images
|
# Extract images
|
||||||
logging.info(f"Extracting images from: {json_content['page_url']}")
|
logging.info(f"Extracting images from: {json_content['page_url']}")
|
||||||
images = await image_extractor.extract_images(
|
images = await image_extractor.extract_images(
|
||||||
specific_images_urls=json_content["specific_images_urls"]
|
specific_images_urls=json_content["specific_images_urls"],extract_context=True
|
||||||
)
|
)
|
||||||
# MLLM settings
|
# MLLM settings
|
||||||
mllm_end_point = self.mllm_settings["mllm_end_point"]
|
mllm_end_point = self.mllm_settings["mllm_end_point"]
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,541 @@
|
||||||
|
# to launch: python build_dataset_from_folder.py --ref_path "" --push_to_hub --repo_id "nicolaleo/LLM-alt-text-assessment" --token ""
|
||||||
|
|
||||||
|
from datasets import Dataset, DatasetDict
|
||||||
|
import datasets
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from PIL import Image
|
||||||
|
import hashlib
|
||||||
|
import urllib.parse
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
|
||||||
|
'''
|
||||||
|
# Dataset metadata
|
||||||
|
_DESCRIPTION = """\
|
||||||
|
Dataset for image alt-text assessment and improvement using MLLM responses.
|
||||||
|
Contains images, original alt-texts, quality assessments, and improved versions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_CITATION = """\
|
||||||
|
@misc{alt_text_assessment,
|
||||||
|
title={Alt-Text Assessment Dataset},
|
||||||
|
year={2024}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class AltTextDataset(datasets.GeneratorBasedBuilder):
|
||||||
|
"""Dataset for alt-text assessment with images and MLLM responses."""
|
||||||
|
|
||||||
|
VERSION = datasets.Version("1.0.0")
|
||||||
|
|
||||||
|
def _info(self):
|
||||||
|
return datasets.DatasetInfo(
|
||||||
|
description=_DESCRIPTION,
|
||||||
|
features=datasets.Features({
|
||||||
|
"image": datasets.Image(),
|
||||||
|
"image_url": datasets.Value("string"),
|
||||||
|
"alt_text": datasets.Value("string"),
|
||||||
|
"original_alt_text_assessment": datasets.Value("string"),
|
||||||
|
"assessment": datasets.Value("string"),
|
||||||
|
"evaluation_result": datasets.Value("string"),
|
||||||
|
"new_alt_text": datasets.Value("string"),
|
||||||
|
#"source_folder": datasets.Value("string"),
|
||||||
|
}),
|
||||||
|
citation=_CITATION,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _split_generators(self, dl_manager):
|
||||||
|
"""Define data splits."""
|
||||||
|
return [
|
||||||
|
datasets.SplitGenerator(
|
||||||
|
name=datasets.Split.TRAIN,
|
||||||
|
gen_kwargs={
|
||||||
|
"json_filepath": "data.json",
|
||||||
|
"images_dir": "images"
|
||||||
|
},
|
||||||
|
),
|
||||||
|
]
|
||||||
|
|
||||||
|
def _generate_examples(self, json_filepath, images_dir):
|
||||||
|
"""Generate examples from JSON file and image directory."""
|
||||||
|
with open(json_filepath, encoding="utf-8") as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
images_path = Path(images_dir)
|
||||||
|
|
||||||
|
for idx, entry in enumerate(data):
|
||||||
|
image_url = entry["image_url"]
|
||||||
|
image_filename = url_to_filename(image_url)
|
||||||
|
image_path = images_path / image_filename
|
||||||
|
|
||||||
|
# Load image if exists, otherwise None
|
||||||
|
image = str(image_path) if image_path.exists() else None
|
||||||
|
|
||||||
|
yield idx, {
|
||||||
|
"image": image,
|
||||||
|
"image_url": image_url,
|
||||||
|
"alt_text": entry["alt_text"],
|
||||||
|
"original_alt_text_assessment": entry["mllm_response"]["original_alt_text_assessment"],
|
||||||
|
"assessment": entry["mllm_response"]["assessment"],
|
||||||
|
"evaluation_result": entry["mllm_response"]["evaluation_result"],
|
||||||
|
"new_alt_text": entry["mllm_response"]["new_alt_text"],
|
||||||
|
}
|
||||||
|
|
||||||
|
'''
|
||||||
|
# ============================================================================
|
||||||
|
# SIMPLE USAGE FUNCTIONS
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
def url_to_filename(image_url):  # must mirror the save step in the image_extractor dependence
    """
    Convert an image URL into the sanitized filename used when saving images.

    Mirrors the naming logic of the image extractor so that dataset entries can
    be matched back to the image files on disk.

    Args:
        image_url: The image URL.

    Returns:
        Sanitized filename with a whitelisted extension (jpg/jpeg/png/gif/webp;
        anything unrecognized falls back to "jpg").
    """
    # Parse the URL so query parameters are excluded from the filename.
    parsed_url = urllib.parse.urlparse(image_url)
    url_path = parsed_url.path

    # The last path segment is the candidate filename.
    filename = url_path.split("/")[-1]
    # Fix: the original f-string had no placeholder and printed a literal
    # instead of the extracted segment.
    print(f"Original filename: '{filename}'")

    # Split filename and extension; no dot means no extension.
    if "." in filename:
        image_name, ext = filename.rsplit(".", 1)
        ext = ext.lower()
    else:
        image_name = filename
        ext = "jpg"

    # Only keep known raster-image extensions; default everything else to jpg.
    if ext not in ["jpg", "jpeg", "png", "gif", "webp"]:
        ext = "jpg"

    # Keep only alphanumerics, dashes and underscores, and cap the length
    # so the result is filesystem-safe.
    image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))

    image_name = image_name[:50]  # Limit filename length

    # Empty after sanitization (e.g. URL ends in "/"): derive a stable,
    # deterministic hash-based name from the full URL.
    if not image_name:
        image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]

    return f"{image_name}.{ext}"
|
||||||
|
|
||||||
|
|
||||||
|
def push_to_hub_example(dataset_path="alt_text_merged_dataset", repo_id="", token=None):
    """
    Upload a previously saved merged dataset to the Hugging Face Hub.

    Args:
        dataset_path: Local directory written by ``save_dataset``.
        repo_id: Target Hub repository id, e.g. "user/dataset-name".
        token: Hugging Face access token; authentication is required.
    """
    from huggingface_hub import login

    print("\n=== Pushing Dataset to Hugging Face Hub ===")

    # Authenticate with the provided token. Alternatives: login() with no
    # arguments prompts interactively, or set the HF_TOKEN environment
    # variable and login() will pick it up automatically.
    login(token=token)

    # Reload the merged dataset and wrap it as a single "train" split.
    splits = DatasetDict({"train": load_dataset_from_disk(dataset_path)})

    # push_to_hub creates the repository if it does not exist and converts
    # the data to Parquet during upload.
    splits.push_to_hub(
        repo_id,
        private=False,  # flip to True for a private dataset
    )

    print("Dataset pushed successfully!")
    print(f"View at: https://huggingface.co/datasets/{repo_id}")
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataset_from_json(json_filepath, json_filepath_images, images_dir="images"):
    """
    Create a Hugging Face Dataset from JSON file with local images.

    Args:
        json_filepath: Path to JSON file with the MLLM assessment entries
        json_filepath_images: Path to the extracted-images JSON; its entries
            must be parallel (same order, same length) to json_filepath's,
            since they are matched by positional index below
        images_dir: Directory containing the images (default: "images")

    Returns:
        datasets.Dataset object with images loaded
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    with open(json_filepath_images, "r", encoding="utf-8") as f:
        data_images = json.load(f)

    images_path = Path(images_dir)

    # Flatten the nested structure and load images: one parallel list per
    # output column, consumed by Dataset.from_dict at the end.
    flattened_data = {
        "image": [],
        "image_url": [],
        "alt_text": [],
        "original_alt_text_assessment": [],
        "assessment": [],
        "evaluation_result": [],
        "new_alt_text": [],
        "page_url": [],
        "html_context": [],
    }

    # count_entry tracks the position in `data` so that skipped entries still
    # advance the index into the parallel `data_images` list.
    count_entry = 0
    for entry in data:
        if (
            entry["mllm_response"]["original_alt_text_assessment"] is None
        ):  # important! skip entries with no MLLM response. not usable data
            print(
                f"Skipping entry with image URL: {entry['image_url']} due to missing MLLM response"
            )
            count_entry += 1
            continue  # Skip entries with no MLLM response
        image_url = entry["image_url"]
        # Filename must match how the extractor saved the file on disk.
        image_filename = url_to_filename(image_url)
        image_path = images_path / image_filename

        # Load image if it exists; missing files become None cells rather
        # than dropping the whole entry.
        if image_path.exists():
            img = Image.open(image_path)
            flattened_data["image"].append(img)
        else:
            print(f"Warning: Image not found: {image_path}")
            flattened_data["image"].append(None)

        flattened_data["image_url"].append(image_url)
        flattened_data["alt_text"].append(entry["alt_text"])
        # Stringified because the raw value may not be a string.
        flattened_data["original_alt_text_assessment"].append(
            str(entry["mllm_response"]["original_alt_text_assessment"])
        )
        flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
        flattened_data["evaluation_result"].append(
            entry["mllm_response"]["evaluation_result"]
        )
        flattened_data["new_alt_text"].append(entry["mllm_response"]["new_alt_text"])
        # NOTE(review): positional join — assumes data_images[i] describes
        # data[i]; confirm both JSON files are written in the same order.
        flattened_data["page_url"].append(data_images[count_entry]["page_url"])
        flattened_data["html_context"].append(data_images[count_entry]["html_context"])

        count_entry += 1

    print(f"Total valid entries loaded: {len(flattened_data['image_url'])}")
    return datasets.Dataset.from_dict(flattened_data)
|
||||||
|
|
||||||
|
|
||||||
|
def create_dataset_from_folders(
    ref_path,
    json_filename="mllm_alttext_assessments.json",
    json_filename_images="extracted_images.json",
    images_dirname="images",
):
    """
    Build one merged dataset from every result folder under ref_path.

    Each usable sub-folder must contain the assessments JSON and the
    extracted-images JSON; an images sub-directory is optional (URL-only runs).

    Args:
        ref_path: Root path containing multiple result folders.
        json_filename: Name of the assessments JSON in each folder.
        json_filename_images: Name of the extracted-images JSON in each folder.
        images_dirname: Name of the images sub-directory in each folder.

    Returns:
        datasets.Dataset with the entries of all folders concatenated.

    Raises:
        ValueError: If no folder could be loaded successfully.
    """
    root = Path(ref_path)
    loaded = []
    ok_count = 0

    for sub in root.iterdir():
        if not sub.is_dir():
            continue

        assessments_path = sub / json_filename
        extracted_path = sub / json_filename_images
        img_dir = sub / images_dirname

        # Both JSON files are mandatory for a folder to be usable.
        if not assessments_path.exists():
            print(f"Skipping {sub.name}: no {json_filename} found")
            continue

        if not extracted_path.exists():
            print(f"Skipping {sub.name}: no {json_filename_images} found")
            continue

        if not img_dir.exists():
            # Not fatal: images may be referenced by URL only.
            print(f"Warning: {sub.name}: images directory not found")

        print(f"Processing folder: {sub.name}")

        try:
            folder_ds = create_dataset_from_json(
                str(assessments_path), str(extracted_path), str(img_dir)
            )
            loaded.append(folder_ds)

            ok_count += 1
            print(f" -> Loaded {len(folder_ds)} entries")
        except Exception as e:
            print(f"Error processing {sub.name}: {e}")
            continue

    if not loaded:
        raise ValueError(f"No valid folders found in {root}")

    # Concatenate the per-folder datasets into one.
    print(f"\n=== Merging {ok_count} folders ===")
    merged = datasets.concatenate_datasets(loaded)
    print(f"Total entries: {len(merged)}")

    return merged
|
||||||
|
|
||||||
|
|
||||||
|
def verify_images(json_filepath, images_dir="images"):
    """
    Check that every image referenced by the assessments JSON exists on disk.

    Args:
        json_filepath: Path to the assessments JSON file.
        images_dir: Directory expected to hold the downloaded images.

    Returns:
        Dict with 'found', 'missing' and 'total' counts plus per-image 'details'.
    """
    with open(json_filepath, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    base_dir = Path(images_dir)

    present = []
    absent = []

    for record in entries:
        url = record["image_url"]
        # Reproduce the extractor's naming scheme to locate the file.
        fname = url_to_filename(url)
        fpath = base_dir / fname
        print("image_url:", url, "image_filename:", fname, "image_path:", fpath)

        if fpath.exists():
            present.append({"url": url, "filename": fname, "path": str(fpath)})
        else:
            absent.append(
                {"url": url, "filename": fname, "expected_path": str(fpath)}
            )

    return {
        "found": len(present),
        "missing": len(absent),
        "total": len(entries),
        "details": {"found_images": present, "missing_images": absent},
    }
|
||||||
|
|
||||||
|
|
||||||
|
def verify_images_in_folders(
    ref_path, json_filename="mllm_alttext_assessments.json", images_dirname="images"
):
    """
    Run verify_images over every sub-folder of ref_path and aggregate counts.

    Args:
        ref_path: Root directory whose sub-folders each hold one crawl result.
        json_filename: Assessments JSON expected inside each sub-folder.
        images_dirname: Images sub-directory expected inside each sub-folder.

    Returns:
        Dict with overall 'found'/'missing'/'total' counts and per-folder
        results under 'folders'.
    """
    root = Path(ref_path)
    total_found = 0
    total_missing = 0
    total_entries = 0
    per_folder = {}

    for sub in root.iterdir():
        if not sub.is_dir():
            continue

        json_path = sub / json_filename
        images_path = sub / images_dirname

        # Folders without the assessments JSON are silently skipped.
        if not json_path.exists():
            continue

        print(f"Verifying folder: {sub.name}")

        try:
            outcome = verify_images(str(json_path), str(images_path))
            per_folder[sub.name] = outcome

            total_found += outcome["found"]
            total_missing += outcome["missing"]
            total_entries += outcome["total"]

            print(f" Found: {outcome['found']}/{outcome['total']}")
        except Exception as e:
            print(f" Error: {e}")
            continue

    return {
        "found": total_found,
        "missing": total_missing,
        "total": total_entries,
        "folders": per_folder,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def save_dataset(dataset, output_path):
    """Persist *dataset* to *output_path* in Arrow format (images included).

    Other export options exist but are not used here:
    ``dataset.to_json(...)``, ``dataset.to_csv(...)``,
    ``dataset.to_parquet(...)``.
    """
    dataset.save_to_disk(output_path)
|
||||||
|
|
||||||
|
|
||||||
|
def load_dataset_from_disk(dataset_path):
    """Reload a dataset previously written with ``save_dataset``."""
    reloaded = datasets.load_from_disk(dataset_path)
    return reloaded
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# EXAMPLE USAGE
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point: verify downloaded images, build the merged HF dataset,
    # run a few sanity checks, save it locally and optionally push to the Hub.

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--ref_path",
        type=str,
        help=("Root path containing multiple folders"),
        default="",
    )

    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        default=False,
        help=("If True push the merged dataset to Hugging Face Hub"),
    )
    parser.add_argument(
        "--token",
        type=str,
        help=("Hugging Face authentication token"),
        default="",
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        help=("Hugging Face repository ID"),
        default="nicolaleo/LLM-alt-text-assessment",
    )
    args = parser.parse_args()

    # Step 1: verify that every image referenced by the JSONs exists on disk.
    print("=== Verifying Images in All Folders ===")
    verification = verify_images_in_folders(args.ref_path)
    print("\n######## Verifier output ################################")
    print(f"Total Found: {verification['found']}/{verification['total']}")
    print(f"Total Missing: {verification['missing']}/{verification['total']}")
    print("########################################")

    # Per-folder breakdown of the verification results.
    print("\n=== Per-Folder Breakdown ===")
    for folder_name, results in verification["folders"].items():
        print(f"{folder_name}: {results['found']}/{results['total']} images found")

    # Step 2: build one merged dataset from all result folders.
    print("\n=== Creating Merged Dataset ===")
    ds = create_dataset_from_folders(args.ref_path)
    print("\n######## Merged Dataset output ################################")
    print(f"Final dataset size: {len(ds)} entries")
    print("########################################")

    # Step 3: print the dataset schema/summary.
    print("\n=== Dataset Analysis ===")
    print(ds)

    # Step 4: inspect the first example (image + metadata access).
    print("\n=== First Example ===")
    first_example = ds[0]
    print(f"Image URL: {first_example['image_url']}")
    print(f"Alt text: {first_example['alt_text']}")
    print(f"Assessment: {first_example['assessment']}")
    print(f"New alt text: {first_example['new_alt_text']}")
    print(f"Image loaded: {first_example['image'] is not None}")

    if first_example["image"] is not None:
        img = first_example["image"]
        print(f"Image size: {img.size}")
        # img.show()  # Uncomment to display image

    # Step 5: example filters over the merged data.
    print("\n=== Filtering Merged Dataset ===")
    successful = ds.filter(lambda x: x["assessment"] == "success")
    print(f"Successful assessments: {len(successful)}")

    # NOTE(review): assumes original_alt_text_assessment always parses as an
    # int — confirm against the MLLM response format.
    high_rated = ds.filter(lambda x: int(x["original_alt_text_assessment"]) >= 4)
    print(f"High-rated (>=4): {len(high_rated)}")

    # Step 6: save the merged dataset to disk (Arrow format).
    print("\n=== Saving Merged Dataset ===")
    save_dataset(ds, "alt_text_merged_dataset")

    # Step 7: reload it as a round-trip check.
    print("\n=== Loading Dataset ===")
    loaded_ds = load_dataset_from_disk("alt_text_merged_dataset")
    print(f"Loaded {len(loaded_ds)} entries")

    if args.push_to_hub:
        # Optionally publish the merged dataset to the Hugging Face Hub.
        push_to_hub_example(repo_id=args.repo_id, token=args.token)  # function below for details
|
||||||
|
|
@ -5,4 +5,5 @@ transformers==4.57.1
|
||||||
numpy==2.2.6
|
numpy==2.2.6
|
||||||
matplotlib==3.10.7
|
matplotlib==3.10.7
|
||||||
scikit-learn==1.7.2
|
scikit-learn==1.7.2
|
||||||
sentence-transformers==5.1.2
|
sentence-transformers==5.1.2
|
||||||
|
datasets==4.4.1
|
||||||
Loading…
Reference in New Issue