upgrade e costruzione datasetHF

This commit is contained in:
Nicola Leonardi 2025-12-09 12:51:12 +01:00
parent cde7259ed7
commit 85c03b3a1a
8 changed files with 763 additions and 116 deletions

View File

@ -1,4 +1,5 @@
DB_PATH=persistence/wcag_validator_ui.db
WCAG_REST_SERVER_URL=http://localhost:8000
URL_LIST_old=["http://www.amazon.it","https://web.archive.org/web/20230630235957/http://www.amazon.com/", "https://web.archive.org/web/20251130033532/https://www.ebay.com/"]
URL_LIST=["https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://bestbuy.com","https://macys.com","https://homedepot.com","https://costco.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.ansa.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
URL_LIST_old=["https://www.amazon.com/s?k=magllioni&crid=CGD2UWO33O58&sprefix=magllioni%2Caps%2C209&ref=nb_sb_noss","https://web.archive.org/web/20251011214807/https://www.ilfattoquotidiano.it/","https://amazon.com","https://ebay.com","https://walmart.com","https://etsy.com","https://target.com","https://wayfair.com","https://www.ansa.it","https://en.wikipedia.org/wiki/Main_Page","https://www.lanazione.it","https://www.bbc.com","https://www.cnn.com","https://www.nytimes.com","https://www.theguardian.com"]
URL_LIST=["https://giove.isti.cnr.it/users/manca/eBay.html","http://www.amazon.it"]

View File

@ -31,6 +31,45 @@ import sqlite3
WCAG_VALIDATOR_RESTSERVER_HEADERS = [("Content-Type", "application/json")]
def process_dataframe(db_path, url, updated_df, user_state=None):
    """Validate the user-assessment column of *updated_df* and persist it.

    Args:
        db_path: Path to the SQLite database file.
        url: URL of the page the assessments refer to.
        updated_df: pandas DataFrame edited in the UI; must contain the
            "User Assessment for LLM Proposal" column with integer values 1-5.
        user_state: Mapping with at least a "username" key identifying the
            current user (defaults to an empty mapping).

    Returns:
        A status string suitable for display in the UI: a success message,
        or an "Error: ..." description when validation or the insert fails.
    """
    print("Processing dataframe to adjust columns...")
    user_state = user_state or {}
    column_rating_name = "User Assessment for LLM Proposal"
    # Coerce the assessment column to integers; reject anything that cannot
    # be interpreted as a whole number (TypeError covers None/mixed cells).
    try:
        updated_df[column_rating_name] = updated_df[column_rating_name].astype(int)
    except (ValueError, TypeError):
        return "Error: User Assessment for LLM Proposal must be an integer"
    # Ratings are constrained to the 1-5 scale used by the UI radio buttons.
    if (updated_df[column_rating_name] < 1).any() or (
        updated_df[column_rating_name] > 5
    ).any():
        return "Error: User Assessment for LLM Proposal must be between 1 and 5"
    dataframe_json = updated_df.to_json(orient="records")
    # Build the user payload before opening the connection so a missing
    # "username" key cannot leak an open connection.
    json_user_str = json.dumps({"username": user_state["username"]}, ensure_ascii=False)
    connection_db = sqlite3.connect(db_path)
    try:
        # insert after everything to keep datetime aligned
        db_persistence_insert(
            connection_db=connection_db,
            insert_type="wcag_user_llm_alttext_assessments",
            page_url=url,
            user=json_user_str,
            llm_model="",
            json_in_str=dataframe_json,  # TODO: store a structured payload instead of the raw dataframe dump
            json_out_str="done via UI",
            table="wcag_user_assessments",
        )
    except Exception as e:
        # Surface the failure to the UI instead of claiming success.
        print("Error inserting user assessment into database:", str(e))
        return f"Error saving user assessment: {e}"
    finally:
        if connection_db:
            connection_db.close()
    return "User assessment saved successfully!"
def load_images_from_json(json_input):
"""Extract URLs and alt text from JSON and create HTML gallery"""
try:
@ -40,7 +79,7 @@ def load_images_from_json(json_input):
return "No images found in JSON", ""
images = data["images"]
info_text = f"Found {len(images)} image(s)\n"
info_text = f"Found {len(images)} image(s)"
print(f"Found {len(data['images'])} image(s)")
# Create HTML gallery with checkboxes and assessment forms
@ -58,14 +97,14 @@ def load_images_from_json(json_input):
padding: 10px;
background: white;
}
.image-card:has(input:checked) {
.image-card:has(input[type="checkbox"]:checked) {
border-color: #2196F3;
background: #a7c1c1;
}
.image-card img {
width: 100%;
height: 200px;
object-fit: cover;
object-fit: scale-down;
border-radius: 4px;
}
.image-info {
@ -93,7 +132,7 @@ def load_images_from_json(json_input):
display: none;
margin-top: 15px;
padding: 10px;
background: #f0f7ff;
background: #7896b9;
border-radius: 4px;
border: 1px solid #2196F3;
}
@ -109,18 +148,22 @@ def load_images_from_json(json_input):
margin-bottom: 5px;
font-size: 13px;
}
.range-container {
.radio-container {
display: flex;
gap: 15px;
align-items: center;
}
.radio-option {
display: flex;
align-items: center;
gap: 10px;
gap: 5px;
cursor: pointer;
}
.range-container input[type="range"] {
flex: 1;
}
.range-value {
font-weight: bold;
min-width: 20px;
text-align: center;
.radio-label {
font-weight: 500;
}
textarea {
width: 100%;
@ -166,12 +209,28 @@ def load_images_from_json(json_input):
<div id="panel-{idx}" class="assessment-panel">
<div class="form-group">
<label>Rate current alt-text:</label>
<div class="range-container">
<input type="range" min="1" max="5" value="3"
class="assessment-range" data-index="{idx}"
oninput="document.getElementById('range-value-{idx}').textContent = this.value">
<span id="range-value-{idx}" class="range-value">3</span>
</div>
<div class="radio-container">
<label class="radio-option">
<input type="radio" name="assessment-{idx}" value="1" data-index="{idx}">
<span class="radio-label">1</span>
</label>
<label class="radio-option">
<input type="radio" name="assessment-{idx}" value="2" data-index="{idx}">
<span class="radio-label">2</span>
</label>
<label class="radio-option">
<input type="radio" name="assessment-{idx}" value="3" data-index="{idx}" checked>
<span class="radio-label">3</span>
</label>
<label class="radio-option">
<input type="radio" name="assessment-{idx}" value="4" data-index="{idx}">
<span class="radio-label">4</span>
</label>
<label class="radio-option">
<input type="radio" name="assessment-{idx}" value="5" data-index="{idx}">
<span class="radio-label">5</span>
</label>
</div>
</div>
<div class="form-group">
<label>New alt-text:</label>
@ -226,7 +285,7 @@ def load_llm_assessment_from_json(json_input):
{
"Original Alt Text": alt_text_original,
"LLM Assessment": original_alt_text_assessment,
"Proposed Alt Text": new_alt_text,
"LLM Proposed Alt Text": new_alt_text,
}
)
@ -257,7 +316,7 @@ def make_alttext_llm_assessment_api_call(
if not selected_images or len(selected_images) == 0:
info_text = "No images selected"
print(info_text)
return pd.DataFrame()
return "LLM assessment not started", pd.DataFrame()
# prepare data for insertion
json_in_str = {}
@ -267,6 +326,7 @@ def make_alttext_llm_assessment_api_call(
user_assessments = []
user_new_alt_texts = []
selected_image_id = []
user_assessments_llm_proposal = []
for img in selected_images:
selected_urls.append(img["image_url"])
selected_alt_text_original.append(img["original_alt_text"])
@ -275,6 +335,7 @@ def make_alttext_llm_assessment_api_call(
selected_image_id.append(
int(img["image_index"]) + 1
) # add the id selected (+1 for index alignment)
user_assessments_llm_proposal.append(3) # default value for now
json_in_str["images_urls"] = selected_urls
json_in_str["images_alt_text_original"] = selected_alt_text_original
json_out_str["user_assessments"] = user_assessments
@ -302,9 +363,17 @@ def make_alttext_llm_assessment_api_call(
)
# return response
info_dataframe = load_llm_assessment_from_json(response)
# add the UI ids and other fields to the api response
info_dataframe.insert(
0, "Image #", selected_image_id
) # add the UI ids to the api response
info_dataframe.insert(2, "User Assessment", user_assessments)
info_dataframe.insert(3, "User Proposed Alt Text", user_new_alt_texts)
info_dataframe["User Assessment for LLM Proposal"] = (
user_assessments_llm_proposal
)
except Exception as e:
return {"error": str(e)}
@ -326,7 +395,7 @@ def make_alttext_llm_assessment_api_call(
finally:
if connection_db:
connection_db.close()
return info_dataframe
return "LLM assessment completed", info_dataframe
def make_image_extraction_api_call(
@ -449,9 +518,10 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
images_number = gr.Slider(
5,
100,
value=30,
value=50,
step=5,
label="Max number of images to retrieve",
visible=False,
)
with gr.Column():
@ -459,39 +529,54 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
"Extract Images & Alt Texts", variant="primary"
)
alttext_api_call_btn = gr.Button(
"Alt Text LLM Assessment",
"Start LLM Assessment",
variant="secondary",
interactive=False,
)
image_info_output = gr.Textbox(
label="Activity tracking", lines=1
)
with gr.Row():
image_info_output = gr.Textbox(label="Managed Images", lines=5)
with gr.Row(visible=False) as alttext_results_row:
# Use DataFrame for tabular output
alttext_info_output = gr.DataFrame(
headers=[
"Image #",
"Original Alt Text",
"User Assessment",
"User Proposed Alt Text",
"LLM Assessment",
"Proposed Alt Text",
"LLM Proposed Alt Text",
"User Assessment for LLM Proposal",
],
label="LLM Assessment Results",
wrap=True, # Wrap text in cells
interactive=False,
interactive=True,
scale=7,
)
with gr.Column():
save_user_assessment_btn = gr.Button(
"Save Your Assessment",
variant="secondary",
interactive=True,
scale=1,
)
gr.Markdown(
" Info: to assess the LLM output, only the values for the 'User Assessment for LLM Proposal' column need to be changed."
)
with gr.Row():
gallery_html = gr.HTML(label="Image Gallery")
image_extraction_api_call_btn.click(
fn=lambda: ("", "", pd.DataFrame(), gr.Button(interactive=False)),
fn=lambda: ("", "", gr.update(visible=False), gr.Button(interactive=False)),
inputs=[],
outputs=[
image_info_output,
gallery_html,
alttext_info_output,
alttext_results_row,
alttext_api_call_btn,
],
).then(
@ -515,7 +600,7 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
wcag_rest_server_url_state,
user_state,
],
outputs=[alttext_info_output],
outputs=[image_info_output, alttext_info_output],
js="""
(url_input,gallery_html) => {
const checkboxes = document.querySelectorAll('.image-checkbox:checked');
@ -533,7 +618,8 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
const index = checkbox.dataset.index;
const imageUrl = checkbox.dataset.imgurl;
const originalAlt = document.querySelector('.original-alt[data-index="' + index + '"]').value;
const assessment = document.querySelector('.assessment-range[data-index="' + index + '"]').value;
const assessment = document.querySelector('input[name="assessment-' + index + '"]:checked').value;
console.log("assessment:",assessment)
const newAltText = document.querySelector('.new-alt-text[data-index="' + index + '"]').value;
selectedData.push({
@ -548,6 +634,16 @@ with gr.Blocks(theme=gr.themes.Glass(), title="WCAG AI Validator") as demo:
return [url_input,JSON.stringify(selectedData)];
}
""",
).then(
fn=lambda: gr.update(visible=True),
inputs=[],
outputs=[alttext_results_row],
)
save_user_assessment_btn.click(
fn=process_dataframe,
inputs=[db_path_state, url_input, alttext_info_output, user_state],
outputs=[image_info_output],
)
# placed here at the end to give full contents visibility to events

View File

@ -55,7 +55,6 @@ class ImageExtractor:
# Also check query parameters (e.g., format=jpeg)
return any(fmt in img_url.lower() for fmt in self.SUPPORTED_FORMATS)
async def _download_image(self, image_url, output_dir="images") -> None:
# Parse the URL to get the path without query parameters
@ -79,7 +78,7 @@ class ImageExtractor:
# Sanitize image name (remove special characters, limit length)
image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
image_name = image_name[:200] # Limit filename length
image_name = image_name[:50] # Limit filename length
# If name is empty after sanitization, create a hash-based name
if not image_name:
@ -88,13 +87,15 @@ class ImageExtractor:
image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
# Download the image
print("getting image:", image_url)
print("getting image url:", image_url)
print("getting image name:", image_name)
response = requests.get(image_url, timeout=10)
response.raise_for_status()
try:
# Save the image
output_path = os.path.join(output_dir, f"{image_name}.{ext}")
print("saving image to:", output_path)
with open(output_path, "wb") as f:
f.write(response.content)
print(f"Saved: {output_path}")
@ -292,43 +293,36 @@ class ImageExtractor:
error_msg = f"Error extracting context: {str(e)}"
return error_msg, error_msg, error_msg
async def _get_page_metadata(self, page) -> Dict[str, Optional[str]]:
"""Extract page metadata including title, description, and keywords."""
metadata = {
"title": await page.title(),
"description": None,
"keywords": None,
"headings": [],
}
async def _get_page_metadata(self, page):
"""Extract page metadata in one fast evaluate call. Batch DOM extraction inside one evaluate()."""
return await page.evaluate(
"""
() => {
const metadata = {
title: document.title || null,
description: null,
keywords: null,
headings: []
};
# Extract meta description
try:
description = await page.locator('meta[name="description"]').get_attribute(
"content"
)
metadata["description"] = description
except:
pass
const desc = document.querySelector('meta[name="description"]');
const keys = document.querySelector('meta[name="keywords"]');
metadata.description = desc?.content || null;
metadata.keywords = keys?.content || null;
# Extract meta keywords
try:
keywords = await page.locator('meta[name="keywords"]').get_attribute(
"content"
)
metadata["keywords"] = keywords
except:
pass
// Collect all headings h1h6
const allHeadings = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
metadata.headings = Array.from(allHeadings)
.map(h => ({
level: parseInt(h.tagName.substring(1), 10),
text: h.textContent.trim()
}))
.filter(h => h.text.length > 0);
# Extract all headings (h1-h6)
for level in range(1, 7):
headings = await page.locator(f"h{level}").all_text_contents()
for heading in headings:
if heading.strip():
metadata["headings"].append(
{"level": level, "text": heading.strip()}
)
return metadata
return metadata;
}
"""
)
async def extract_images(
self, extract_context=True, specific_images_urls=[]
@ -344,15 +338,18 @@ class ImageExtractor:
page = await browser.new_page()
try:
#await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads
# The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
# await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads
# The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior.
# ---alternative method2: use if there is total awareness of the page's loading pattern and want faster, more reliable execution
await page.goto(self.url, timeout=50000, wait_until="load")# deafult timeout=30000, 30sec
await page.goto(
self.url, timeout=50000, wait_until="load"
) # default timeout=30000 (30 sec)
# Wait for page to load completely
await page.wait_for_timeout(2000) # Wait for dynamic content
# -----
if extract_context:
print("Getting page metadata...")
# Get page metadata once
page_metadata = await self._get_page_metadata(page)
page_title = page_metadata["title"]
@ -367,15 +364,41 @@ class ImageExtractor:
if len(specific_images_urls) == 0:
# Find all img elements
print("Extracting all images from the page",self.url)
img_elements = await page.locator("img").all()
print("Extracting all images from the page", self.url)
# img_elements = await page.locator("img").all()
else:
print(
"Extracting specific images from the page:",
self.url,
specific_images_urls,
)
img_elements = []
# img_elements = await page.locator("img").all()
""" # method 3: optimized approach
# Get all src attributes in one go
all_img_elements = await page.locator("img").all()
all_srcs = await page.locator("img").evaluate_all(
"elements => elements.map(el => el.src || '')"
)
# Filter with the pre-fetched src values
img_elements = [
elem for elem, src in zip(all_img_elements, all_srcs)
if src in specific_images_urls
]
"""
""" #method 2: single pass to find matching images
for img_element in all_img_elements: #This is more efficient than making separate locator queries for each specific URL and avoids timeout issues.
try:
src = await img_element.get_attribute("src")
print("found image src:", src)
if src in specific_images_urls:
img_elements.append(img_element)
except Exception as e:
print(f"Error getting src attribute from image: {str(e)}")"""
""" # method 1: separate locator queries for each specific URL
for url in specific_images_urls:
try:
img_element = await page.locator(
@ -384,8 +407,11 @@ class ImageExtractor:
if img_element:
img_elements.append(img_element)
except Exception as e:
print(f"Error locating image with src {url}: {str(e)}")
print(f"Error locating image with src {url}: {str(e)}")"""
img_elements = await page.locator(
"img"
).all() # unified approach to start with all images and filter later
image_source_list = [] # avoid multiple check for the same image url
images_data = []
@ -404,6 +430,12 @@ class ImageExtractor:
if not src:
print("image has no src attribute. Skipped.")
continue
if (
src not in specific_images_urls
and len(specific_images_urls) > 0
):
# print("image src",src,"not in the specific images list. Skipped.")
continue
if src not in image_source_list:
image_source_list.append(src)
@ -434,6 +466,7 @@ class ImageExtractor:
alt_text = await img.get_attribute("alt") or ""
if extract_context:
print("Extracting context for image:", img_url)
# Get surrounding HTML context (full, immediate, and nearby)
html_context, immediate_context, nearby_text = (
await self._get_element_context(page, img)

View File

@ -76,39 +76,7 @@ class MLLMManager:
return payload
def get_alt_text_system_prompt(self):
system_prompt_old = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
the same information as the image, and should be able to substitute for the non-text content. The text alternative would
be brief but as informative as possible.
Follow these instructions carefully:
1. You will be provided as input with the following:
- The image found on the webpage.
- The associated alternative text. When the alt-text is empty or absent, you will be explicitly informed.
- The surrounding context of the image.
- The page title, headings and the content of the keywords and description <meta> tag, if found.
2. Determine the function and purpose of the image by analyzing these elements. Take into account the purpose and function
of the associated image by considering the page context. Check also if the image is, or is associated with, a link or a button,
and consider this in your judgement. If the image contains text use that as part of the context.
3. Provide a final assessment based on the following:
- 'success' if you can assess with 'sufficient certainty' the alt-text is appropriate in relation to the image purpose,
- 'failure' if you can assess with 'sufficient certainty' that the alt-text is NOT appropriate,
- 'warning' if you cannot determine with 'sufficient certainty'.
where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80
4. The original alt-text assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only.
5. Provide a brief reasoning for your judgment. If the image contains text, write it verbatim. Your response should be in English.
6. Keep your response within 150 words.
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
8. Here is the JSON format the results must have:
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
# https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
@ -122,7 +90,7 @@ class MLLMManager:
What purpose does it fulfill?
If I could not use the image content, what words would I use to convey the same function and/or information?
When image content contains words that are important to understanding the content, the alt text should include those words
When image content contains words that are important to understanding the content, the alt text should include those words.
Follow these instructions carefully:
1. You will be provided as input with the following:
@ -147,7 +115,7 @@ class MLLMManager:
6. Keep your response within 150 words.
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words.
7. Generate the new most appropriate alt-text given the context and the steps before. Keep this within 30 words. Use the same language as the original alt-text.
8. Here is the JSON format the results must have:
{"Original alt-text assessment" : "*your original alt-text assessment*", "Assessment" : "*your assessment*", "EvaluationResult": "*your response*", "New alt-text":"*new alt-text*"}"""
@ -181,7 +149,7 @@ class MLLMManager:
print("Using end_point:", self.end_point)
alt_text_system_prompt = self.get_alt_text_system_prompt()
print("alt_text_system_prompt:", alt_text_system_prompt)
#print("alt_text_system_prompt:", alt_text_system_prompt)
mllm_responses = []
for img_info in images:

View File

@ -46,6 +46,7 @@ class ExtractImagesRoutes:
self, request: Request, data: ExtractImages
) -> JSONResponse:
"""Return the alt text validation assessment based on WCAG guidelines"""
print("Received extract images request.")
try:
json_content = json.loads(data.model_dump_json())

View File

@ -53,6 +53,7 @@ class WCAGAltTextValuationRoutes:
) -> JSONResponse:
"""Return the alt text validation assessment based on WCAG guidelines"""
try:
print("Received wcag alttext validation request.")
json_content = json.loads(data.model_dump_json())
mllm_model_id = self.mllm_settings["mllm_model_id"]
@ -67,7 +68,12 @@ class WCAGAltTextValuationRoutes:
.replace(":", "")
.replace("//", "_")
.replace("/", "_")
.replace("%2", "_")
.replace("?", "_")
.replace("=", "_")
.replace("&", "_")
)
url_path=url_path[:50] # limit length
now = datetime.now(timezone.utc)
now_str = now.strftime("%Y_%m_%d-%H_%M_%S")
folder_str = mllm_model_id.replace(":", "-") + "_" + now_str
@ -93,7 +99,7 @@ class WCAGAltTextValuationRoutes:
# Extract images
logging.info(f"Extracting images from: {json_content['page_url']}")
images = await image_extractor.extract_images(
specific_images_urls=json_content["specific_images_urls"]
specific_images_urls=json_content["specific_images_urls"],extract_context=True
)
# MLLM settings
mllm_end_point = self.mllm_settings["mllm_end_point"]

View File

@ -0,0 +1,541 @@
# to launch: python build_dataset_from_folder.py --ref_path "" --push_to_hub --repo_id "nicolaleo/LLM-alt-text-assessment" --token ""
from datasets import Dataset, DatasetDict
import datasets
import json
from pathlib import Path
from PIL import Image
import hashlib
import urllib.parse
import argparse
'''
# Dataset metadata
_DESCRIPTION = """\
Dataset for image alt-text assessment and improvement using MLLM responses.
Contains images, original alt-texts, quality assessments, and improved versions.
"""
_CITATION = """\
@misc{alt_text_assessment,
title={Alt-Text Assessment Dataset},
year={2024}
}
"""
class AltTextDataset(datasets.GeneratorBasedBuilder):
"""Dataset for alt-text assessment with images and MLLM responses."""
VERSION = datasets.Version("1.0.0")
def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features({
"image": datasets.Image(),
"image_url": datasets.Value("string"),
"alt_text": datasets.Value("string"),
"original_alt_text_assessment": datasets.Value("string"),
"assessment": datasets.Value("string"),
"evaluation_result": datasets.Value("string"),
"new_alt_text": datasets.Value("string"),
#"source_folder": datasets.Value("string"),
}),
citation=_CITATION,
)
def _split_generators(self, dl_manager):
"""Define data splits."""
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"json_filepath": "data.json",
"images_dir": "images"
},
),
]
def _generate_examples(self, json_filepath, images_dir):
"""Generate examples from JSON file and image directory."""
with open(json_filepath, encoding="utf-8") as f:
data = json.load(f)
images_path = Path(images_dir)
for idx, entry in enumerate(data):
image_url = entry["image_url"]
image_filename = url_to_filename(image_url)
image_path = images_path / image_filename
# Load image if exists, otherwise None
image = str(image_path) if image_path.exists() else None
yield idx, {
"image": image,
"image_url": image_url,
"alt_text": entry["alt_text"],
"original_alt_text_assessment": entry["mllm_response"]["original_alt_text_assessment"],
"assessment": entry["mllm_response"]["assessment"],
"evaluation_result": entry["mllm_response"]["evaluation_result"],
"new_alt_text": entry["mllm_response"]["new_alt_text"],
}
'''
# ============================================================================
# SIMPLE USAGE FUNCTIONS
# ============================================================================
def url_to_filename(image_url):  # mirrors the save step in the image_extractor dependency
    """Convert an image URL to the sanitized filename used on disk.

    Args:
        image_url: The image URL.

    Returns:
        Sanitized filename with extension (e.g. ``"photo.jpg"``).
    """
    # Parse the URL to get the path without query parameters
    parsed_url = urllib.parse.urlparse(image_url)
    url_path = parsed_url.path
    # Get the filename from the last path segment
    filename = url_path.split("/")[-1]
    # Fixed debug print: the original f-string had no placeholder and never
    # showed the actual filename.
    print(f"Original filename: '{filename}'")
    # Split filename and extension
    if "." in filename:
        image_name, ext = filename.rsplit(".", 1)
        ext = ext.lower()
    else:
        image_name = filename
        ext = "jpg"
    # Validate extension; anything unsupported falls back to jpg
    if ext not in ["jpg", "jpeg", "png", "gif", "webp"]:
        ext = "jpg"
    # Sanitize image name (remove special characters, limit length)
    image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
    image_name = image_name[:50]  # Limit filename length
    # If name is empty after sanitization, create a hash-based name
    if not image_name:
        image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]
    return f"{image_name}.{ext}"
def push_to_hub_example(dataset_path="alt_text_merged_dataset", repo_id="", token=None):
    """Upload a locally saved dataset to the Hugging Face Hub.

    Authentication is required: either pass *token*, call ``login()``
    interactively, or export the ``HF_TOKEN`` environment variable.

    Args:
        dataset_path: Directory of a dataset previously saved to disk.
        repo_id: Target Hub repository id (e.g. ``"user/name"``).
        token: Hugging Face access token forwarded to ``login``.
    """
    from huggingface_hub import login

    print("\n=== Pushing Dataset to Hugging Face Hub ===")
    # Authenticate with the supplied token (alternatives: interactive
    # login() prompt, or the HF_TOKEN environment variable).
    login(token=token)

    # Reload the saved dataset and expose it as a single "train" split;
    # a "test" split could be added to the mapping later.
    dataset_dict = DatasetDict({"train": load_dataset_from_disk(dataset_path)})

    # push_to_hub converts to Parquet automatically and creates the repo
    # if it does not exist yet.
    dataset_dict.push_to_hub(
        repo_id,
        private=False,  # set True for a private dataset
    )
    print("Dataset pushed successfully!")
    print(f"View at: https://huggingface.co/datasets/{repo_id}")
def create_dataset_from_json(json_filepath, json_filepath_images, images_dir="images"):
    """
    Create a Hugging Face Dataset from JSON files with local images.

    Entries whose MLLM response has no ``original_alt_text_assessment``
    are skipped (they carry no usable assessment data).

    Args:
        json_filepath: Path to the MLLM assessments JSON file
        json_filepath_images: Path to the extracted-images JSON file
            (provides ``page_url`` and ``html_context`` per entry)
        images_dir: Directory containing the images (default: "images")

    Returns:
        datasets.Dataset object with images loaded (``None`` where the
        image file is missing on disk)
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)
    with open(json_filepath_images, "r", encoding="utf-8") as f:
        data_images = json.load(f)
    images_path = Path(images_dir)
    # Flatten the nested structure and load images
    flattened_data = {
        "image": [],
        "image_url": [],
        "alt_text": [],
        "original_alt_text_assessment": [],
        "assessment": [],
        "evaluation_result": [],
        "new_alt_text": [],
        "page_url": [],
        "html_context": [],
    }
    # NOTE(review): count_entry indexes data_images in lockstep with data —
    # assumes both JSON files are parallel, index-aligned lists. TODO confirm.
    count_entry = 0
    for entry in data:
        if (
            entry["mllm_response"]["original_alt_text_assessment"] is None
        ):  # important! skip entries with no MLLM response. not usable data
            print(
                f"Skipping entry with image URL: {entry['image_url']} due to missing MLLM response"
            )
            # Still advance the index so data_images alignment is preserved.
            count_entry += 1
            continue  # Skip entries with no MLLM response
        image_url = entry["image_url"]
        # Map the URL to the on-disk filename produced at download time.
        image_filename = url_to_filename(image_url)
        image_path = images_path / image_filename
        # Load image if it exists; keep a None placeholder otherwise so all
        # columns stay the same length.
        if image_path.exists():
            img = Image.open(image_path)
            flattened_data["image"].append(img)
        else:
            print(f"Warning: Image not found: {image_path}")
            flattened_data["image"].append(None)
        flattened_data["image_url"].append(image_url)
        flattened_data["alt_text"].append(entry["alt_text"])
        # Cast to str: the assessment may arrive as an int from the MLLM.
        flattened_data["original_alt_text_assessment"].append(
            str(entry["mllm_response"]["original_alt_text_assessment"])
        )
        flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
        flattened_data["evaluation_result"].append(
            entry["mllm_response"]["evaluation_result"]
        )
        flattened_data["new_alt_text"].append(entry["mllm_response"]["new_alt_text"])
        # Page-level context comes from the parallel extracted-images file.
        flattened_data["page_url"].append(data_images[count_entry]["page_url"])
        flattened_data["html_context"].append(data_images[count_entry]["html_context"])
        count_entry += 1
    print(f"Total valid entries loaded: {len(flattened_data['image_url'])}")
    return datasets.Dataset.from_dict(flattened_data)
def create_dataset_from_folders(
    ref_path,
    json_filename="mllm_alttext_assessments.json",
    json_filename_images="extracted_images.json",
    images_dirname="images",
):
    """
    Build one merged dataset from every usable folder under *ref_path*.

    A folder is usable when it contains both JSON files; the images
    subdirectory is optional (entries may reference remote URLs only).

    Args:
        ref_path: Root path whose immediate subfolders are scanned
        json_filename: Per-folder MLLM assessments JSON file name
            (default: "mllm_alttext_assessments.json")
        json_filename_images: Per-folder extracted-images JSON file name
            (default: "extracted_images.json")
        images_dirname: Per-folder images subdirectory name (default: "images")

    Returns:
        datasets.Dataset object with all folder entries merged

    Raises:
        ValueError: If no folder under *ref_path* could be processed.
    """
    root = Path(ref_path)
    collected = []
    folders_processed = 0

    for folder in root.iterdir():
        if not folder.is_dir():
            continue

        json_path = folder / json_filename
        json_path_images = folder / json_filename_images
        images_path = folder / images_dirname

        # Both JSON files are mandatory for a folder to be considered.
        if not json_path.exists():
            print(f"Skipping {folder.name}: no {json_filename} found")
            continue
        if not json_path_images.exists():
            print(f"Skipping {folder.name}: no {json_filename_images} found")
            continue
        if not images_path.exists():
            # Tolerated: images might be optional (entries from urls only).
            print(f"Warning: {folder.name}: images directory not found")

        print(f"Processing folder: {folder.name}")
        try:
            # Build the per-folder dataset and add it to the merge list.
            ds = create_dataset_from_json(
                str(json_path), str(json_path_images), str(images_path)
            )
        except Exception as e:
            print(f"Error processing {folder.name}: {e}")
            continue
        collected.append(ds)
        folders_processed += 1
        print(f" -> Loaded {len(ds)} entries")

    if not collected:
        raise ValueError(f"No valid folders found in {ref_path}")

    # Merge all per-folder datasets into one.
    print(f"\n=== Merging {folders_processed} folders ===")
    merged_dataset = datasets.concatenate_datasets(collected)
    print(f"Total entries: {len(merged_dataset)}")
    return merged_dataset
def verify_images(json_filepath, images_dir="images"):
    """
    Check that every image referenced in the JSON file exists on disk.

    Args:
        json_filepath: Path to the JSON file listing image entries
        images_dir: Directory expected to contain the downloaded images

    Returns:
        Dict with 'found'/'missing'/'total' counts and per-image 'details'
        for both the located and the absent files
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        entries = json.load(f)

    base_dir = Path(images_dir)
    located = []
    absent = []

    for record in entries:
        url = record["image_url"]
        # Recompute the on-disk name the downloader would have used.
        fname = url_to_filename(url)
        full_path = base_dir / fname
        print(
            "image_url:",
            url,
            "image_filename:",
            fname,
            "image_path:",
            full_path,
        )
        if full_path.exists():
            located.append(
                {"url": url, "filename": fname, "path": str(full_path)}
            )
        else:
            absent.append(
                {
                    "url": url,
                    "filename": fname,
                    "expected_path": str(full_path),
                }
            )

    return {
        "found": len(located),
        "missing": len(absent),
        "total": len(entries),
        "details": {"found_images": located, "missing_images": absent},
    }
def verify_images_in_folders(
    ref_path, json_filename="mllm_alttext_assessments.json", images_dirname="images"
):
    """
    Run verify_images over every subfolder of ref_path and aggregate counts.

    Args:
        ref_path: Root path containing multiple folders.
        json_filename: Name of the JSON file expected inside each folder.
        images_dirname: Name of the images subdirectory inside each folder.

    Returns:
        Dict with aggregated 'found'/'missing'/'total' counts and a
        'folders' mapping of per-folder verification results.
    """
    root = Path(ref_path)
    total_found = total_missing = total_entries = 0
    per_folder = {}
    for entry in root.iterdir():
        # Only inspect directories that actually carry the expected JSON.
        if not entry.is_dir():
            continue
        json_file = entry / json_filename
        images_dir = entry / images_dirname
        if not json_file.exists():
            continue
        print(f"Verifying folder: {entry.name}")
        try:
            result = verify_images(str(json_file), str(images_dir))
        except Exception as e:
            # Best-effort: report the failure and keep scanning other folders.
            print(f"  Error: {e}")
            continue
        per_folder[entry.name] = result
        total_found += result["found"]
        total_missing += result["missing"]
        total_entries += result["total"]
        print(f"  Found: {result['found']}/{result['total']}")
    return {
        "found": total_found,
        "missing": total_missing,
        "total": total_entries,
        "folders": per_folder,
    }
def save_dataset(dataset, output_path):
    """Persist *dataset* to *output_path* in Arrow format (images included)."""
    dataset.save_to_disk(output_path)
    # Alternative export formats, kept for reference:
    # dataset.to_json(f"{output_path}/data.json")
    # dataset.to_csv(f"{output_path}/data.csv")
    # dataset.to_parquet(f"{output_path}/data.parquet")
def load_dataset_from_disk(dataset_path):
    """Reload a dataset previously written with save_dataset."""
    loaded = datasets.load_from_disk(dataset_path)
    return loaded
# ============================================================================
# EXAMPLE USAGE
# ============================================================================
# CLI driver: verifies downloaded images, builds/merges the dataset,
# inspects it, saves/reloads it, and optionally pushes it to the Hub.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ref_path",
        type=str,
        help=("Root path containing multiple folders"),
        default="",
    )
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        default=False,
        help=("If True push the merged dataset to Hugging Face Hub"),
    )
    parser.add_argument(
        "--token",
        type=str,
        help=("Hugging Face authentication token"),
        default="",
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        help=("Hugging Face repository ID"),
        default="nicolaleo/LLM-alt-text-assessment",
    )
    args = parser.parse_args()
    # Example 1: Verify images across all folders
    print("=== Verifying Images in All Folders ===")
    verification = verify_images_in_folders(args.ref_path)
    print("\n######## Verifier output ################################")
    print(f"Total Found: {verification['found']}/{verification['total']}")
    print(f"Total Missing: {verification['missing']}/{verification['total']}")
    print("########################################")
    # Show per-folder breakdown
    print("\n=== Per-Folder Breakdown ===")
    for folder_name, results in verification["folders"].items():
        print(f"{folder_name}: {results['found']}/{results['total']} images found")
    # Example 2: Create merged dataset from all folders
    print("\n=== Creating Merged Dataset ===")
    ds = create_dataset_from_folders(args.ref_path)
    print("\n######## Merged Dataset output ################################")
    print(f"Final dataset size: {len(ds)} entries")
    print("########################################")
    # Example 3: Analyze the merged dataset
    print("\n=== Dataset Analysis ===")
    print(ds)
    # Example 4: Access images and data from the first entry
    print("\n=== First Example ===")
    first_example = ds[0]
    print(f"Image URL: {first_example['image_url']}")
    print(f"Alt text: {first_example['alt_text']}")
    print(f"Assessment: {first_example['assessment']}")
    print(f"New alt text: {first_example['new_alt_text']}")
    print(f"Image loaded: {first_example['image'] is not None}")
    if first_example["image"] is not None:
        img = first_example["image"]
        print(f"Image size: {img.size}")
        # img.show() # Uncomment to display image
    # Example 5: Filter and work with merged data
    print("\n=== Filtering Merged Dataset ===")
    successful = ds.filter(lambda x: x["assessment"] == "success")
    print(f"Successful assessments: {len(successful)}")
    # NOTE(review): assumes 'original_alt_text_assessment' is always a
    # numeric string — int() raises if a record lacks a parsable value.
    high_rated = ds.filter(lambda x: int(x["original_alt_text_assessment"]) >= 4)
    print(f"High-rated (>=4): {len(high_rated)}")
    # Example 6: Save merged dataset (Arrow format, images included)
    print("\n=== Saving Merged Dataset ===")
    save_dataset(ds, "alt_text_merged_dataset")
    # Example 7: Load dataset back from disk
    print("\n=== Loading Dataset ===")
    loaded_ds = load_dataset_from_disk("alt_text_merged_dataset")
    print(f"Loaded {len(loaded_ds)} entries")
    if args.push_to_hub:
        # Push to Hugging Face Hub (optional)
        push_to_hub_example(repo_id=args.repo_id, token=args.token)  # see push_to_hub_example elsewhere in this module

View File

@ -5,4 +5,5 @@ transformers==4.57.1
numpy==2.2.6
matplotlib==3.10.7
scikit-learn==1.7.2
sentence-transformers==5.1.2
sentence-transformers==5.1.2
datasets==4.4.1