import asyncio from playwright.async_api import async_playwright from datetime import datetime, timezone from urllib.parse import urljoin, urlparse from typing import List, Dict, Optional import json import argparse from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder import requests import os import urllib.parse from pathlib import Path class ImageExtractor: SUPPORTED_FORMATS = {"png", "jpeg", "jpg", "webp", "gif"} def __init__( self, url: str, context_levels: int = 5, pixel_distance_threshold: int = 200, number_of_images: int = 10, save_images=True, save_images_path="", ): """ Initialize the ImageExtractor. Args: url: The page URL to extract images from context_levels: Number of parent/child levels to traverse for context (default=5) pixel_distance_threshold: Maximum pixel distance for nearby text elements (default=200) number_of_images: maximum number for the desired images save_images: if save images save_images_path: path to save images """ self.url = url self.context_levels = context_levels self.pixel_distance_threshold = pixel_distance_threshold self.number_of_images = number_of_images self.save_images = save_images self.save_images_path = save_images_path def _is_supported_format(self, img_url: str) -> bool: """Check if the image URL has a supported format.""" parsed = urlparse(img_url.lower()) path = parsed.path # Check file extension for fmt in self.SUPPORTED_FORMATS: if path.endswith(f".{fmt}"): return True # Also check query parameters (e.g., format=jpeg) return any(fmt in img_url.lower() for fmt in self.SUPPORTED_FORMATS) async def _download_image(self, image_url, output_dir="images") -> None: # Parse the URL to get the path without query parameters parsed_url = urllib.parse.urlparse(image_url) url_path = parsed_url.path # Get the filename from the path filename = url_path.split("/")[-1] # Split filename and extension if "." in filename: image_name, ext = filename.rsplit(".", 1) ext = ext.lower() else: image_name = filename ext = "jpg" # Validate extension if ext not in ["jpg", "jpeg", "png", "gif", "webp"]: ext = "jpg" # Sanitize image name (remove special characters, limit length) image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_")) image_name = image_name[:200] # Limit filename length # If name is empty after sanitization, create a hash-based name if not image_name: import hashlib image_name = hashlib.md5(image_url.encode()).hexdigest()[:16] # Download the image print("getting image:", image_url) response = requests.get(image_url, timeout=10) response.raise_for_status() try: # Save the image output_path = os.path.join(output_dir, f"{image_name}.{ext}") with open(output_path, "wb") as f: f.write(response.content) print(f"Saved: {output_path}") except Exception as e: print(f"Error saving image {image_url}: {e}") async def save_elaboration(self, images, output_dir) -> None: with open(output_dir, "w", encoding="utf-8") as f: json.dump(images, f, indent=2, ensure_ascii=False) print("\nResults saved to extracted_images.json") async def _get_element_context(self, page, img_element) -> tuple[str, str, str]: """ Extract textual context around an image element from text-containing tags. Returns: Tuple of (full_context, immediate_context, nearby_text) where: - full_context: Text extracted with self.context_levels - immediate_context: Text extracted with context_level=1 - nearby_text: Text within pixel_distance_threshold pixels of the image """ try: # JavaScript function to check if element is visible """ Visibility Checks : visibility CSS property - Excludes elements with visibility: hidden or visibility: collapse display CSS property - Excludes elements with display: none opacity CSS property - Excludes elements with opacity: 0 Element dimensions - Excludes elements with zero width or height (collapsed elements) """ visibility_check = """ function isVisible(el) { if (!el) return false; const style = window.getComputedStyle(el); // Check visibility and display properties if (style.visibility === 'hidden' || style.visibility === 'collapse') return false; if (style.display === 'none') return false; if (style.opacity === '0') return false; // Check if element has dimensions const rect = el.getBoundingClientRect(); if (rect.width === 0 || rect.height === 0) return false; return true; } """ # JavaScript function to extract text at a specific context level def get_context_js(levels): return f""" (element) => {{ {visibility_check} // Text-containing tags to extract /*const textTags = ['p', 'span', 'div', 'a', 'li', 'td', 'th', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'label', 'figcaption', 'caption', 'blockquote', 'pre', 'code', 'em', 'strong', 'b', 'i', 'u', 'small', 'mark', 'sub', 'sup', 'time', 'article', 'section'];*/ const textTags = ['p', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'a']; let textContent = []; // Traverse up the DOM tree let current = element; for (let i = 0; i < {levels} && current.parentElement; i++) {{ current = current.parentElement; }} // Function to extract text from an element and its children function extractText(el, depth = 0) {{ if (depth > {levels}) return; // Skip if element is not visible if (!isVisible(el)) return; // Get direct text content of text-containing elements if (textTags.includes(el.tagName.toLowerCase())) {{ const text = el.textContent.trim(); if (text && text.length > 0) {{ textContent.push({{ tag: el.tagName.toLowerCase(), text: text }}); }} }} // Recursively process children for (let child of el.children) {{ extractText(child, depth + 1); }} }} // Extract text from the context root extractText(current); // Format as readable text //return textContent.map(item => `<${{item.tag}}>: ${{item.text}}`).join('\\n\\n'); return textContent.map(item => `<${{item.tag}}>: ${{item.text}}`).join(' '); }} """ # JavaScript function to extract nearby text based on pixel distance nearby_text_js = f""" (element) => {{ {visibility_check} /*const textTags = ['p', 'span', 'div', 'a', 'li', 'td', 'th', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'label', 'figcaption', 'caption', 'blockquote', 'pre', 'code', 'em', 'strong', 'b', 'i', 'u', 'small', 'mark', 'sub', 'sup', 'time'];*/ const textTags = ['p', 'span', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'a']; const threshold = {self.pixel_distance_threshold}; const imgRect = element.getBoundingClientRect(); const imgCenterX = imgRect.left + imgRect.width / 2; const imgCenterY = imgRect.top + imgRect.height / 2; // Calculate distance between two rectangles. function getDistance(rect1, rect2) {{ // Get centers const x1 = rect1.left + rect1.width / 2; const y1 = rect1.top + rect1.height / 2; const x2 = rect2.left + rect2.width / 2; const y2 = rect2.top + rect2.height / 2; // Euclidean distance return Math.sqrt(Math.pow(x2 - x1, 2) + Math.pow(y2 - y1, 2)); //This can be changed considering not only the distance between the centers but maybe the nearest points }} let nearbyElements = []; // Find all text elements on the page const allElements = document.querySelectorAll(textTags.join(',')); allElements.forEach(el => {{ // Skip if element is not visible if (!isVisible(el)) return; const text = el.textContent.trim(); if (!text || text.length === 0) return; // Skip if it's the image itself or contains the image if (el === element || el.contains(element)) return; const elRect = el.getBoundingClientRect(); const distance = getDistance(imgRect, elRect); if (distance <= threshold) {{ nearbyElements.push({{ tag: el.tagName.toLowerCase(), text: text, distance: Math.round(distance) }}); }} }}); // Sort by distance nearbyElements.sort((a, b) => a.distance - b.distance); // Format output //return nearbyElements.map(item => // `<${{item.tag}}> [${{item.distance}}px]: ${{item.text}}` //).join('\\n\\n'); return nearbyElements.map(item => `<${{item.tag}}> [${{item.distance}}px]: ${{item.text}}` ).join(' '); }} """ # Get full context with self.context_levels full_context_js = get_context_js(self.context_levels) full_context = await img_element.evaluate(full_context_js) full_context = full_context if full_context else "No textual context found" # Get immediate context with level=1 immediate_context_js = get_context_js(1) immediate_context = await img_element.evaluate(immediate_context_js) immediate_context = ( immediate_context if immediate_context else "No immediate context found" ) # Get nearby text based on pixel distance nearby_text = await img_element.evaluate(nearby_text_js) nearby_text = nearby_text if nearby_text else "No nearby text found" return full_context, immediate_context, nearby_text except Exception as e: error_msg = f"Error extracting context: {str(e)}" return error_msg, error_msg, error_msg async def _get_page_metadata(self, page) -> Dict[str, Optional[str]]: """Extract page metadata including title, description, and keywords.""" metadata = { "title": await page.title(), "description": None, "keywords": None, "headings": [], } # Extract meta description try: description = await page.locator('meta[name="description"]').get_attribute( "content" ) metadata["description"] = description except: pass # Extract meta keywords try: keywords = await page.locator('meta[name="keywords"]').get_attribute( "content" ) metadata["keywords"] = keywords except: pass # Extract all headings (h1-h6) for level in range(1, 7): headings = await page.locator(f"h{level}").all_text_contents() for heading in headings: if heading.strip(): metadata["headings"].append( {"level": level, "text": heading.strip()} ) return metadata async def extract_images( self, extract_context=True, specific_images_urls=[] ) -> List[Dict]: """ Extract all images from the page with their metadata and context. Returns: List of dictionaries containing image information """ async with async_playwright() as p: browser = await p.chromium.launch(headless=True) page = await browser.new_page() try: #await page.goto(self.url, wait_until="networkidle") # method 1: use if the page has unpredictable async content and there is the need to ensure everything loads # The "networkidle" approach is generally more robust but slower, while the fixed timeout is faster but less adaptive to actual page behavior. # ---alternative method2: use if there is total awareness of the page's loading pattern and want faster, more reliable execution await page.goto(self.url, timeout=50000, wait_until="load")# deafult timeout=30000, 30sec # Wait for page to load completely await page.wait_for_timeout(2000) # Wait for dynamic content # ----- if extract_context: # Get page metadata once page_metadata = await self._get_page_metadata(page) page_title = page_metadata["title"] page_description = page_metadata["description"] page_keywords = page_metadata["keywords"] page_headings = page_metadata["headings"] else: page_title = "" page_description = "" page_keywords = "" page_headings = [] if len(specific_images_urls) == 0: # Find all img elements print("Extracting all images from the page",self.url) img_elements = await page.locator("img").all() else: print( "Extracting specific images from the page:", self.url, specific_images_urls, ) img_elements = [] for url in specific_images_urls: try: img_element = await page.locator( f'img[src="{url}"]' ).first.element_handle(timeout=0) # Use first() to get only the first match; 0 timeout=No timeout if img_element: img_elements.append(img_element) except Exception as e: print(f"Error locating image with src {url}: {str(e)}") image_source_list = [] # avoid multiple check for the same image url images_data = [] for img in img_elements: if ( len(images_data) >= self.number_of_images ): # limits the effective image list based on the ini param. print( "Reached the maximum number of images to extract.", self.number_of_images, ) break try: # Get image src src = await img.get_attribute("src") if not src: print("image has no src attribute. Skipped.") continue if src not in image_source_list: image_source_list.append(src) else: print("image src", src, "already processed. Skipped.") continue # Convert relative URLs to absolute img_url = urljoin(self.url, src) # Verify format if not self._is_supported_format(img_url): print( "image format not supported for url:", img_url, ". Skipped.", ) continue if disclaim_bool_string(self.save_images) == True: print("save image:", img_url.split("/")[-1]) await self._download_image( image_url=img_url, output_dir=self.save_images_path ) # Get alt text alt_text = await img.get_attribute("alt") or "" if extract_context: # Get surrounding HTML context (full, immediate, and nearby) html_context, immediate_context, nearby_text = ( await self._get_element_context(page, img) ) else: html_context, immediate_context, nearby_text = "", "", "" # Compile image data image_info = { "url": img_url, "alt_text": alt_text, "html_context": html_context, "immediate_context": immediate_context, "nearby_text": nearby_text, "page_url": self.url, "page_title": page_title, "page_description": page_description, "page_keywords": page_keywords, "page_headings": page_headings, } images_data.append(image_info) except Exception as e: print(f"Error processing image: {str(e)}") continue return images_data finally: await browser.close() async def main(args): url = args.page_url context_levels = args.context_levels pixel_distance_threshold = args.pixel_distance_threshold number_of_images = args.number_of_images save_images = args.save_images print( "call ImageExtrcator with-", "page_url:", url, "context_levels:", context_levels, "pixel_distance_threshold:", pixel_distance_threshold, "number_of_images:", number_of_images, "save_images:", save_images, ) if ( disclaim_bool_string(args.save_elaboration) == True or disclaim_bool_string(args.save_images) == True ): # if something to save url_path = url.replace(":", "").replace("//", "_").replace("/", "_") now = datetime.now(timezone.utc) now_str = now.strftime("%Y_%m_%d-%H_%M_%S") output_dir = prepare_output_folder(url_path, now_str) if disclaim_bool_string(args.save_images) == True: images_output_dir = create_folder( output_dir, directory_separator="/", next_path="images" ) print("save images path:", images_output_dir) # Create extractor extractor = ImageExtractor( url, context_levels=context_levels, pixel_distance_threshold=pixel_distance_threshold, number_of_images=number_of_images, save_images=save_images, save_images_path=images_output_dir, ) # Extract images print(f"Extracting images from: {url}") images = await extractor.extract_images(specific_images_urls=[]) print(f"\nFound {len(images)} supported images\n") # Display results for i, img in enumerate(images, 1): print(f"Image {i}:") print(f" URL: {img['url']}") print(f" Alt text: {img['alt_text']}") print(f" Page title: {img['page_title']}") print(f" Full context length: {len(img['html_context'])} characters") print(f" Immediate context length: {len(img['immediate_context'])} characters") print(f" Nearby text length: {len(img['nearby_text'])} characters") print(f" Number of headings on page: {len(img['page_headings'])}") print("-" * 80) if disclaim_bool_string(args.save_elaboration) == True: # Optionally save to JSON await extractor.save_elaboration( images, output_dir=output_dir + "/extracted_images.json" ) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--page_url", type=str, help=("Url page to analyze"), default="https://www.bbc.com", ) parser.add_argument( "--context_levels", type=int, default=5, help=("HTML context levels around the image"), ) parser.add_argument( "--pixel_distance_threshold", type=int, default=200, help=("pixel distance threshold around the image"), ) parser.add_argument( "--number_of_images", type=int, default=10, help=("max number of desired images"), ) parser.add_argument( "--save_elaboration", action="store_true", default=True, help=("If True save the elaborated info in a json file"), ) parser.add_argument( "--save_images", action="store_true", default=True, help=("If True save the images"), ) args = parser.parse_args() asyncio.run(main(args))