# to launch: python build_dataset_from_folder.py --ref_path "" --push_to_hub --repo_id "nicolaleo/LLM-alt-text-assessment" --token ""

from datasets import Dataset, DatasetDict
import datasets
import json
from pathlib import Path
from PIL import Image
import hashlib
import urllib.parse
import argparse

'''
# Dataset metadata
_DESCRIPTION = """\
Dataset for image alt-text assessment and improvement using MLLM responses.
Contains images, original alt-texts, quality assessments, and improved versions.
"""

_CITATION = """\
@misc{alt_text_assessment,
  title={Alt-Text Assessment Dataset},
  year={2024}
}
"""


class AltTextDataset(datasets.GeneratorBasedBuilder):
    """Dataset for alt-text assessment with images and MLLM responses."""

    VERSION = datasets.Version("1.0.0")

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({
                "image": datasets.Image(),
                "image_url": datasets.Value("string"),
                "alt_text": datasets.Value("string"),
                "original_alt_text_assessment": datasets.Value("string"),
                "assessment": datasets.Value("string"),
                "evaluation_result": datasets.Value("string"),
                "new_alt_text": datasets.Value("string"),
                # "source_folder": datasets.Value("string"),
            }),
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Define data splits."""
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "json_filepath": "data.json",
                    "images_dir": "images",
                },
            ),
        ]

    def _generate_examples(self, json_filepath, images_dir):
        """Generate examples from JSON file and image directory."""
        with open(json_filepath, encoding="utf-8") as f:
            data = json.load(f)

        images_path = Path(images_dir)

        for idx, entry in enumerate(data):
            image_url = entry["image_url"]
            image_filename = url_to_filename(image_url)
            image_path = images_path / image_filename

            # Load image if it exists, otherwise None
            image = str(image_path) if image_path.exists() else None

            yield idx, {
                "image": image,
                "image_url": image_url,
                "alt_text": entry["alt_text"],
                "original_alt_text_assessment": entry["mllm_response"]["original_alt_text_assessment"],
                "assessment": entry["mllm_response"]["assessment"],
                "evaluation_result": entry["mllm_response"]["evaluation_result"],
                "new_alt_text": entry["mllm_response"]["new_alt_text"],
            }
'''

# ============================================================================
# SIMPLE USAGE FUNCTIONS
# ============================================================================


def url_to_filename(image_url):  # must match the saving step in the image_extractor dependency
    """
    Convert an image URL to a sanitized filename, mirroring the naming logic
    of the image extraction step.

    Args:
        image_url: The image URL

    Returns:
        Sanitized filename with extension
    """
    # Parse the URL to get the path without query parameters
    parsed_url = urllib.parse.urlparse(image_url)
    url_path = parsed_url.path

    # Get the filename from the path
    filename = url_path.split("/")[-1]
    print(f"Original filename: '{filename}'")

    # Split filename and extension
    if "." in filename:
        image_name, ext = filename.rsplit(".", 1)
        ext = ext.lower()
    else:
        image_name = filename
        ext = "jpg"

    # Validate extension
    if ext not in ["jpg", "jpeg", "png", "gif", "webp"]:
        ext = "jpg"

    # Sanitize image name (remove special characters, limit length)
    image_name = "".join(c for c in image_name if c.isalnum() or c in ("-", "_"))
    image_name = image_name[:50]  # Limit filename length

    # If name is empty after sanitization, create a hash-based name
    if not image_name:
        image_name = hashlib.md5(image_url.encode()).hexdigest()[:16]

    return f"{image_name}.{ext}"
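
# Illustrative note (toy URL, not part of the pipeline): tracing url_to_filename
# on a typical URL with a query string and an upper-case extension gives
#   url_to_filename("https://example.com/media/photo.PNG?width=640") -> "photo.png"
# (query string dropped, extension lower-cased). Unknown extensions fall back to
# ".jpg", and a name that is empty after sanitization is replaced by a 16-char
# md5 hash of the full URL.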

def push_to_hub_example(dataset_path="alt_text_merged_dataset", repo_id="", token=None):
    """
    Example of how to push the dataset to the Hugging Face Hub.
    You need to authenticate first!
    """
    from huggingface_hub import login

    print("\n=== Pushing Dataset to Hugging Face Hub ===")

    # Method 1: Login interactively (will prompt for token)
    # login()

    # Method 2: Login with token directly
    login(token=token)

    # Method 3: Set token as environment variable
    # export HF_TOKEN="hf_YourTokenHere"
    # Then login() will use it automatically

    # Load your dataset
    ds = load_dataset_from_disk(dataset_path)

    # Combine into DatasetDict
    ds = DatasetDict(
        {
            "train": ds,
            # "test": test_dataset
        }
    )

    # Push to hub (creates the repo if it doesn't exist)
    ds.push_to_hub(  # Automatically converts to Parquet when uploading to the Hub
        repo_id,  # Replace with your username/dataset name
        private=False,  # Set True for a private dataset
    )

    print("Dataset pushed successfully!")
    print(f"View at: https://huggingface.co/datasets/{repo_id}")


def create_dataset_from_json(json_filepath, json_filepath_images, images_dir="images"):
    """
    Create a Hugging Face Dataset from the JSON files and local images.

    Args:
        json_filepath: Path to the JSON file with the MLLM assessment data
        json_filepath_images: Path to the JSON file produced by the image extraction step
        images_dir: Directory containing the images (default: "images")

    Returns:
        datasets.Dataset object with images loaded
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    with open(json_filepath_images, "r", encoding="utf-8") as f:
        data_images = json.load(f)

    images_path = Path(images_dir)

    # Flatten the nested structure and load images
    flattened_data = {
        "image": [],
        "image_url": [],
        "alt_text": [],
        "original_alt_text_assessment": [],
        "assessment": [],
        "evaluation_result": [],
        "new_alt_text": [],
        "page_url": [],
        "html_context": [],
    }

    count_entry = 0
    for entry in data:
        if entry["mllm_response"]["original_alt_text_assessment"] is None:
            # Important: skip entries with no MLLM response (not usable data)
            print(
                f"Skipping entry with image URL: {entry['image_url']} due to missing MLLM response"
            )
            count_entry += 1
            continue

        image_url = entry["image_url"]
        image_filename = url_to_filename(image_url)
        image_path = images_path / image_filename

        # Load image if it exists
        if image_path.exists():
            img = Image.open(image_path)
            flattened_data["image"].append(img)
        else:
            print(f"Warning: Image not found: {image_path}")
            flattened_data["image"].append(None)

        flattened_data["image_url"].append(image_url)
        flattened_data["alt_text"].append(entry["alt_text"])
        flattened_data["original_alt_text_assessment"].append(
            str(entry["mllm_response"]["original_alt_text_assessment"])
        )
        flattened_data["assessment"].append(entry["mllm_response"]["assessment"])
        flattened_data["evaluation_result"].append(
            entry["mllm_response"]["evaluation_result"]
        )
        flattened_data["new_alt_text"].append(entry["mllm_response"]["new_alt_text"])
        flattened_data["page_url"].append(data_images[count_entry]["page_url"])
        flattened_data["html_context"].append(data_images[count_entry]["html_context"])
        count_entry += 1

    print(f"Total valid entries loaded: {len(flattened_data['image_url'])}")
    return datasets.Dataset.from_dict(flattened_data)
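
# Assumed shape of the two input JSON files consumed by create_dataset_from_json
# (inferred from the field accesses above; the real files may carry extra keys):
#
#   mllm_alttext_assessments.json -- list of entries like:
#     {
#       "image_url": "...",
#       "alt_text": "...",
#       "mllm_response": {
#         "original_alt_text_assessment": <numeric rating or null>,
#         "assessment": "...",
#         "evaluation_result": "...",
#         "new_alt_text": "..."
#       }
#     }
#
#   extracted_images.json -- list aligned by index with the entries above, each
#   providing at least "page_url" and "html_context".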

def create_dataset_from_folders(
    ref_path,
    json_filename="mllm_alttext_assessments.json",
    json_filename_images="extracted_images.json",
    images_dirname="images",
):
    """
    Create a merged dataset from multiple folders under ref_path.
    Each folder should contain the two JSON files and an images subdirectory.

    Args:
        ref_path: Root path containing multiple folders
        json_filename: Name of the assessments JSON file in each folder
            (default: "mllm_alttext_assessments.json")
        json_filename_images: Name of the extracted-images JSON file in each folder
            (default: "extracted_images.json")
        images_dirname: Name of the images subdirectory (default: "images")

    Returns:
        datasets.Dataset object with all entries merged
    """
    ref_path = Path(ref_path)
    all_datasets = []

    # Find all subdirectories containing the JSON files
    folders_processed = 0
    for folder in ref_path.iterdir():
        if not folder.is_dir():
            continue

        json_path = folder / json_filename
        json_path_images = folder / json_filename_images
        images_path = folder / images_dirname

        # Check that both JSON files and the images directory exist
        if not json_path.exists():
            print(f"Skipping {folder.name}: no {json_filename} found")
            continue

        if not json_path_images.exists():
            print(f"Skipping {folder.name}: no {json_filename_images} found")
            continue

        if not images_path.exists():
            print(f"Warning: {folder.name}: images directory not found")
            # continue  # Continue anyway, images might be optional (from URLs only)

        print(f"Processing folder: {folder.name}")

        try:
            # Create dataset for this folder
            ds = create_dataset_from_json(
                str(json_path), str(json_path_images), str(images_path)
            )
            all_datasets.append(ds)
            folders_processed += 1
            print(f" -> Loaded {len(ds)} entries")
        except Exception as e:
            print(f"Error processing {folder.name}: {e}")
            continue

    if not all_datasets:
        raise ValueError(f"No valid folders found in {ref_path}")

    # Merge all datasets
    print(f"\n=== Merging {folders_processed} folders ===")
    merged_dataset = datasets.concatenate_datasets(all_datasets)
    print(f"Total entries: {len(merged_dataset)}")

    return merged_dataset
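
# Assumed on-disk layout consumed by create_dataset_from_folders and
# verify_images_in_folders (folder names below are placeholders):
#
#   <ref_path>/
#       site_A/
#           mllm_alttext_assessments.json
#           extracted_images.json
#           images/
#               <filenames produced by url_to_filename>
#       site_B/
#           ...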

def verify_images(json_filepath, images_dir="images"):
    """
    Verify that all images referenced in the JSON file exist in the images directory.

    Args:
        json_filepath: Path to JSON file
        images_dir: Directory containing images

    Returns:
        Dict with 'found', 'missing', 'total', and 'details' keys
    """
    with open(json_filepath, "r", encoding="utf-8") as f:
        data = json.load(f)

    images_path = Path(images_dir)
    found = []
    missing = []

    for entry in data:
        image_url = entry["image_url"]
        image_filename = url_to_filename(image_url)
        image_path = images_path / image_filename
        print(
            "image_url:",
            image_url,
            "image_filename:",
            image_filename,
            "image_path:",
            image_path,
        )

        if image_path.exists():
            found.append(
                {"url": image_url, "filename": image_filename, "path": str(image_path)}
            )
        else:
            missing.append(
                {
                    "url": image_url,
                    "filename": image_filename,
                    "expected_path": str(image_path),
                }
            )

    return {
        "found": len(found),
        "missing": len(missing),
        "total": len(data),
        "details": {"found_images": found, "missing_images": missing},
    }


def verify_images_in_folders(
    ref_path, json_filename="mllm_alttext_assessments.json", images_dirname="images"
):
    """
    Verify images across all folders under ref_path.

    Args:
        ref_path: Root path containing multiple folders
        json_filename: Name of the JSON file in each folder
        images_dirname: Name of the images subdirectory

    Returns:
        Dict with aggregated verification results
    """
    ref_path = Path(ref_path)

    total_found = 0
    total_missing = 0
    total_entries = 0
    folder_results = {}

    for folder in ref_path.iterdir():
        if not folder.is_dir():
            continue

        json_path = folder / json_filename
        images_path = folder / images_dirname

        if not json_path.exists():
            continue

        print(f"Verifying folder: {folder.name}")

        try:
            verification = verify_images(str(json_path), str(images_path))
            folder_results[folder.name] = verification
            total_found += verification["found"]
            total_missing += verification["missing"]
            total_entries += verification["total"]
            print(f" Found: {verification['found']}/{verification['total']}")
        except Exception as e:
            print(f" Error: {e}")
            continue

    return {
        "found": total_found,
        "missing": total_missing,
        "total": total_entries,
        "folders": folder_results,
    }


def save_dataset(dataset, output_path):
    """Save dataset in Arrow format (includes images)."""
    dataset.save_to_disk(output_path)
    # print(f"Dataset saved to {output_path}")

    # Or save as JSON
    # dataset.to_json(f"{output_path}/data.json")

    # Or save as CSV
    # dataset.to_csv(f"{output_path}/data.csv")

    # Or save as Parquet
    # dataset.to_parquet(f"{output_path}/data.parquet")


def load_dataset_from_disk(dataset_path):
    """Load a previously saved dataset."""
    return datasets.load_from_disk(dataset_path)
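
# Illustrative helper (assumption: not called anywhere in this script): once the
# dataset has been pushed, it can be reloaded from the Hub with datasets.load_dataset.
# The default repo_id simply mirrors the one in the launch command at the top.
def load_dataset_from_hub_example(repo_id="nicolaleo/LLM-alt-text-assessment"):
    """Load the pushed dataset back from the Hugging Face Hub (train split)."""
    return datasets.load_dataset(repo_id, split="train")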

# ============================================================================
# EXAMPLE USAGE
# ============================================================================

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ref_path",
        type=str,
        help="Root path containing multiple folders",
        default="",
    )
    parser.add_argument(
        "--push_to_hub",
        action="store_true",
        default=False,
        help="If set, push the merged dataset to the Hugging Face Hub",
    )
    parser.add_argument(
        "--token",
        type=str,
        help="Hugging Face authentication token",
        default="",
    )
    parser.add_argument(
        "--repo_id",
        type=str,
        help="Hugging Face repository ID",
        default="nicolaleo/LLM-alt-text-assessment",
    )
    args = parser.parse_args()

    # Example 1: Verify images across all folders
    print("=== Verifying Images in All Folders ===")
    verification = verify_images_in_folders(args.ref_path)
    print("\n######## Verifier output ################################")
    print(f"Total Found: {verification['found']}/{verification['total']}")
    print(f"Total Missing: {verification['missing']}/{verification['total']}")
    print("########################################")

    # Show per-folder breakdown
    print("\n=== Per-Folder Breakdown ===")
    for folder_name, results in verification["folders"].items():
        print(f"{folder_name}: {results['found']}/{results['total']} images found")

    # Example 2: Create merged dataset from all folders
    print("\n=== Creating Merged Dataset ===")
    ds = create_dataset_from_folders(args.ref_path)
    print("\n######## Merged Dataset output ################################")
    print(f"Final dataset size: {len(ds)} entries")
    print("########################################")

    # Example 3: Analyze the merged dataset
    print("\n=== Dataset Analysis ===")
    print(ds)

    # Example 4: Access images and data
    print("\n=== First Example ===")
    first_example = ds[0]
    print(f"Image URL: {first_example['image_url']}")
    print(f"Alt text: {first_example['alt_text']}")
    print(f"Assessment: {first_example['assessment']}")
    print(f"New alt text: {first_example['new_alt_text']}")
    print(f"Image loaded: {first_example['image'] is not None}")

    if first_example["image"] is not None:
        img = first_example["image"]
        print(f"Image size: {img.size}")
        # img.show()  # Uncomment to display the image

    # Example 5: Filter and work with merged data
    print("\n=== Filtering Merged Dataset ===")
    successful = ds.filter(lambda x: x["assessment"] == "success")
    print(f"Successful assessments: {len(successful)}")

    high_rated = ds.filter(lambda x: int(x["original_alt_text_assessment"]) >= 4)
    print(f"High-rated (>=4): {len(high_rated)}")

    # Example 6: Save merged dataset
    print("\n=== Saving Merged Dataset ===")
    save_dataset(ds, "alt_text_merged_dataset")

    # Example 7: Load dataset
    print("\n=== Loading Dataset ===")
    loaded_ds = load_dataset_from_disk("alt_text_merged_dataset")
    print(f"Loaded {len(loaded_ds)} entries")

    if args.push_to_hub:
        # Push to Hugging Face Hub (optional); see push_to_hub_example above for details
        push_to_hub_example(repo_id=args.repo_id, token=args.token)
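
# For reference: the merged dataset built by create_dataset_from_json carries the
# columns image (PIL image or None), image_url, alt_text, original_alt_text_assessment
# (stored as a string), assessment, evaluation_result, new_alt_text, page_url, and
# html_context.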