import os
import re
import time

import requests
from urllib.parse import urljoin, urlparse, unquote
from bs4 import BeautifulSoup

# Browser-like User-Agent so the server doesn't reject the scraper outright.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
}


def sanitize_filename(name: str) -> str:
    """Turn a person's display name into a safe filename stem.

    Strips surrounding whitespace, replaces internal spaces with
    underscores, and drops every character that is not alphanumeric,
    underscore, hyphen, parenthesis, or dot.
    """
    name = name.strip().replace(" ", "_")
    return re.sub(r'[^A-Za-z0-9_\-().]', '', name)


def candidate_url_from_img(img):
    """Return the first usable image URL from an ``<img>`` tag, or None.

    Checks lazy-loading attributes before the plain ``src`` and skips
    empty, ``data:`` and ``javascript:`` values. The returned URL is
    percent-decoded and may still be relative to the page.
    """
    for attr in ("data-orig-src", "data-src", "src"):
        val = img.get(attr)
        if not val:
            continue
        val = val.strip()
        if not val or val.startswith("data:") or val.startswith("javascript:"):
            continue
        return unquote(val)
    return None


def download_images(page_url, target_folder):
    """Scrape teacher portrait images from *page_url* into *target_folder*.

    Fetches the page, finds every ``div.teacher-card``, derives a
    filename from the card title, and streams the card's image to disk.
    Existing files are skipped; individual failures are logged and do
    not abort the run.

    Returns the number of images newly downloaded.
    Raises ``requests.HTTPError`` if the page itself cannot be fetched.
    """
    os.makedirs(target_folder, exist_ok=True)

    resp = requests.get(page_url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    cards = soup.select("div.teacher-card")
    print(f"Found {len(cards)} teacher cards")

    downloaded = 0
    for card in cards:
        # Name: required to derive the target filename.
        name_tag = card.select_one(".card-title")
        if not name_tag:
            continue
        person_name = name_tag.get_text(strip=True)
        filename = sanitize_filename(person_name) + ".jpg"

        # Image: first <img> inside the card, via lazy-load-aware lookup.
        img = card.find("img")
        if not img:
            continue
        img_url = candidate_url_from_img(img)
        if not img_url:
            continue
        # Resolve relative URLs against the page address.
        img_url = urljoin(page_url, img_url)

        local_path = os.path.join(target_folder, filename)
        if os.path.exists(local_path):
            # BUGFIX: was a literal "(unknown)" placeholder in the message.
            print(f"⚡ Skipping (exists): {filename}")
            continue

        try:
            # BUGFIX: was a literal "(unknown)" placeholder in the message.
            print(f"⬇️ {person_name} -> {img_url}")
            r = requests.get(img_url, headers=HEADERS, stream=True, timeout=20)
            r.raise_for_status()
            # Stream to disk in chunks so large images don't sit in memory.
            with open(local_path, "wb") as f:
                for chunk in r.iter_content(8192):
                    if chunk:
                        f.write(chunk)
            downloaded += 1
            time.sleep(0.1)  # polite delay between requests
        except Exception as e:
            # Best-effort: log and move on to the next card.
            print(f"❌ Failed for {person_name}: {e}")

    # BUGFIX: original f-string contained a raw newline mid-literal
    # (a SyntaxError); reconstructed as one valid string.
    print(f"\n✅ Done. Downloaded {downloaded} images.")
    return downloaded


if __name__ == "__main__":
    page = "https://www.gym-hartberg.ac.at/lehrerinnen/"
    downloads = os.path.join(os.path.expanduser("~"), "Downloads")
    target = os.path.join(downloads, "images")
    download_images(page, target)