# INF6B/scraper/main.py
# Retrieved 2025-10-01 10:46:21 +02:00 — 84 lines, 2.6 KiB, Python
import os
import re
import time
import requests
from urllib.parse import urljoin, urlparse, unquote
from bs4 import BeautifulSoup
# Browser-like User-Agent so the target site serves the page instead of
# rejecting/blocking an obviously automated client.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
}
def sanitize_filename(name: str) -> str:
    """Turn a person's name into a safe filename component.

    Leading/trailing whitespace is removed, inner spaces become
    underscores, and every character outside A-Z, a-z, 0-9, '_', '-',
    '(', ')' and '.' is dropped entirely (non-ASCII letters included).
    """
    cleaned = name.strip().replace(" ", "_")
    return "".join(re.findall(r'[A-Za-z0-9_\-().]', cleaned))
def candidate_url_from_img(img):
    """Pick the best usable URL from an <img> tag's attributes.

    Lazy-loading attributes ("data-orig-src", "data-src") are preferred
    over the plain "src". Blank values and "data:" / "javascript:"
    pseudo-URLs are skipped. Returns the percent-decoded URL, or None
    when no attribute yields a usable value.
    """
    for attr in ("data-orig-src", "data-src", "src"):
        raw = img.get(attr)
        if raw is None:
            continue
        raw = raw.strip()
        if raw and not raw.startswith(("data:", "javascript:")):
            return unquote(raw)
    return None
def download_images(page_url, target_folder):
    """Scrape teacher portrait images from *page_url* into *target_folder*.

    For every ``div.teacher-card`` on the page, the person's name (taken
    from the ``.card-title`` element) becomes the filename and the card's
    <img> tag provides the image URL. Files that already exist locally
    are skipped; per-image failures are reported and do not abort the run.

    Raises:
        requests.HTTPError: if the listing page itself cannot be fetched.
    """
    os.makedirs(target_folder, exist_ok=True)
    resp = requests.get(page_url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    cards = soup.select("div.teacher-card")
    print(f"Found {len(cards)} teacher cards")

    downloaded = 0
    for card in cards:
        # Name -> sanitized local filename
        name_tag = card.select_one(".card-title")
        if not name_tag:
            continue
        person_name = name_tag.get_text(strip=True)
        filename = sanitize_filename(person_name) + ".jpg"

        # Image URL (lazy-load attributes first), resolved against the page
        img = card.find("img")
        if not img:
            continue
        img_url = candidate_url_from_img(img)
        if not img_url:
            continue
        img_url = urljoin(page_url, img_url)

        # Save, skipping files that already exist
        local_path = os.path.join(target_folder, filename)
        if os.path.exists(local_path):
            # Fix: report the actual filename instead of the "(unknown)" placeholder.
            print(f"⚡ Skipping (exists): {filename}")
            continue
        try:
            print(f"⬇️ {person_name} -> {local_path}")
            # Fix: close the streamed response deterministically (it was
            # previously leaked) by using it as a context manager.
            with requests.get(img_url, headers=HEADERS, stream=True, timeout=20) as r:
                r.raise_for_status()
                with open(local_path, "wb") as f:
                    for chunk in r.iter_content(8192):
                        if chunk:
                            f.write(chunk)
            downloaded += 1
            time.sleep(0.1)  # polite delay between downloads
        except Exception as e:
            # Best-effort scrape: report the failure and move on.
            print(f"❌ Failed for {person_name}: {e}")

    print(f"\n✅ Done. Downloaded {downloaded} images.")
if __name__ == "__main__":
page = "https://www.gym-hartberg.ac.at/lehrerinnen/"
downloads = os.path.join(os.path.expanduser("~"), "Downloads")
target = os.path.join(downloads, "images")
download_images(page, target)