# INF6B/scraper/main.py
# Retrieved 2025-10-01 10:46:21 +02:00 — 84 lines, 2.6 KiB, Python
import os
import re
import time
import requests
from urllib.parse import urljoin, urlparse, unquote
from bs4 import BeautifulSoup
# Browser-like User-Agent so the target site serves the page instead of
# rejecting/blocking an obviously automated client.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
}
def sanitize_filename(name: str) -> str:
    """Turn a person's name into a safe filename component.

    Leading/trailing whitespace is removed, inner spaces become
    underscores, and every character outside A-Z, a-z, 0-9, '_', '-',
    '(', ')' and '.' is dropped entirely (non-ASCII letters included).
    """
    cleaned = name.strip().replace(" ", "_")
    return "".join(re.findall(r'[A-Za-z0-9_\-().]', cleaned))
def candidate_url_from_img(img):
    """Pick the best usable URL from an <img> tag's attributes.

    Lazy-loading attributes ("data-orig-src", "data-src") are preferred
    over the plain "src". Blank values and "data:" / "javascript:"
    pseudo-URLs are skipped. Returns the percent-decoded URL, or None
    when no attribute yields a usable value.
    """
    for attr in ("data-orig-src", "data-src", "src"):
        raw = img.get(attr)
        if raw is None:
            continue
        raw = raw.strip()
        if raw and not raw.startswith(("data:", "javascript:")):
            return unquote(raw)
    return None
def download_images(page_url, target_folder):
    """Scrape teacher portrait images from *page_url* into *target_folder*.

    For every ``div.teacher-card`` on the page, the person's name (taken
    from the ``.card-title`` element) becomes the filename and the card's
    <img> tag provides the image URL. Files that already exist locally
    are skipped; per-image failures are reported and do not abort the run.

    Raises:
        requests.HTTPError: if the listing page itself cannot be fetched.
    """
    os.makedirs(target_folder, exist_ok=True)
    resp = requests.get(page_url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    cards = soup.select("div.teacher-card")
    print(f"Found {len(cards)} teacher cards")

    downloaded = 0
    for card in cards:
        # Name -> sanitized local filename
        name_tag = card.select_one(".card-title")
        if not name_tag:
            continue
        person_name = name_tag.get_text(strip=True)
        filename = sanitize_filename(person_name) + ".jpg"

        # Image URL (lazy-load attributes first), resolved against the page
        img = card.find("img")
        if not img:
            continue
        img_url = candidate_url_from_img(img)
        if not img_url:
            continue
        img_url = urljoin(page_url, img_url)

        # Save, skipping files that already exist
        local_path = os.path.join(target_folder, filename)
        if os.path.exists(local_path):
            # Fix: report the actual filename instead of the "(unknown)" placeholder.
            print(f"⚡ Skipping (exists): {filename}")
            continue
        try:
            print(f"⬇️ {person_name} -> {local_path}")
            # Fix: close the streamed response deterministically (it was
            # previously leaked) by using it as a context manager.
            with requests.get(img_url, headers=HEADERS, stream=True, timeout=20) as r:
                r.raise_for_status()
                with open(local_path, "wb") as f:
                    for chunk in r.iter_content(8192):
                        if chunk:
                            f.write(chunk)
            downloaded += 1
            time.sleep(0.1)  # polite delay between downloads
        except Exception as e:
            # Best-effort scrape: report the failure and move on.
            print(f"❌ Failed for {person_name}: {e}")

    print(f"\n✅ Done. Downloaded {downloaded} images.")
if __name__ == "__main__":
page = "https://www.gym-hartberg.ac.at/lehrerinnen/"
downloads = os.path.join(os.path.expanduser("~"), "Downloads")
target = os.path.join(downloads, "images")
download_images(page, target)