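"""Download teacher images from a staff listing page.

Scrapes each `div.teacher-card` on the page, names each file after the card
title, and saves it under ~/Downloads/images by default. Existing files are
skipped, so the script can be re-run safely.
"""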
import os
import re
import time
from urllib.parse import urljoin, unquote

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent so requests are less likely to be blocked
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
}


def sanitize_filename(name: str) -> str:
    # Replace spaces with underscores and drop anything that is not an
    # ASCII letter, digit, underscore, hyphen, parenthesis, or dot
    name = name.strip()
    name = name.replace(" ", "_")
    return re.sub(r'[^A-Za-z0-9_\-().]', '', name)


def candidate_url_from_img(img):
    # Prefer lazy-loading attributes over src; skip inline data/javascript URLs
    for attr in ("data-orig-src", "data-src", "src"):
        val = img.get(attr)
        if not val:
            continue
        val = val.strip()
        if not val or val.startswith("data:") or val.startswith("javascript:"):
            continue
        return unquote(val)
    return None


def download_images(page_url, target_folder):
    os.makedirs(target_folder, exist_ok=True)

    # Fetch and parse the listing page
    resp = requests.get(page_url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    cards = soup.select("div.teacher-card")
    print(f"Found {len(cards)} teacher cards")

    downloaded = 0
    for card in cards:
        # Name: used to build the local filename
        name_tag = card.select_one(".card-title")
        if not name_tag:
            continue
        person_name = name_tag.get_text(strip=True)
        filename = sanitize_filename(person_name) + ".jpg"

        # Image URL: may be relative, so resolve it against the page URL
        img = card.find("img")
        if not img:
            continue
        img_url = candidate_url_from_img(img)
        if not img_url:
            continue
        img_url = urljoin(page_url, img_url)

        # Save, skipping files that already exist from a previous run
        local_path = os.path.join(target_folder, filename)
        if os.path.exists(local_path):
            print(f"⚡ Skipping (exists): {filename}")
            continue

        try:
            print(f"⬇️ {person_name} -> {filename}")
            r = requests.get(img_url, headers=HEADERS, stream=True, timeout=20)
            r.raise_for_status()
            with open(local_path, "wb") as f:
                for chunk in r.iter_content(8192):
                    if chunk:
                        f.write(chunk)
            downloaded += 1
            time.sleep(0.1)  # polite delay between downloads
        except Exception as e:
            print(f"❌ Failed for {person_name}: {e}")

    print(f"\n✅ Done. Downloaded {downloaded} images.")


if __name__ == "__main__":
    page = "https://www.gym-hartberg.ac.at/lehrerinnen/"
    downloads = os.path.join(os.path.expanduser("~"), "Downloads")
    target = os.path.join(downloads, "images")
    download_images(page, target)