This repository has been archived on 2025-11-17. You can view files and clone it, but cannot push or open issues or pull requests.
Files
pypages/modules/course_scraper.py
2025-11-16 18:01:30 +01:00

229 lines
9.5 KiB
Python

"""
Module for scraping and organizing Python course content.
"""
import re
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
class CourseScraper:
    """Scrapes Python course content from external sources."""

    @staticmethod
    def scrape_course_content(url: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape course content from a fixed list of course pages.

        Args:
            url: Base URL to scrape. Currently unused — the method scrapes a
                hard-coded page list; parameter kept for interface
                compatibility.

        Returns:
            Dict with 'title', 'sections' (each: 'title', 'level',
            'markdown', 'id') and 'navigation'. Falls back to
            ``_get_default_course()`` when nothing could be scraped.
        """
        course_data: Dict[str, Any] = {
            'title': 'Python Kurs - Gymnasium Hartberg',
            'sections': [],
            'navigation': []
        }

        # Hard-coded list of course pages to scrape.
        course_pages = [
            ('https://benschi11.github.io/python/class5.html', '5. Klasse'),
            ('https://benschi11.github.io/python/', 'Overview')
        ]

        for page_url, _title in course_pages:
            try:
                response = requests.get(page_url, timeout=10, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Locate the main content container, with fallbacks.
                main_content = soup.find('main', {'id': 'content'}) or soup.find('main') or soup.find('body')
                if not main_content:
                    continue

                # Per-page accumulation state.
                markdown_content: List[str] = []
                current_section: Optional[Dict[str, Any]] = None
                section_id: Optional[str] = None

                # Walk elements in document order to rebuild markdown.
                # ('li' is handled inside the 'ul' branch; standalone 'code'
                # inside the 'div'/'pre' branches.)
                for elem in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'ul', 'div', 'pre']):
                    tag_name = elem.name

                    if tag_name in ('h1', 'h2'):
                        # Flush the previous section before starting a new one.
                        if current_section and markdown_content:
                            current_section['markdown'] = '\n\n'.join(markdown_content)
                            course_data['sections'].append(current_section)
                            course_data['navigation'].append({
                                'title': current_section['title'],
                                'level': current_section['level'],
                                'id': section_id
                            })
                        # BUG FIX: reset state after flushing. Previously, a
                        # skipped heading (site title) left current_section
                        # pointing at the flushed section, so the end-of-page
                        # save appended it a second time.
                        current_section = None
                        markdown_content = []

                        text = elem.get_text().strip()
                        # Skip the site-title heading itself.
                        if text and not text.startswith('Python Kurs'):
                            section_id = CourseScraper._slugify(text)
                            current_section = {
                                'title': text,
                                'level': int(tag_name[1]),
                                'markdown': '',
                                'id': section_id
                            }
                            markdown_content = [f"{'#' * int(tag_name[1])} {text}"]
                    elif tag_name == 'h3' and current_section:
                        # Subsection heading.
                        text = elem.get_text().strip()
                        if text:
                            markdown_content.append(f"\n### {text}")
                    elif tag_name == 'h4' and current_section:
                        # Sub-subsection heading.
                        text = elem.get_text().strip()
                        if text:
                            markdown_content.append(f"\n#### {text}")
                    elif current_section and tag_name == 'p':
                        # Paragraph; skip trivially short fragments.
                        text = elem.get_text().strip()
                        if text and len(text) > 5:
                            markdown_content.append(text)
                    elif current_section and tag_name == 'ul':
                        # Unordered list — direct children only, to avoid
                        # pulling items out of nested lists twice.
                        list_items = []
                        for li in elem.find_all('li', recursive=False):
                            li_text = li.get_text().strip()
                            if li_text:
                                list_items.append(f"- {li_text}")
                        if list_items:
                            markdown_content.append('\n'.join(list_items))
                    elif current_section and tag_name == 'div':
                        # Code block wrapped in a div: either a nested
                        # <code class="language-python"> or the class on the
                        # div itself.
                        code_elem = elem.find('code', class_=lambda x: x and 'language-python' in ' '.join(x))
                        if code_elem:
                            code_text = code_elem.get_text().strip()
                            if code_text:
                                markdown_content.append(f"```python\n{code_text}\n```")
                        elif 'language-python' in str(elem.get('class', [])):
                            code_text = elem.get_text().strip()
                            if code_text:
                                markdown_content.append(f"```python\n{code_text}\n```")
                    elif current_section and tag_name == 'pre':
                        # Preformatted code. All course samples are Python,
                        # so the fence language is always 'python' (the
                        # original's class check could only ever yield that).
                        code_elem = elem.find('code')
                        if code_elem:
                            code_text = code_elem.get_text().strip()
                            if code_text:
                                markdown_content.append(f"```python\n{code_text}\n```")

                # Save the trailing section of the page, if any.
                if current_section:
                    if markdown_content:
                        current_section['markdown'] = '\n\n'.join(markdown_content)
                    course_data['sections'].append(current_section)
                    course_data['navigation'].append({
                        'title': current_section['title'],
                        'level': current_section['level'],
                        'id': section_id
                    })
            except Exception as e:
                # Best-effort scrape: log and move on to the next page.
                print(f"Error scraping {page_url}: {e}")
                continue

        # If no content was scraped, return the static fallback.
        if not course_data['sections']:
            return CourseScraper._get_default_course()
        return course_data

    @staticmethod
    def _slugify(text: str) -> str:
        """Convert text to a URL-friendly slug (lowercase, hyphen-separated)."""
        text = text.lower()
        # Drop everything except word chars, whitespace and hyphens.
        text = re.sub(r'[^\w\s-]', '', text)
        # Collapse runs of whitespace/hyphens into a single hyphen.
        text = re.sub(r'[-\s]+', '-', text)
        return text.strip('-')

    @staticmethod
    def _get_default_course() -> Dict[str, Any]:
        """Return the static fallback course structure used when scraping fails.

        NOTE: fallback sections carry a 'content' list instead of the
        'markdown' string produced by scraping — callers must handle both.
        """
        return {
            'title': 'Python Kurs - Gymnasium Hartberg',
            'sections': [
                {
                    'title': '5. Klasse',
                    'level': 2,
                    'content': [
                        'Grundlagen der Programmierung mit Python',
                        'Variablen und Datentypen',
                        'Eingabe und Ausgabe',
                        'Bedingte Anweisungen',
                        'Schleifen',
                        'Listen und Dictionaries'
                    ],
                    'subsections': []
                },
                {
                    'title': '6. Klasse',
                    'level': 2,
                    'content': [
                        'Funktionen',
                        'Module und Pakete',
                        'Dateiverarbeitung',
                        'Fehlerbehandlung',
                        'Objektorientierte Programmierung'
                    ],
                    'subsections': []
                },
                {
                    'title': 'Objektorientierte Programmierung',
                    'level': 2,
                    'content': [
                        'Klassen und Objekte',
                        'Vererbung',
                        'Polymorphismus',
                        'Abstrakte Klassen'
                    ],
                    'subsections': []
                },
                {
                    'title': 'Grafische Oberflächen',
                    'level': 2,
                    'content': [
                        'Einführung in GUI-Programmierung',
                        'Tkinter Grundlagen',
                        'Event-Handling',
                        'Layout-Management'
                    ],
                    'subsections': []
                }
            ],
            'navigation': [
                {'title': '5. Klasse', 'level': 2, 'id': '5-klasse'},
                {'title': '6. Klasse', 'level': 2, 'id': '6-klasse'},
                {'title': 'Objektorientierte Programmierung', 'level': 2, 'id': 'objektorientierte-programmierung'},
                {'title': 'Grafische Oberflächen', 'level': 2, 'id': 'grafische-oberflaechen'}
            ]
        }