This commit is contained in:
2025-11-16 18:01:30 +01:00
commit 858003cb0b
26 changed files with 4712 additions and 0 deletions

228
modules/course_scraper.py Normal file
View File

@@ -0,0 +1,228 @@
"""
Module for scraping and organizing Python course content.
"""
import re
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
class CourseScraper:
    """Scrapes Python course content from external sources.

    Fetches a fixed list of course pages, converts their HTML into
    markdown-flavored section dictionaries, and falls back to a
    hard-coded default course structure when scraping yields nothing.
    """

    @staticmethod
    def scrape_course_content(url: Optional[str] = None) -> Dict[str, Any]:
        """Scrape course content from the known course URLs.

        Args:
            url: Base URL to scrape. Currently unused -- the hard-coded
                page list below is always scraped. Kept for backward
                compatibility with existing callers.

        Returns:
            Dictionary with 'title' (str), 'sections' (list of dicts with
            'title', 'level', 'markdown', 'id') and 'navigation' (list of
            {'title', 'level', 'id'}). If no sections could be scraped,
            the default course structure is returned instead.
        """
        course_data: Dict[str, Any] = {
            'title': 'Python Kurs - Gymnasium Hartberg',
            'sections': [],
            'navigation': []
        }

        # List of course pages to scrape
        course_pages = [
            ('https://benschi11.github.io/python/class5.html', '5. Klasse'),
            ('https://benschi11.github.io/python/', 'Overview')
        ]

        for page_url, section_title in course_pages:
            try:
                response = requests.get(page_url, timeout=10, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Locate the main content container with progressively
                # looser fallbacks.
                main_content = (soup.find('main', {'id': 'content'})
                                or soup.find('main')
                                or soup.find('body'))
                if not main_content:
                    continue

                markdown_content: List[str] = []
                current_section: Optional[Dict[str, Any]] = None
                section_id: Optional[str] = None

                # Walk content elements in document order, building the
                # markdown for each section as we go.
                for elem in main_content.find_all(
                        ['h1', 'h2', 'h3', 'h4', 'p', 'ul', 'li', 'div',
                         'pre', 'code']):
                    tag_name = elem.name

                    if tag_name in ('h1', 'h2'):
                        # A top-level heading closes the previous section.
                        if current_section and markdown_content:
                            CourseScraper._flush_section(
                                course_data, current_section,
                                markdown_content, section_id)
                            # Reset so a skipped heading (the site title)
                            # cannot cause the flushed section to be
                            # appended a second time at end of page.
                            current_section = None
                            markdown_content = []

                        text = elem.get_text().strip()
                        # Skip the site-wide page title heading.
                        if text and not text.startswith('Python Kurs'):
                            section_id = CourseScraper._slugify(text)
                            current_section = {
                                'title': text,
                                'level': int(tag_name[1]),
                                'markdown': '',
                                'id': section_id
                            }
                            markdown_content = [
                                f"{'#' * int(tag_name[1])} {text}"
                            ]
                    elif tag_name == 'h3' and current_section:
                        # Subsection heading
                        text = elem.get_text().strip()
                        if text:
                            markdown_content.append(f"\n### {text}")
                    elif tag_name == 'h4' and current_section:
                        # Sub-subsection heading
                        text = elem.get_text().strip()
                        if text:
                            markdown_content.append(f"\n#### {text}")
                    elif current_section and tag_name == 'p':
                        text = elem.get_text().strip()
                        # Drop near-empty paragraphs (stray whitespace,
                        # single-character labels).
                        if text and len(text) > 5:
                            markdown_content.append(text)
                    elif current_section and tag_name == 'ul':
                        # Only direct children: nested lists are reached
                        # by the outer find_all pass.
                        list_items = [
                            f"- {li.get_text().strip()}"
                            for li in elem.find_all('li', recursive=False)
                            if li.get_text().strip()
                        ]
                        if list_items:
                            markdown_content.append('\n'.join(list_items))
                    elif current_section and tag_name == 'div':
                        # Divs may wrap syntax-highlighted Python code.
                        code_elem = elem.find(
                            'code',
                            class_=lambda x: x and 'language-python' in ' '.join(x))
                        if code_elem:
                            code_text = code_elem.get_text().strip()
                            if code_text:
                                markdown_content.append(
                                    f"```python\n{code_text}\n```")
                        elif 'language-python' in str(elem.get('class', [])):
                            code_text = elem.get_text().strip()
                            if code_text:
                                markdown_content.append(
                                    f"```python\n{code_text}\n```")
                    elif current_section and tag_name == 'pre':
                        # Preformatted code; these pages only carry Python.
                        code_elem = elem.find('code')
                        if code_elem:
                            code_text = code_elem.get_text().strip()
                            if code_text:
                                markdown_content.append(
                                    f"```python\n{code_text}\n```")

                # Flush the final section of the page, if any.
                if current_section:
                    CourseScraper._flush_section(
                        course_data, current_section, markdown_content,
                        section_id)
            except Exception as e:
                # Best-effort scraping: one failing page must not abort
                # the remaining pages.
                print(f"Error scraping {page_url}: {e}")
                continue

        # If no content was scraped, return the default structure.
        if not course_data['sections']:
            return CourseScraper._get_default_course()
        return course_data

    @staticmethod
    def _flush_section(course_data: Dict[str, Any],
                       current_section: Dict[str, Any],
                       markdown_content: List[str],
                       section_id: Optional[str]) -> None:
        """Finalize a section and append it to sections and navigation."""
        if markdown_content:
            current_section['markdown'] = '\n\n'.join(markdown_content)
        course_data['sections'].append(current_section)
        course_data['navigation'].append({
            'title': current_section['title'],
            'level': current_section['level'],
            'id': section_id
        })

    @staticmethod
    def _slugify(text: str) -> str:
        """Convert text to a URL-friendly slug (lowercase, hyphenated)."""
        text = text.lower()
        # Strip everything except word characters, whitespace and hyphens,
        # then collapse runs of whitespace/hyphens into single hyphens.
        text = re.sub(r'[^\w\s-]', '', text)
        text = re.sub(r'[-\s]+', '-', text)
        return text.strip('-')

    @staticmethod
    def _get_default_course() -> Dict[str, Any]:
        """Get the default course structure used when scraping fails.

        NOTE(review): these fallback sections carry a 'content' list and
        'subsections' instead of the 'markdown' key produced by scraping --
        consumers appear to handle both shapes; confirm before unifying.
        """
        return {
            'title': 'Python Kurs - Gymnasium Hartberg',
            'sections': [
                {
                    'title': '5. Klasse',
                    'level': 2,
                    'content': [
                        'Grundlagen der Programmierung mit Python',
                        'Variablen und Datentypen',
                        'Eingabe und Ausgabe',
                        'Bedingte Anweisungen',
                        'Schleifen',
                        'Listen und Dictionaries'
                    ],
                    'subsections': []
                },
                {
                    'title': '6. Klasse',
                    'level': 2,
                    'content': [
                        'Funktionen',
                        'Module und Pakete',
                        'Dateiverarbeitung',
                        'Fehlerbehandlung',
                        'Objektorientierte Programmierung'
                    ],
                    'subsections': []
                },
                {
                    'title': 'Objektorientierte Programmierung',
                    'level': 2,
                    'content': [
                        'Klassen und Objekte',
                        'Vererbung',
                        'Polymorphismus',
                        'Abstrakte Klassen'
                    ],
                    'subsections': []
                },
                {
                    'title': 'Grafische Oberflächen',
                    'level': 2,
                    'content': [
                        'Einführung in GUI-Programmierung',
                        'Tkinter Grundlagen',
                        'Event-Handling',
                        'Layout-Management'
                    ],
                    'subsections': []
                }
            ],
            'navigation': [
                {'title': '5. Klasse', 'level': 2, 'id': '5-klasse'},
                {'title': '6. Klasse', 'level': 2, 'id': '6-klasse'},
                {'title': 'Objektorientierte Programmierung', 'level': 2, 'id': 'objektorientierte-programmierung'},
                {'title': 'Grafische Oberflächen', 'level': 2, 'id': 'grafische-oberflaechen'}
            ]
        }