"""Module for scraping and organizing Python course content."""

import re
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup


class CourseScraper:
    """Scrapes Python course content from external sources."""

    @staticmethod
    def scrape_course_content(url: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape course content from a fixed list of course pages.

        Args:
            url: Base URL to scrape. Currently unused -- the hard-coded page
                list below is always scraped; the parameter is kept for
                interface compatibility.

        Returns:
            Dictionary with 'title', 'sections' and 'navigation' keys.
            Falls back to a static default course when nothing could be
            scraped (network failure, empty pages, etc.).
        """
        course_data: Dict[str, Any] = {
            'title': 'Python Kurs - Gymnasium Hartberg',
            'sections': [],
            'navigation': [],
        }

        # List of course pages to scrape (URL, human-readable label).
        course_pages = [
            ('https://benschi11.github.io/python/class5.html', '5. Klasse'),
            ('https://benschi11.github.io/python/', 'Overview'),
        ]

        for page_url, _section_title in course_pages:
            try:
                response = requests.get(page_url, timeout=10, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Prefer the page's <main id="content">, then any <main>,
                # then the whole <body>.
                main_content = (soup.find('main', {'id': 'content'})
                                or soup.find('main')
                                or soup.find('body'))
                if main_content is not None:
                    CourseScraper._extract_sections(main_content, course_data)
            except Exception as e:
                # Best-effort scraping: skip a failing page, keep the rest.
                print(f"Error scraping {page_url}: {e}")
                continue

        # If no content was scraped, return the static default course.
        if not course_data['sections']:
            return CourseScraper._get_default_course()
        return course_data

    @staticmethod
    def _extract_sections(main_content: Any, course_data: Dict[str, Any]) -> None:
        """
        Walk *main_content* in document order, building markdown sections.

        Completed sections (and matching navigation entries) are appended
        to ``course_data`` in place.
        """
        current_section: Optional[Dict[str, Any]] = None
        markdown_content: List[str] = []

        # Only tags that actually have a handling branch below.
        for elem in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'ul', 'div', 'pre']):
            tag = elem.name

            if tag in ('h1', 'h2'):
                # Close out the section collected so far.
                if current_section and markdown_content:
                    CourseScraper._save_section(course_data, current_section, markdown_content)
                # Reset state unconditionally so a skipped heading cannot
                # cause the previous (already saved) section to be appended
                # a second time by the final flush below.
                current_section = None
                markdown_content = []

                text = elem.get_text().strip()
                # Skip the page's own title heading.
                if text and not text.startswith('Python Kurs'):
                    level = int(tag[1])
                    current_section = {
                        'title': text,
                        'level': level,
                        'markdown': '',
                        'id': CourseScraper._slugify(text),
                    }
                    markdown_content = [f"{'#' * level} {text}"]
            elif current_section is None:
                # Ignore content that appears before the first real heading.
                continue
            elif tag == 'h3':
                text = elem.get_text().strip()
                if text:
                    markdown_content.append(f"\n### {text}")
            elif tag == 'h4':
                text = elem.get_text().strip()
                if text:
                    markdown_content.append(f"\n#### {text}")
            elif tag == 'p':
                text = elem.get_text().strip()
                # Skip trivially short paragraphs (likely layout artifacts).
                if text and len(text) > 5:
                    markdown_content.append(text)
            elif tag == 'ul':
                items = [f"- {li.get_text().strip()}"
                         for li in elem.find_all('li', recursive=False)
                         if li.get_text().strip()]
                if items:
                    markdown_content.append('\n'.join(items))
            elif tag == 'div':
                CourseScraper._append_div_code(elem, markdown_content)
            elif tag == 'pre':
                CourseScraper._append_pre_code(elem, markdown_content)

        # Flush the final section of the page.
        if current_section and markdown_content:
            CourseScraper._save_section(course_data, current_section, markdown_content)

    @staticmethod
    def _save_section(course_data: Dict[str, Any],
                      section: Dict[str, Any],
                      markdown_content: List[str]) -> None:
        """Finalize *section*'s markdown and append it plus its nav entry."""
        section['markdown'] = '\n\n'.join(markdown_content)
        course_data['sections'].append(section)
        course_data['navigation'].append({
            'title': section['title'],
            'level': section['level'],
            'id': section['id'],
        })

    @staticmethod
    def _append_div_code(elem: Any, markdown_content: List[str]) -> None:
        """Extract a Python code block from a <div>, if one is present."""
        # Either the div contains a <code class="... language-python ...">
        # child, or the div itself carries the language-python class.
        code_elem = elem.find('code', class_=lambda x: x and 'language-python' in ' '.join(x))
        if code_elem:
            code_text = code_elem.get_text().strip()
            if code_text:
                markdown_content.append(f"```python\n{code_text}\n```")
        elif 'language-python' in str(elem.get('class', [])):
            code_text = elem.get_text().strip()
            if code_text:
                markdown_content.append(f"```python\n{code_text}\n```")

    @staticmethod
    def _append_pre_code(elem: Any, markdown_content: List[str]) -> None:
        """Extract code from a <pre><code> pair, always fenced as python."""
        code_elem = elem.find('code')
        if code_elem:
            code_text = code_elem.get_text().strip()
            if code_text:
                # The original always emitted 'python' regardless of the
                # code element's class; preserved here.
                markdown_content.append(f"```python\n{code_text}\n```")

    @staticmethod
    def _slugify(text: str) -> str:
        """Convert text to a URL-friendly slug."""
        text = text.lower()
        text = re.sub(r'[^\w\s-]', '', text)
        text = re.sub(r'[-\s]+', '-', text)
        return text.strip('-')

    @staticmethod
    def _get_default_course() -> Dict[str, Any]:
        """Get the static default course structure when scraping fails."""
        return {
            'title': 'Python Kurs - Gymnasium Hartberg',
            'sections': [
                {
                    'title': '5. Klasse',
                    'level': 2,
                    'content': [
                        'Grundlagen der Programmierung mit Python',
                        'Variablen und Datentypen',
                        'Eingabe und Ausgabe',
                        'Bedingte Anweisungen',
                        'Schleifen',
                        'Listen und Dictionaries'
                    ],
                    'subsections': []
                },
                {
                    'title': '6. Klasse',
                    'level': 2,
                    'content': [
                        'Funktionen',
                        'Module und Pakete',
                        'Dateiverarbeitung',
                        'Fehlerbehandlung',
                        'Objektorientierte Programmierung'
                    ],
                    'subsections': []
                },
                {
                    'title': 'Objektorientierte Programmierung',
                    'level': 2,
                    'content': [
                        'Klassen und Objekte',
                        'Vererbung',
                        'Polymorphismus',
                        'Abstrakte Klassen'
                    ],
                    'subsections': []
                },
                {
                    'title': 'Grafische Oberflächen',
                    'level': 2,
                    'content': [
                        'Einführung in GUI-Programmierung',
                        'Tkinter Grundlagen',
                        'Event-Handling',
                        'Layout-Management'
                    ],
                    'subsections': []
                }
            ],
            'navigation': [
                {'title': '5. Klasse', 'level': 2, 'id': '5-klasse'},
                {'title': '6. Klasse', 'level': 2, 'id': '6-klasse'},
                {'title': 'Objektorientierte Programmierung', 'level': 2,
                 'id': 'objektorientierte-programmierung'},
                {'title': 'Grafische Oberflächen', 'level': 2,
                 'id': 'grafische-oberflaechen'}
            ]
        }