initial

2025-11-16 18:01:30 +01:00
commit 858003cb0b
26 changed files with 4712 additions and 0 deletions
--- a/modules/course_scraper.py
+++ b/modules/course_scraper.py
@@ -0,0 +1,228 @@
+"""
+Module for scraping and organizing Python course content.
+"""
+import requests
+from bs4 import BeautifulSoup
+from typing import Dict, List, Optional
+import re
+
+
+class CourseScraper:
+    """Scrape the Python course pages and expose them as markdown sections.
+
+    Every method is static, so the class is used purely as a namespace.
+    """
+
+    _TITLE = 'Python Kurs - Gymnasium Hartberg'
+
+    # (url, label) pairs; the label is informational and not part of the output.
+    _COURSE_PAGES = [
+        ('https://benschi11.github.io/python/class5.html', '5. Klasse'),
+        ('https://benschi11.github.io/python/', 'Overview'),
+    ]
+
+    # Characters that _slugify spells out, so German titles yield ids without umlauts.
+    _TRANSLITERATION = {'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'}
+
+    # Fallback course as (section title, topics), used when nothing was scraped.
+    _DEFAULT_TOPICS = [
+        ('5. Klasse', [
+            'Grundlagen der Programmierung mit Python',
+            'Variablen und Datentypen',
+            'Eingabe und Ausgabe',
+            'Bedingte Anweisungen',
+            'Schleifen',
+            'Listen und Dictionaries',
+        ]),
+        ('6. Klasse', [
+            'Funktionen',
+            'Module und Pakete',
+            'Dateiverarbeitung',
+            'Fehlerbehandlung',
+            'Objektorientierte Programmierung',
+        ]),
+        ('Objektorientierte Programmierung', [
+            'Klassen und Objekte',
+            'Vererbung',
+            'Polymorphismus',
+            'Abstrakte Klassen',
+        ]),
+        ('Grafische Oberflächen', [
+            'Einführung in GUI-Programmierung',
+            'Tkinter Grundlagen',
+            'Event-Handling',
+            'Layout-Management',
+        ]),
+    ]
+
+    @staticmethod
+    def scrape_course_content(url: Optional[str] = None) -> Dict[str, object]:
+        """
+        Scrape the course pages and return them as markdown sections.
+
+        Every h1/h2 heading starts a new section; the h3/h4 headings,
+        paragraphs, lists and code blocks that follow it are collected into
+        that section's markdown. A page that cannot be fetched or parsed is
+        reported on stdout and skipped.
+
+        Args:
+            url: Accepted for backward compatibility. The value is currently
+                ignored; the fixed list of course pages is always scraped.
+
+        Returns:
+            Dictionary with 'title', 'sections' (dicts with 'title', 'level',
+            'markdown' and 'id') and 'navigation' (dicts with 'title', 'level'
+            and 'id'). If no section could be scraped, the built-in default
+            course is returned instead.
+        """
+        course_data = {'title': CourseScraper._TITLE, 'sections': [], 'navigation': []}
+
+        for page_url, _label in CourseScraper._COURSE_PAGES:
+            try:
+                response = requests.get(page_url, timeout=10, headers={
+                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+                })
+                response.raise_for_status()
+                soup = BeautifulSoup(response.content, 'html.parser')
+
+                # Prefer the dedicated content container, then fall back.
+                main_content = soup.find('main', {'id': 'content'}) or soup.find('main') or soup.find('body')
+                if not main_content:
+                    continue
+
+                # Sections are stored as they are produced, so those completed
+                # before a parsing error are kept.
+                for section in CourseScraper._iter_sections(main_content):
+                    course_data['sections'].append(section)
+                    course_data['navigation'].append({key: section[key] for key in ('title', 'level', 'id')})
+            except Exception as e:
+                # Best effort: a broken page must not abort the remaining ones.
+                print(f"Error scraping {page_url}: {e}")
+                continue
+
+        # If no content was scraped, return default
+        if not course_data['sections']:
+            return CourseScraper._get_default_course()
+
+        return course_data
+
+    @staticmethod
+    def _iter_sections(main_content):
+        """
+        Yield one section dict per non-empty h1/h2 heading in main_content.
+
+        Args:
+            main_content: BeautifulSoup element whose descendants are walked
+                in document order.
+
+        Yields:
+            Dicts with 'title', 'level', 'markdown' and 'id'. Content that
+            precedes the first section heading is ignored.
+        """
+        section = None
+        parts = []
+        rendered_code = set()  # id() of <code> elements already emitted
+
+        for elem in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'ul', 'div', 'pre']):
+            tag = elem.name
+            if tag in ('h1', 'h2'):
+                text = elem.get_text().strip()
+                # Empty headings and the page title do not open a section. The
+                # running section stays open, so it is emitted exactly once.
+                if not text or text.startswith('Python Kurs'):
+                    continue
+                if section is not None:
+                    section['markdown'] = '\n\n'.join(parts)
+                    yield section
+                level = int(tag[1])
+                section = {'title': text, 'level': level, 'markdown': '', 'id': CourseScraper._slugify(text)}
+                parts = [f"{'#' * level} {text}"]
+                continue
+
+            if section is None:
+                continue
+
+            if tag in ('h3', 'h4'):
+                text = elem.get_text().strip()
+                if text:
+                    parts.append(f"\n{'#' * int(tag[1])} {text}")
+            elif tag in ('p', 'ul') and elem.find_parent('ul') is not None:
+                # Already part of the text of an enclosing list item.
+                continue
+            elif tag == 'p':
+                text = elem.get_text().strip()
+                # Very short paragraphs (five characters or fewer) are dropped.
+                if len(text) > 5:
+                    parts.append(text)
+            elif tag == 'ul':
+                items = [li.get_text().strip() for li in elem.find_all('li', recursive=False)]
+                bullets = [f"- {item}" for item in items if item]
+                if bullets:
+                    parts.append('\n'.join(bullets))
+            elif tag == 'div':
+                if elem.find('pre') is not None:
+                    continue  # the nested <pre> is rendered when it is visited
+                code_elem = elem.find('code', class_=CourseScraper._is_python_class)
+                if code_elem is not None:
+                    if id(code_elem) in rendered_code:
+                        continue  # an enclosing <div> already emitted it
+                    rendered_code.add(id(code_elem))
+                    code_text = code_elem.get_text().strip()
+                elif 'language-python' in str(elem.get('class', [])):
+                    code_text = elem.get_text().strip()
+                else:
+                    continue
+                if code_text:
+                    parts.append(f"```python\n{code_text}\n```")
+            elif tag == 'pre':
+                code_elem = elem.find('code')
+                code_text = code_elem.get_text().strip() if code_elem else ''
+                if code_text:
+                    parts.append(f"```python\n{code_text}\n```")
+
+        if section is not None:
+            section['markdown'] = '\n\n'.join(parts)
+            yield section
+
+    @staticmethod
+    def _is_python_class(value) -> bool:
+        """Return True if a class attribute value mentions 'language-python'."""
+        if not value:
+            return False
+        # BeautifulSoup hands attribute filters a string; accept lists as well.
+        if not isinstance(value, str):
+            value = ' '.join(value)
+        return 'language-python' in value
+
+    @staticmethod
+    def _slugify(text: str) -> str:
+        """Convert text to a URL-friendly slug; umlauts and 'ß' are spelled out."""
+        text = text.lower()
+        for char, replacement in CourseScraper._TRANSLITERATION.items():
+            text = text.replace(char, replacement)
+        text = re.sub(r'[^\w\s-]', '', text)
+        text = re.sub(r'[-\s]+', '-', text)
+        return text.strip('-')
+
+    @staticmethod
+    def _get_default_course() -> Dict[str, object]:
+        """Get default course structure when scraping fails.
+
+        Sections carry the same 'id' and 'markdown' keys as scraped ones.
+        """
+        sections = []
+        navigation = []
+        for title, topics in CourseScraper._DEFAULT_TOPICS:
+            section_id = CourseScraper._slugify(title)
+            bullets = '\n'.join(f"- {topic}" for topic in topics)
+            sections.append({
+                'title': title,
+                'level': 2,
+                'id': section_id,
+                'markdown': f"## {title}\n\n{bullets}",
+                'content': list(topics),
+                'subsections': []
+            })
+            navigation.append({'title': title, 'level': 2, 'id': section_id})
+        return {'title': CourseScraper._TITLE, 'sections': sections, 'navigation': navigation}
+