This repository has been archived on 2025-11-17. You can view files and clone it, but cannot push or open issues or pull requests.
Files
pypages/modules/course_scraper.py
2025-11-16 18:01:30 +01:00

229 lines
9.5 KiB
Python

"""
Module for scraping and organizing Python course content.
"""
import re
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
class CourseScraper:
    """Scrapes Python course content from external sources."""

    @staticmethod
    def scrape_course_content(url: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape course content from a fixed list of course pages.

        Args:
            url: Base URL to scrape. Currently unused — the method scrapes a
                hard-coded page list; parameter kept for interface
                compatibility.

        Returns:
            Dict with 'title', 'sections' (each: 'title', 'level',
            'markdown', 'id') and 'navigation'. Falls back to
            ``_get_default_course()`` when nothing could be scraped.
        """
        course_data: Dict[str, Any] = {
            'title': 'Python Kurs - Gymnasium Hartberg',
            'sections': [],
            'navigation': []
        }

        # Hard-coded list of course pages to scrape.
        course_pages = [
            ('https://benschi11.github.io/python/class5.html', '5. Klasse'),
            ('https://benschi11.github.io/python/', 'Overview')
        ]

        for page_url, _title in course_pages:
            try:
                response = requests.get(page_url, timeout=10, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Locate the main content container, with fallbacks.
                main_content = soup.find('main', {'id': 'content'}) or soup.find('main') or soup.find('body')
                if not main_content:
                    continue

                # Per-page accumulation state.
                markdown_content: List[str] = []
                current_section: Optional[Dict[str, Any]] = None
                section_id: Optional[str] = None

                # Walk elements in document order to rebuild markdown.
                # ('li' is handled inside the 'ul' branch; standalone 'code'
                # inside the 'div'/'pre' branches.)
                for elem in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'p', 'ul', 'div', 'pre']):
                    tag_name = elem.name

                    if tag_name in ('h1', 'h2'):
                        # Flush the previous section before starting a new one.
                        if current_section and markdown_content:
                            current_section['markdown'] = '\n\n'.join(markdown_content)
                            course_data['sections'].append(current_section)
                            course_data['navigation'].append({
                                'title': current_section['title'],
                                'level': current_section['level'],
                                'id': section_id
                            })
                        # BUG FIX: reset state after flushing. Previously, a
                        # skipped heading (site title) left current_section
                        # pointing at the flushed section, so the end-of-page
                        # save appended it a second time.
                        current_section = None
                        markdown_content = []

                        text = elem.get_text().strip()
                        # Skip the site-title heading itself.
                        if text and not text.startswith('Python Kurs'):
                            section_id = CourseScraper._slugify(text)
                            current_section = {
                                'title': text,
                                'level': int(tag_name[1]),
                                'markdown': '',
                                'id': section_id
                            }
                            markdown_content = [f"{'#' * int(tag_name[1])} {text}"]
                    elif tag_name == 'h3' and current_section:
                        # Subsection heading.
                        text = elem.get_text().strip()
                        if text:
                            markdown_content.append(f"\n### {text}")
                    elif tag_name == 'h4' and current_section:
                        # Sub-subsection heading.
                        text = elem.get_text().strip()
                        if text:
                            markdown_content.append(f"\n#### {text}")
                    elif current_section and tag_name == 'p':
                        # Paragraph; skip trivially short fragments.
                        text = elem.get_text().strip()
                        if text and len(text) > 5:
                            markdown_content.append(text)
                    elif current_section and tag_name == 'ul':
                        # Unordered list — direct children only, to avoid
                        # pulling items out of nested lists twice.
                        list_items = []
                        for li in elem.find_all('li', recursive=False):
                            li_text = li.get_text().strip()
                            if li_text:
                                list_items.append(f"- {li_text}")
                        if list_items:
                            markdown_content.append('\n'.join(list_items))
                    elif current_section and tag_name == 'div':
                        # Code block wrapped in a div: either a nested
                        # <code class="language-python"> or the class on the
                        # div itself.
                        code_elem = elem.find('code', class_=lambda x: x and 'language-python' in ' '.join(x))
                        if code_elem:
                            code_text = code_elem.get_text().strip()
                            if code_text:
                                markdown_content.append(f"```python\n{code_text}\n```")
                        elif 'language-python' in str(elem.get('class', [])):
                            code_text = elem.get_text().strip()
                            if code_text:
                                markdown_content.append(f"```python\n{code_text}\n```")
                    elif current_section and tag_name == 'pre':
                        # Preformatted code. All course samples are Python,
                        # so the fence language is always 'python' (the
                        # original's class check could only ever yield that).
                        code_elem = elem.find('code')
                        if code_elem:
                            code_text = code_elem.get_text().strip()
                            if code_text:
                                markdown_content.append(f"```python\n{code_text}\n```")

                # Save the trailing section of the page, if any.
                if current_section:
                    if markdown_content:
                        current_section['markdown'] = '\n\n'.join(markdown_content)
                    course_data['sections'].append(current_section)
                    course_data['navigation'].append({
                        'title': current_section['title'],
                        'level': current_section['level'],
                        'id': section_id
                    })
            except Exception as e:
                # Best-effort scrape: log and move on to the next page.
                print(f"Error scraping {page_url}: {e}")
                continue

        # If no content was scraped, return the static fallback.
        if not course_data['sections']:
            return CourseScraper._get_default_course()
        return course_data

    @staticmethod
    def _slugify(text: str) -> str:
        """Convert text to a URL-friendly slug (lowercase, hyphen-separated)."""
        text = text.lower()
        # Drop everything except word chars, whitespace and hyphens.
        text = re.sub(r'[^\w\s-]', '', text)
        # Collapse runs of whitespace/hyphens into a single hyphen.
        text = re.sub(r'[-\s]+', '-', text)
        return text.strip('-')

    @staticmethod
    def _get_default_course() -> Dict[str, Any]:
        """Return the static fallback course structure used when scraping fails.

        NOTE: fallback sections carry a 'content' list instead of the
        'markdown' string produced by scraping — callers must handle both.
        """
        return {
            'title': 'Python Kurs - Gymnasium Hartberg',
            'sections': [
                {
                    'title': '5. Klasse',
                    'level': 2,
                    'content': [
                        'Grundlagen der Programmierung mit Python',
                        'Variablen und Datentypen',
                        'Eingabe und Ausgabe',
                        'Bedingte Anweisungen',
                        'Schleifen',
                        'Listen und Dictionaries'
                    ],
                    'subsections': []
                },
                {
                    'title': '6. Klasse',
                    'level': 2,
                    'content': [
                        'Funktionen',
                        'Module und Pakete',
                        'Dateiverarbeitung',
                        'Fehlerbehandlung',
                        'Objektorientierte Programmierung'
                    ],
                    'subsections': []
                },
                {
                    'title': 'Objektorientierte Programmierung',
                    'level': 2,
                    'content': [
                        'Klassen und Objekte',
                        'Vererbung',
                        'Polymorphismus',
                        'Abstrakte Klassen'
                    ],
                    'subsections': []
                },
                {
                    'title': 'Grafische Oberflächen',
                    'level': 2,
                    'content': [
                        'Einführung in GUI-Programmierung',
                        'Tkinter Grundlagen',
                        'Event-Handling',
                        'Layout-Management'
                    ],
                    'subsections': []
                }
            ],
            'navigation': [
                {'title': '5. Klasse', 'level': 2, 'id': '5-klasse'},
                {'title': '6. Klasse', 'level': 2, 'id': '6-klasse'},
                {'title': 'Objektorientierte Programmierung', 'level': 2, 'id': 'objektorientierte-programmierung'},
                {'title': 'Grafische Oberflächen', 'level': 2, 'id': 'grafische-oberflaechen'}
            ]
        }