229 lines
9.5 KiB
Python
229 lines
9.5 KiB
Python
"""
Module for scraping and organizing Python course content.
"""
|
|
import re
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
|
|
|
|
|
|
class CourseScraper:
    """Scrapes Python course content from external sources.

    The class is a plain namespace: every method is a ``staticmethod`` and no
    instance state is kept.  The public entry point is
    :meth:`scrape_course_content`; the remaining methods are private helpers.
    """

    # Title stored under 'title' in every course structure returned here.
    COURSE_TITLE = 'Python Kurs - Gymnasium Hartberg'

    # (url, label) pairs scraped when the caller does not pass a URL.
    # The label is informational only; it is never written to the result.
    DEFAULT_COURSE_PAGES = (
        ('https://benschi11.github.io/python/class5.html', '5. Klasse'),
        ('https://benschi11.github.io/python/', 'Overview'),
    )

    # Settings for the HTTP request issued for every page.
    _REQUEST_TIMEOUT = 10
    _REQUEST_HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

    # Tags visited, in document order, while converting a page to markdown.
    # 'li' and 'code' are listed but have no branch of their own; they are
    # reached through their 'ul' / 'div' / 'pre' containers.
    _CONTENT_TAGS = ['h1', 'h2', 'h3', 'h4', 'p', 'ul', 'li', 'div', 'pre', 'code']

    # German characters are transliterated so slugs stay ASCII; this matches
    # the 'grafische-oberflaechen' id used by the built-in default course.
    _UMLAUTS = str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'})

    # (title, topics) outline used to build the fallback course.
    _DEFAULT_OUTLINE = (
        ('5. Klasse', (
            'Grundlagen der Programmierung mit Python',
            'Variablen und Datentypen',
            'Eingabe und Ausgabe',
            'Bedingte Anweisungen',
            'Schleifen',
            'Listen und Dictionaries',
        )),
        ('6. Klasse', (
            'Funktionen',
            'Module und Pakete',
            'Dateiverarbeitung',
            'Fehlerbehandlung',
            'Objektorientierte Programmierung',
        )),
        ('Objektorientierte Programmierung', (
            'Klassen und Objekte',
            'Vererbung',
            'Polymorphismus',
            'Abstrakte Klassen',
        )),
        ('Grafische Oberflächen', (
            'Einführung in GUI-Programmierung',
            'Tkinter Grundlagen',
            'Event-Handling',
            'Layout-Management',
        )),
    )

    @staticmethod
    def scrape_course_content(url: Optional[str] = None) -> Dict[str, Any]:
        """Scrape course pages and return them as a structured course.

        Args:
            url: Page to scrape.  When omitted (or empty), every page in
                ``DEFAULT_COURSE_PAGES`` is scraped instead.

        Returns:
            A dictionary with the keys ``'title'``, ``'sections'`` and
            ``'navigation'``.  Each section is a dict with ``'title'``,
            ``'level'`` (1 or 2, taken from the ``h1``/``h2`` tag),
            ``'markdown'`` and ``'id'``; ``'navigation'`` holds one
            ``{'title', 'level', 'id'}`` entry per section, in the same order.
            If no section could be scraped at all, the built-in default
            course from ``_get_default_course`` is returned.

        Scraping is best-effort: any exception raised while fetching or
        parsing a page is printed and that page is abandoned, keeping the
        sections already collected from it.
        """
        course_data: Dict[str, Any] = {
            'title': CourseScraper.COURSE_TITLE,
            'sections': [],
            'navigation': [],
        }

        if url:
            page_urls = [url]
        else:
            page_urls = [page for page, _label in CourseScraper.DEFAULT_COURSE_PAGES]

        for page_url in page_urls:
            try:
                main_content = CourseScraper._fetch_main_content(page_url)
                if main_content is None:
                    continue

                # Sections are appended as they are produced so that a
                # failure halfway through a page keeps the earlier ones.
                for section in CourseScraper._iter_sections(main_content):
                    course_data['sections'].append(section)
                    course_data['navigation'].append({
                        'title': section['title'],
                        'level': section['level'],
                        'id': section['id'],
                    })
            except Exception as e:
                # Deliberately broad: one broken page must not abort the rest.
                print(f"Error scraping {page_url}: {e}")
                continue

        # If no content was scraped, return default
        if not course_data['sections']:
            return CourseScraper._get_default_course()

        return course_data

    @staticmethod
    def _fetch_main_content(page_url: str) -> Any:
        """Download ``page_url`` and return its main content element.

        Returns the ``<main id="content">`` element if present, otherwise the
        first ``<main>``, otherwise ``<body>``; ``None`` when the document has
        none of them.  Raises whatever ``requests`` raises for network errors
        and for non-success HTTP status codes (``raise_for_status``).
        """
        response = requests.get(
            page_url,
            timeout=CourseScraper._REQUEST_TIMEOUT,
            headers=CourseScraper._REQUEST_HEADERS,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        return soup.find('main', {'id': 'content'}) or soup.find('main') or soup.find('body')

    @staticmethod
    def _iter_sections(main_content: Any):
        """Yield one section dict per ``h1``/``h2`` heading in ``main_content``.

        Elements are visited in document order.  An ``h1``/``h2`` closes the
        running section; if its text is non-empty and does not start with
        ``'Python Kurs'`` it also opens a new one.  Content that follows a
        skipped heading belongs to no section and is dropped.  Inside a
        section, ``h3``/``h4``, paragraphs, unordered lists and code blocks
        are rendered to markdown fragments which are joined with blank lines.
        """
        section: Optional[Dict[str, Any]] = None
        parts: List[str] = []
        # ids of elements whose text was already emitted through an ancestor,
        # so that nested content is not rendered a second time.
        consumed = set()

        for elem in main_content.find_all(CourseScraper._CONTENT_TAGS):
            if id(elem) in consumed:
                continue
            tag = elem.name

            if tag in ('h1', 'h2'):
                if section is not None:
                    section['markdown'] = '\n\n'.join(parts)
                    yield section
                    # Reset so a skipped heading cannot cause the same
                    # section to be emitted again later.
                    section, parts = None, []

                text = elem.get_text().strip()
                if text and not text.startswith('Python Kurs'):
                    level = int(tag[1])
                    section = {
                        'title': text,
                        'level': level,
                        'markdown': '',
                        'id': CourseScraper._slugify(text),
                    }
                    parts = [f"{'#' * level} {text}"]
                continue

            if section is None:
                # Nothing before the first accepted heading is recorded.
                continue

            if tag in ('h3', 'h4'):
                text = elem.get_text().strip()
                if text:
                    parts.append(f"\n{'#' * int(tag[1])} {text}")

            elif tag == 'p':
                text = elem.get_text().strip()
                # Paragraphs of five characters or fewer are skipped.
                if text and len(text) > 5:
                    parts.append(text)

            elif tag == 'ul':
                list_items = []
                for li in elem.find_all('li', recursive=False):
                    li_text = li.get_text().strip()
                    if li_text:
                        list_items.append(f"- {li_text}")
                if list_items:
                    parts.append('\n'.join(list_items))
                # Nested paragraphs and sub-lists are already contained in
                # the text of their list item.
                consumed.update(id(inner) for inner in elem.find_all(['p', 'ul']))

            elif tag == 'div':
                code_elem = elem.find('code', class_=CourseScraper._is_python_css_class)
                if code_elem is not None:
                    if code_elem.find_parent('pre') is not None:
                        # The 'pre' branch renders it at its own position.
                        continue
                    if id(code_elem) in consumed:
                        # An enclosing div already rendered this element.
                        continue
                    consumed.add(id(code_elem))
                    code_text = code_elem.get_text().strip()
                    if code_text:
                        parts.append(CourseScraper._python_fence(code_text))
                elif 'language-python' in str(elem.get('class', [])):
                    # The div itself is marked as Python: its whole text is
                    # the code block, so nothing inside it is rendered again.
                    code_text = elem.get_text().strip()
                    if code_text:
                        parts.append(CourseScraper._python_fence(code_text))
                    consumed.update(id(inner) for inner in elem.find_all(True))

            elif tag == 'pre':
                code_elem = elem.find('code')
                if code_elem is not None:
                    code_text = code_elem.get_text().strip()
                    if code_text:
                        # Every preformatted block is labelled as Python.
                        parts.append(CourseScraper._python_fence(code_text))

        if section is not None:
            section['markdown'] = '\n\n'.join(parts)
            yield section

    @staticmethod
    def _is_python_css_class(css_class: Optional[str]) -> bool:
        """Return True if a CSS class value marks Python code.

        BeautifulSoup calls a ``class_`` filter function with a string (or
        ``None`` for elements without a class), so the value is tested
        directly as a string.
        """
        return css_class is not None and 'language-python' in css_class

    @staticmethod
    def _python_fence(code_text: str) -> str:
        """Wrap ``code_text`` in a fenced markdown block tagged ``python``."""
        return f"```python\n{code_text}\n```"

    @staticmethod
    def _slugify(text: str) -> str:
        """Convert text to a URL-friendly slug.

        The text is lower-cased, German umlauts and ``ß`` are transliterated
        (``ä`` -> ``ae`` etc.), characters other than word characters,
        whitespace and hyphens are removed, and runs of whitespace/hyphens
        collapse to a single hyphen.  Leading and trailing hyphens are
        stripped, so the result may be empty.
        """
        text = text.lower().translate(CourseScraper._UMLAUTS)
        text = re.sub(r'[^\w\s-]', '', text)
        text = re.sub(r'[-\s]+', '-', text)
        return text.strip('-')

    @staticmethod
    def _get_default_course() -> Dict[str, Any]:
        """Get default course structure when scraping fails.

        Returns a freshly built dictionary on every call.  Each section keeps
        the ``'content'`` topic list and empty ``'subsections'`` list, and
        additionally carries the ``'id'`` and ``'markdown'`` keys that scraped
        sections have, so both kinds of section can be consumed the same way.
        """
        sections = []
        navigation = []
        for title, topics in CourseScraper._DEFAULT_OUTLINE:
            section_id = CourseScraper._slugify(title)
            bullet_list = '\n'.join(f"- {topic}" for topic in topics)
            sections.append({
                'title': title,
                'level': 2,
                'content': list(topics),
                'subsections': [],
                'markdown': '\n\n'.join([f"## {title}", bullet_list]),
                'id': section_id,
            })
            navigation.append({'title': title, 'level': 2, 'id': section_id})

        return {
            'title': CourseScraper.COURSE_TITLE,
            'sections': sections,
            'navigation': navigation,
        }
|
|
|