initial
This commit is contained in:
228
modules/course_scraper.py
Normal file
228
modules/course_scraper.py
Normal file
@@ -0,0 +1,228 @@
|
||||
"""
|
||||
Module for scraping and organizing Python course content.
|
||||
"""
|
||||
import re
from typing import Any, Dict, List, Optional

import requests
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
class CourseScraper:
    """Scrapes Python course content from external sources.

    All methods are static; the class only serves as a namespace.
    """

    # Pages fetched when the caller does not pass an explicit URL.
    _DEFAULT_PAGES = (
        'https://benschi11.github.io/python/class5.html',  # 5. Klasse
        'https://benschi11.github.io/python/',  # Overview
    )

    # German letters are transliterated so that generated slugs stay ASCII
    # and agree with the hand-written ids of the default course
    # (e.g. 'grafische-oberflaechen').
    _UMLAUT_MAP = str.maketrans({'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss'})

    @staticmethod
    def scrape_course_content(url: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape course content from URLs.

        Args:
            url: Page to scrape (optional). When omitted or empty, the
                built-in default course pages are scraped instead.

        Returns:
            Dictionary with the keys 'title', 'sections' and 'navigation'.
            Each section carries 'title', 'level', 'markdown' and 'id'.
            If no section could be scraped from any page, the static
            default course from ``_get_default_course`` is returned.
        """
        course_data = {
            'title': 'Python Kurs - Gymnasium Hartberg',
            'sections': [],
            'navigation': []
        }

        page_urls = [url] if url else list(CourseScraper._DEFAULT_PAGES)

        for page_url in page_urls:
            # Scraping is best effort: a failing page is reported and
            # skipped so the remaining pages (or the default course) are
            # still served.
            try:
                response = requests.get(page_url, timeout=10, headers={
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                })
                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'html.parser')

                # Prefer the dedicated content container, then fall back
                # to progressively broader containers.
                main_content = (
                    soup.find('main', {'id': 'content'})
                    or soup.find('main')
                    or soup.find('body')
                )
                if not main_content:
                    continue

                CourseScraper._collect_sections(main_content, course_data)
            except Exception as e:
                print(f"Error scraping {page_url}: {e}")
                continue

        # If no content was scraped, return default
        if not course_data['sections']:
            return CourseScraper._get_default_course()

        return course_data

    @staticmethod
    def _collect_sections(main_content, course_data: Dict[str, Any]) -> None:
        """Walk *main_content* in document order and append its sections.

        Every h1/h2 heading starts a new section; the headings, paragraphs,
        lists and code blocks that follow are rendered as markdown into it.
        Finished sections are appended to ``course_data['sections']`` and
        mirrored in ``course_data['navigation']``.

        Args:
            main_content: Parsed HTML element whose descendants are scanned
                (must provide ``find_all``).
            course_data: Result dictionary that is extended in place.
        """
        current_section: Optional[Dict[str, Any]] = None
        markdown_content: List[str] = []
        # id()s of elements whose text was already emitted as part of an
        # enclosing code container. Identity is used because parsed tags
        # with identical markup may compare equal to each other.
        emitted_ids = set()

        for elem in main_content.find_all(
                ['h1', 'h2', 'h3', 'h4', 'p', 'ul', 'div', 'pre']):
            tag_name = elem.name

            if tag_name in ('h1', 'h2'):
                # Close the previous section and reset the state, so that a
                # skipped heading can neither extend nor re-append it.
                CourseScraper._store_section(
                    course_data, current_section, markdown_content)
                current_section = None
                markdown_content = []

                text = elem.get_text().strip()
                # Headings starting with 'Python Kurs' are page titles,
                # not course sections.
                if text and not text.startswith('Python Kurs'):
                    level = int(tag_name[1])
                    current_section = {
                        'title': text,
                        'level': level,
                        'markdown': '',
                        'id': CourseScraper._slugify(text)
                    }
                    markdown_content = [f"{'#' * level} {text}"]
                continue

            # Content in front of the first accepted heading is ignored.
            if current_section is None:
                continue

            if tag_name in ('h3', 'h4'):
                # Subsection / sub-subsection heading
                text = elem.get_text().strip()
                if text:
                    markdown_content.append(
                        f"\n{'#' * int(tag_name[1])} {text}")

            elif tag_name == 'p':
                # Paragraphs of five characters or fewer are dropped.
                text = elem.get_text().strip()
                if len(text) > 5:
                    markdown_content.append(text)

            elif tag_name == 'ul':
                # Unordered list: only direct children become bullets.
                list_items = []
                for li in elem.find_all('li', recursive=False):
                    li_text = li.get_text().strip()
                    if li_text:
                        list_items.append(f"- {li_text}")
                if list_items:
                    markdown_content.append('\n'.join(list_items))

            elif tag_name == 'div':
                # A div tagged 'language-python' wraps one highlighted
                # code block; its text is the code.
                if id(elem) in emitted_ids:
                    continue
                if CourseScraper._has_python_class(elem):
                    code_text = elem.get_text().strip()
                    if code_text:
                        markdown_content.append(
                            f"```python\n{code_text}\n```")
                        # The nested div/pre elements are visited later by
                        # this loop; mark them so the same code is not
                        # emitted a second time.
                        emitted_ids.update(
                            id(inner)
                            for inner in elem.find_all(['div', 'pre']))

            elif tag_name == 'pre':
                # Preformatted code that is not inside an already emitted
                # container. It is always labelled as Python.
                if id(elem) in emitted_ids:
                    continue
                code_elem = elem.find('code')
                if code_elem is not None:
                    code_text = code_elem.get_text().strip()
                    if code_text:
                        markdown_content.append(
                            f"```python\n{code_text}\n```")

        # Save last section
        CourseScraper._store_section(
            course_data, current_section, markdown_content)

    @staticmethod
    def _store_section(course_data: Dict[str, Any],
                       section: Optional[Dict[str, Any]],
                       markdown_parts: List[str]) -> None:
        """Finalize *section* and append it to *course_data*.

        Does nothing when there is no open section or no markdown for it.
        """
        if not section or not markdown_parts:
            return
        section['markdown'] = '\n\n'.join(markdown_parts)
        course_data['sections'].append(section)
        course_data['navigation'].append({
            'title': section['title'],
            'level': section['level'],
            'id': section['id']
        })

    @staticmethod
    def _has_python_class(tag) -> bool:
        """Return True if any CSS class of *tag* contains 'language-python'."""
        classes = tag.get('class') or []
        if isinstance(classes, str):
            classes = classes.split()
        return any('language-python' in css_class for css_class in classes)

    @staticmethod
    def _slugify(text: str) -> str:
        """Convert text to URL-friendly slug.

        The text is lower-cased, German umlauts and 'ß' are transliterated
        (ä -> ae, ö -> oe, ü -> ue, ß -> ss), punctuation is removed and
        runs of whitespace/hyphens collapse into a single hyphen.
        """
        text = text.lower().translate(CourseScraper._UMLAUT_MAP)
        text = re.sub(r'[^\w\s-]', '', text)
        text = re.sub(r'[-\s]+', '-', text)
        return text.strip('-')

    @staticmethod
    def _get_default_course() -> Dict[str, Any]:
        """Get default course structure when scraping fails.

        Sections carry the same 'title', 'level', 'id' and 'markdown' keys
        as scraped sections, plus the topic list under 'content' and an
        empty 'subsections' list.
        """
        outline = [
            ('5. Klasse', [
                'Grundlagen der Programmierung mit Python',
                'Variablen und Datentypen',
                'Eingabe und Ausgabe',
                'Bedingte Anweisungen',
                'Schleifen',
                'Listen und Dictionaries'
            ]),
            ('6. Klasse', [
                'Funktionen',
                'Module und Pakete',
                'Dateiverarbeitung',
                'Fehlerbehandlung',
                'Objektorientierte Programmierung'
            ]),
            ('Objektorientierte Programmierung', [
                'Klassen und Objekte',
                'Vererbung',
                'Polymorphismus',
                'Abstrakte Klassen'
            ]),
            ('Grafische Oberflächen', [
                'Einführung in GUI-Programmierung',
                'Tkinter Grundlagen',
                'Event-Handling',
                'Layout-Management'
            ])
        ]

        sections = []
        navigation = []
        for title, topics in outline:
            section_id = CourseScraper._slugify(title)
            bullets = '\n'.join(f"- {topic}" for topic in topics)
            sections.append({
                'title': title,
                'level': 2,
                'id': section_id,
                'markdown': f"## {title}\n\n{bullets}",
                'content': topics,
                'subsections': []
            })
            navigation.append({'title': title, 'level': 2, 'id': section_id})

        return {
            'title': 'Python Kurs - Gymnasium Hartberg',
            'sections': sections,
            'navigation': navigation
        }
|
||||
|
||||
Reference in New Issue
Block a user