this works fine for now
.gitignore (vendored), 40 lines changed

@@ -3,9 +3,6 @@
 # will have compiled files and executables
 debug/
 target/
-
-# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
-# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 Cargo.lock
 
 # These are backup files generated by rustfmt
@@ -15,16 +12,43 @@ Cargo.lock
 *.pdb
 
 # ---> JupyterNotebooks
-# gitignore template for Jupyter Notebooks
-# website: http://jupyter.org/
 
 .ipynb_checkpoints
 */.ipynb_checkpoints/*
 
-# IPython
 profile_default/
 ipython_config.py
 
 # Remove previous ipynb_checkpoints
 # git rm -r .ipynb_checkpoints/
 
+# === Cursor IDE ===
+.cursor.rules.yaml
+.cursor/
+cursor.log
+cursor.db
+cursor.history
+cursor.workspace
+.cursor_tmp/
+
+# === Saved scraped sites ===
+saved_site/
+saved_site/*
+
+# === Editor/system junk ===
+*.swp
+*.swo
+*.bak
+*.tmp
+*.log
+.DS_Store
+*~
+Thumbs.db
+ehthumbs.db
+Desktop.ini
+
+# === VSCode (if used ; we only use AI in this household) ===
+.vscode/
+
+# === Node/npm junk (just in case frontend is added) ===
+node_modules/
+dist/
+cursor.rules.yaml

Cargo.toml (new file), 14 lines

@@ -0,0 +1,14 @@
[package]
name = "prawn"
version = "0.1.0"
edition = "2024"

[dependencies]
reqwest = { version = "0.12", features = ["json", "blocking", "cookies", "gzip", "brotli", "deflate", "stream"] }
tokio = { version = "1.37", features = ["full"] }
scraper = "0.18"
rayon = "1.10"
url = "2.5"
futures = "0.3"
tui = "0.19"
crossterm = "0.27"
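
The `reqwest` feature list above enables cookie handling and gzip/brotli/deflate decompression. As a rough illustration of how those features surface at the call site, a client can be configured through `reqwest::ClientBuilder`; `build_client` and the exact settings here are illustrative assumptions, not code from this commit:

```
use std::time::Duration;
use reqwest::Client;

// Illustrative only: a client wired up to the Cargo.toml features above.
// `build_client` is a hypothetical helper, not part of src/main.rs.
fn build_client() -> reqwest::Result<Client> {
    Client::builder()
        .cookie_store(true)               // needs the "cookies" feature
        .gzip(true)                       // transparent gzip decoding
        .brotli(true)                     // transparent brotli decoding
        .deflate(true)                    // transparent deflate decoding
        .timeout(Duration::from_secs(30)) // not feature-gated; just a sensible default
        .build()
}
```

The commit's own main.rs builds its client with a bare `Client::builder().build()`; with these features compiled in, automatic decompression is available out of the box, while the cookie store has to be opted into explicitly as shown.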

README.md, 46 lines changed

@@ -1,3 +1,47 @@
 # prawn
 
-prawncrawler
+<!--
+Logo Image
+Sadly I can't do my cool styling for the div :C
+-->
+<div
+style = "
+display: flex;
+justify-content: center;
+">
+<img
+src = "assets/logo.png"
+alt = "logo"
+style = "width:50%"
+/>
+</div>
+
+
+prawn is an extremely fast Rust web scraper that downloads a webpage's HTML and all linked CSS and JS resources, saving them into a local folder for offline use.
+
+## Features
+- High-performance: Uses `reqwest` (with connection pooling), `tokio`, and `rayon` for parallelism.
+- CLI tool: Accepts a URL as an argument.
+- Downloads and parses HTML as fast as possible.
+- Extracts and concurrently downloads all `<link rel="stylesheet">` and `<script src="...">` resources.
+- Rewrites the HTML to point to the local files and saves it as `saved_site/index.html`.
+- All CSS and JS files are saved into `saved_site/css/` and `saved_site/js/` respectively.
+
+## Usage
+
+```
+cargo run -- https://example.com
+```
+
+This will download the HTML, CSS, and JS concurrently and save them to `./saved_site/` within seconds.
+
+## Constraints
+- Uses async Rust (`tokio`) for HTTP I/O and `rayon` or `futures` for concurrent downloads.
+- Uses `scraper` for fast DOM-like parsing.
+- No GUI dependencies or headless browsers (pure HTTP and HTML/CSS/JS).
+- Avoids unsafe code unless absolutely justified and documented.
+- Minimizes unnecessary allocations or cloning.
+
+## License
+MIT
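
The Features and Constraints sections above describe the core flow: fetch a page, pull out the `<link rel="stylesheet">` and `<script src>` URLs, and download them concurrently over async I/O. Before the full src/main.rs listing below, here is a minimal sketch of that flow against the same crates (`reqwest`, `scraper`, `url`, `futures`); the function name `fetch_assets` and its return type are illustrative, and file writing and error reporting are deliberately left out:

```
use futures::future::join_all;
use reqwest::Client;
use scraper::{Html, Selector};
use url::Url;

// Sketch only: fetch one page, resolve its stylesheet/script URLs against
// the page URL, and download every asset concurrently on a shared client.
async fn fetch_assets(client: &Client, page: &Url) -> reqwest::Result<Vec<Vec<u8>>> {
    let html = client.get(page.clone()).send().await?.text().await?;

    // Collect URLs in an inner block so the non-Send `Html` document is
    // dropped before the downloads are awaited.
    let urls: Vec<Url> = {
        let doc = Html::parse_document(&html);
        let css = Selector::parse("link[rel=stylesheet]").unwrap();
        let js = Selector::parse("script[src]").unwrap();
        doc.select(&css).filter_map(|e| e.value().attr("href"))
            .chain(doc.select(&js).filter_map(|e| e.value().attr("src")))
            .filter_map(|link| page.join(link).ok())
            .collect()
    };

    // One future per asset; `join_all` drives them to completion together.
    let downloads = urls.into_iter().map(|u| {
        let client = client.clone();
        async move { client.get(u).send().await?.bytes().await.map(|b| b.to_vec()) }
    });
    join_all(downloads).await.into_iter().collect()
}
```

Note that the actual implementation below constructs a fresh `Client::new()` inside each download future; reusing one cloned client, as in this sketch, is what lets reqwest's connection pooling help.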

assets/logo.png (new file, binary, not shown; size 1.8 MiB)

src/main.rs (new file), 301 lines

@@ -0,0 +1,301 @@
use std::fs::{self, File};
use std::io::Write;
use std::path::Path;
use std::collections::HashSet;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio::task;
use rayon::prelude::*;
use url::Url;
use futures;
use std::pin::Pin;
use futures::Future;
use std::sync::{Arc, Mutex};
use std::thread;
use tui::Terminal;
use tui::backend::CrosstermBackend;
use tui::widgets::{Block, Borders, Paragraph};
use tui::layout::{Layout, Constraint, Direction};
use tui::text::{Span, Spans};
use crossterm::event::{self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode};
use crossterm::terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen};
use crossterm::execute;

const SPINNER_FRAMES: &[&str] = &["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];

#[derive(Default)]
struct Status {
    current_url: String,
    pages_visited: usize,
    done: bool,
    spinner_idx: usize,
}

fn run_tui(status: Arc<Mutex<Status>>) {
    enable_raw_mode().unwrap();
    let mut stdout = std::io::stdout();
    execute!(stdout, EnterAlternateScreen, EnableMouseCapture).unwrap();
    let backend = CrosstermBackend::new(stdout);
    let mut terminal = Terminal::new(backend).unwrap();

    let mut frame_count = 0;

    loop {
        let mut s = status.lock().unwrap();
        // Animate spinner
        if !s.done {
            s.spinner_idx = (s.spinner_idx + 1) % SPINNER_FRAMES.len();
        }
        let spinner = SPINNER_FRAMES[s.spinner_idx];
        terminal.draw(|f| {
            let size = f.size();
            // Layout: center box and status bar at bottom
            let layout = Layout::default()
                .direction(Direction::Vertical)
                .margin(2)
                .constraints([
                    Constraint::Percentage(40),
                    Constraint::Length(9), // main box
                    Constraint::Percentage(40),
                    Constraint::Length(1), // status bar
                ]);
            let chunks = layout.split(size);

            // Centered main status box
            let block = Block::default().title(Span::styled(
                format!(" {} Prawn Web Scraper {} ", spinner, spinner),
                tui::style::Style::default().fg(tui::style::Color::Cyan).add_modifier(tui::style::Modifier::BOLD),
            )).borders(Borders::ALL);
            let text = vec![
                Spans::from(vec![Span::styled(
                    format!("Pages visited: {}", s.pages_visited),
                    tui::style::Style::default().fg(tui::style::Color::Green).add_modifier(tui::style::Modifier::BOLD),
                )]),
                Spans::from(vec![Span::raw("")]),
                Spans::from(vec![Span::styled(
                    "Current URL:",
                    tui::style::Style::default().fg(tui::style::Color::Yellow),
                )]),
                Spans::from(vec![Span::styled(
                    format!("{}", s.current_url),
                    tui::style::Style::default().fg(tui::style::Color::White),
                )]),
                Spans::from(vec![Span::raw("")]),
                Spans::from(vec![Span::styled(
                    if s.done {"Done! Press Ctrl+C to exit."} else {"Scraping..."},
                    tui::style::Style::default().fg(if s.done {tui::style::Color::Green} else {tui::style::Color::Blue}).add_modifier(tui::style::Modifier::BOLD),
                )]),
            ];
            let paragraph = Paragraph::new(text).block(block).alignment(tui::layout::Alignment::Center);
            f.render_widget(paragraph, chunks[1]);

            // Status bar at the bottom
            let status_bar = Paragraph::new(Spans::from(vec![
                Span::styled(
                    format!(" {} Prawn scraping in progress... ", spinner),
                    tui::style::Style::default().fg(tui::style::Color::Magenta).add_modifier(tui::style::Modifier::BOLD),
                ),
                Span::raw(" | Press Ctrl+C to quit "),
            ]))
            .block(Block::default().borders(Borders::NONE))
            .alignment(tui::layout::Alignment::Center);
            f.render_widget(status_bar, chunks[3]);
        }).unwrap();
        if s.done {
            break;
        }
        drop(s);
        std::thread::sleep(std::time::Duration::from_millis(80));
    }
    disable_raw_mode().unwrap();
    let mut stdout = std::io::stdout();
    execute!(stdout, LeaveAlternateScreen, DisableMouseCapture).unwrap();
}

#[tokio::main]
async fn main() {
    let args: Vec<String> = std::env::args().collect();
    if args.len() != 2 {
        eprintln!("Usage: cargo run -- <URL>");
        std::process::exit(1);
    }
    let mut url = args[1].clone();
    if !url.starts_with("http://") && !url.starts_with("https://") {
        url = format!("https://{}", url);
    }
    let base_url = Url::parse(&url).expect("Invalid URL");
    let client = Client::builder().build().unwrap();

    // Prepare output directories
    fs::create_dir_all("saved_site/css").unwrap();
    fs::create_dir_all("saved_site/js").unwrap();

    // Shared status for TUI
    let status = Arc::new(Mutex::new(Status::default()));
    let status_tui = status.clone();

    // Start TUI in a separate thread
    let tui_handle = thread::spawn(move || {
        run_tui(status_tui);
    });

    // Start recursive scraping
    let mut visited = HashSet::new();
    scrape_page(&client, &base_url, &base_url, "saved_site", &mut visited, 0, 2, &status).await;

    // Wait for TUI to finish (send quit signal)
    {
        let mut s = status.lock().unwrap();
        s.done = true;
    }
    tui_handle.join().unwrap();
    println!("Site saved to ./saved_site/");
}

fn scrape_page<'a>(
    client: &'a Client,
    base_url: &'a Url,
    page_url: &'a Url,
    save_dir: &'a str,
    visited: &'a mut HashSet<String>,
    depth: usize,
    max_depth: usize,
    status: &'a Arc<Mutex<Status>>,
) -> Pin<Box<dyn Future<Output = ()> + 'a>> {
    Box::pin(async move {
        {
            let mut s = status.lock().unwrap();
            s.current_url = page_url.as_str().to_string();
        }
        if depth > max_depth {
            return;
        }
        let url_str = page_url.as_str().to_string();
        if visited.contains(&url_str) {
            return;
        }
        visited.insert(url_str.clone());

        {
            let mut s = status.lock().unwrap();
            s.pages_visited += 1;
        }

        let html = match client.get(page_url.clone()).send().await {
            Ok(resp) => match resp.text().await {
                Ok(text) => text,
                Err(_) => return,
            },
            Err(_) => return,
        };
        let document = Html::parse_document(&html);

        // Extract CSS and JS links
        let css_selector = Selector::parse("link[rel=stylesheet]").unwrap();
        let js_selector = Selector::parse("script[src]").unwrap();

        let mut css_links = Vec::new();
        for element in document.select(&css_selector) {
            if let Some(href) = element.value().attr("href") {
                if let Ok(abs_url) = page_url.join(href) {
                    css_links.push((href.to_string(), abs_url.to_string()));
                }
            }
        }

        let mut js_links = Vec::new();
        for element in document.select(&js_selector) {
            if let Some(src) = element.value().attr("src") {
                if let Ok(abs_url) = page_url.join(src) {
                    js_links.push((src.to_string(), abs_url.to_string()));
                }
            }
        }

        // Download CSS and JS concurrently
        let css_results: Vec<(String, String)> = css_links.iter().map(|(orig, _abs)| {
            let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("style.css")).to_string_lossy().to_string();
            (filename, orig.clone())
        }).collect();
        let css_futures: Vec<Pin<Box<dyn Future<Output = ()>>>> = css_links.iter().map(|(orig, abs)| {
            let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("style.css")).to_string_lossy().to_string();
            let save_path = format!("{}/css/{}", save_dir, filename);
            let abs = abs.clone();
            Box::pin(async move {
                let client = Client::new();
                match client.get(&abs).send().await {
                    Ok(resp) => {
                        let bytes = resp.bytes().await.unwrap_or_default();
                        let mut file = File::create(&save_path).unwrap();
                        file.write_all(&bytes).unwrap();
                    },
                    Err(_) => {}
                }
            }) as Pin<Box<dyn Future<Output = ()>>>
        }).collect();

        let js_results: Vec<(String, String)> = js_links.iter().map(|(orig, _abs)| {
            let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("script.js")).to_string_lossy().to_string();
            (filename, orig.clone())
        }).collect();
        let js_futures: Vec<Pin<Box<dyn Future<Output = ()>>>> = js_links.iter().map(|(orig, abs)| {
            let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("script.js")).to_string_lossy().to_string();
            let save_path = format!("{}/js/{}", save_dir, filename);
            let abs = abs.clone();
            Box::pin(async move {
                let client = Client::new();
                match client.get(&abs).send().await {
                    Ok(resp) => {
                        let bytes = resp.bytes().await.unwrap_or_default();
                        let mut file = File::create(&save_path).unwrap();
                        file.write_all(&bytes).unwrap();
                    },
                    Err(_) => {}
                }
            }) as Pin<Box<dyn Future<Output = ()>>>
        }).collect();

        futures::future::join_all(css_futures).await;
        futures::future::join_all(js_futures).await;

        // Rewrite HTML to point to local files
        let mut new_html = html.clone();
        for (filename, orig) in &css_results {
            let local_path = format!("css/{}", filename);
            new_html = new_html.replace(orig, &local_path);
        }
        for (filename, orig) in &js_results {
            let local_path = format!("js/{}", filename);
            new_html = new_html.replace(orig, &local_path);
        }

        // Save rewritten HTML
        let page_path = if page_url.path() == "/" || page_url.path().is_empty() {
            format!("{}/index.html", save_dir)
        } else {
            let mut path = page_url.path().trim_start_matches('/').replace('/', "_");
            if !path.ends_with(".html") {
                path.push_str(".html");
            }
            format!("{}/{}", save_dir, path)
        };
        if let Some(parent) = Path::new(&page_path).parent() {
            fs::create_dir_all(parent).unwrap();
        }
        let mut file = File::create(&page_path).unwrap();
        file.write_all(new_html.as_bytes()).unwrap();

        // Extract and recursively scrape <a href> links
        let a_selector = Selector::parse("a[href]").unwrap();
        for element in document.select(&a_selector) {
            if let Some(href) = element.value().attr("href") {
                if let Ok(link_url) = page_url.join(href) {
                    // Only follow links within the same domain
                    if link_url.domain() == base_url.domain() {
                        scrape_page(client, base_url, &link_url, save_dir, visited, depth + 1, max_depth, status).await;
                    }
                }
            }
        }
    })
}