this works fine for now

2025-07-03 20:39:41 +02:00
parent efb93878ff
commit 014f7a873d
5 changed files with 392 additions and 9 deletions

40
.gitignore vendored

@@ -3,9 +3,6 @@
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
@@ -15,16 +12,43 @@ Cargo.lock
*.pdb
# ---> JupyterNotebooks
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/
.ipynb_checkpoints
*/.ipynb_checkpoints/*
# IPython
profile_default/
ipython_config.py
# Remove previous ipynb_checkpoints
# git rm -r .ipynb_checkpoints/
# === Cursor IDE ===
.cursor.rules.yaml
.cursor/
cursor.log
cursor.db
cursor.history
cursor.workspace
.cursor_tmp/
# === Saved scraped sites ===
saved_site/
saved_site/*
# === Editor/system junk ===
*.swp
*.swo
*.bak
*.tmp
*.log
.DS_Store
*~
Thumbs.db
ehthumbs.db
Desktop.ini
# === VSCode (if used; we only use AI in this household) ===
.vscode/
# === Node/npm junk (just in case frontend is added) ===
node_modules/
dist/
cursor.rules.yaml

14
Cargo.toml Normal file

@@ -0,0 +1,14 @@
[package]
name = "prawn"
version = "0.1.0"
edition = "2024"
[dependencies]
reqwest = { version = "0.12", features = ["json", "blocking", "cookies", "gzip", "brotli", "deflate", "stream"] }
tokio = { version = "1.37", features = ["full"] }
scraper = "0.18"
rayon = "1.10"
url = "2.5"
futures = "0.3"
tui = "0.19"
crossterm = "0.27"

README.md

@@ -1,3 +1,47 @@
# prawn
prawncrawler
<!--
Logo Image
Sadly I can't do my cool Styling for the Div :C
-->
<div
style = "
display: flex;
justify-content: center;
">
<img
src = "assets/logo.png"
alt = "logo"
style = "width:50%"
/>
</div>
prawn is an extremely fast Rust web scraper that downloads a webpage's HTML and all linked CSS and JS resources, saving them into a local folder for offline use.
## Features
- High-performance: Uses `reqwest` (with connection pooling), `tokio`, and `rayon` for parallelism.
- CLI tool: Accepts a URL as an argument.
- Downloads and parses HTML as fast as possible.
- Extracts and concurrently downloads all `<link rel="stylesheet">` and `<script src="...">` resources (see the extraction sketch after this list).
- Rewrites HTML to point to local files and saves it as `saved_site/index.html`.
- All CSS and JS files are saved into `saved_site/css/` and `saved_site/js/` respectively.
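A minimal, hedged sketch of that extraction step, using the same `scraper` and `url` crates as `src/main.rs` below (the `stylesheet_and_script_urls` helper is illustrative, not part of the crate):

```
use scraper::{Html, Selector};
use url::Url;

/// Collect absolute stylesheet and script URLs referenced by a page.
fn stylesheet_and_script_urls(html: &str, page_url: &Url) -> (Vec<Url>, Vec<Url>) {
    let document = Html::parse_document(html);
    // CSS arrives via <link rel="stylesheet" href="...">, JS via <script src="...">.
    let css_selector = Selector::parse("link[rel=stylesheet]").unwrap();
    let js_selector = Selector::parse("script[src]").unwrap();
    // Relative href/src values are resolved against the page URL.
    let css = document
        .select(&css_selector)
        .filter_map(|el| el.value().attr("href"))
        .filter_map(|href| page_url.join(href).ok())
        .collect();
    let js = document
        .select(&js_selector)
        .filter_map(|el| el.value().attr("src"))
        .filter_map(|src| page_url.join(src).ok())
        .collect();
    (css, js)
}
```

The committed `src/main.rs` does the same thing with explicit `for` loops and keeps the original attribute strings so the HTML can be rewritten to local paths afterwards.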
## Usage
```
cargo run -- https://example.com
```
This will download the HTML, CSS, and JS concurrently and save them to `./saved_site/` within seconds.
## Constraints
- Uses async Rust (`tokio`) for HTTP I/O and `rayon` or `futures` for concurrent downloads (see the fan-out sketch after this list).
- Uses `scraper` for fast DOM-like parsing.
- No GUI dependencies or headless browsers (pure HTTP and HTML/CSS/JS).
- Avoids unsafe code unless absolutely justified and documented.
- Minimizes unnecessary allocations or cloning.
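As a rough illustration of the concurrency constraint, here is a sketch of the fan-out pattern `src/main.rs` uses via `futures::future::join_all` (the `fetch_all` helper and its error handling are illustrative, not part of this commit):

```
use futures::future::join_all;
use reqwest::Client;
use url::Url;

/// Fetch every URL concurrently; downloads that fail are simply skipped.
async fn fetch_all(client: &Client, urls: &[Url]) -> Vec<(Url, Vec<u8>)> {
    let fetches = urls.iter().cloned().map(|url| {
        // Cloning a reqwest::Client only clones a handle to its shared connection pool.
        let client = client.clone();
        async move {
            let bytes = client.get(url.clone()).send().await.ok()?.bytes().await.ok()?;
            Some((url, bytes.to_vec()))
        }
    });
    join_all(fetches).await.into_iter().flatten().collect()
}
```

The committed code builds one such future per CSS/JS file and awaits them all together with `join_all`.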
## License
MIT

BIN
assets/logo.png Normal file

Binary file not shown.

Size: 1.8 MiB

301
src/main.rs Normal file

@@ -0,0 +1,301 @@
use std::fs::{self, File};
use std::io::Write;
use std::path::Path;
use std::collections::HashSet;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio::task;
use rayon::prelude::*;
use url::{Url};
use futures;
use std::pin::Pin;
use futures::Future;
use std::sync::{Arc, Mutex};
use std::thread;
use tui::Terminal;
use tui::backend::CrosstermBackend;
use tui::widgets::{Block, Borders, Paragraph};
use tui::layout::{Layout, Constraint, Direction};
use tui::text::{Span, Spans};
use crossterm::event::{self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode};
use crossterm::terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen};
use crossterm::execute;
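// Braille spinner frames cycled by the TUI while scraping is in progress.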
const SPINNER_FRAMES: &[&str] = &["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
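// Progress shared between the scraper and the TUI thread behind an Arc<Mutex<..>>.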
#[derive(Default)]
struct Status {
current_url: String,
pages_visited: usize,
done: bool,
spinner_idx: usize,
}
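// Runs the status UI on its own thread: enters an alternate screen, redraws until
// `status.done` is set, then restores the terminal.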
fn run_tui(status: Arc<Mutex<Status>>) {
enable_raw_mode().unwrap();
let mut stdout = std::io::stdout();
execute!(stdout, EnterAlternateScreen, EnableMouseCapture).unwrap();
let backend = CrosstermBackend::new(stdout);
let mut terminal = Terminal::new(backend).unwrap();
let mut frame_count = 0;
loop {
let mut s = status.lock().unwrap();
// Animate spinner
if !s.done {
s.spinner_idx = (s.spinner_idx + 1) % SPINNER_FRAMES.len();
}
let spinner = SPINNER_FRAMES[s.spinner_idx];
terminal.draw(|f| {
let size = f.size();
// Layout: center box and status bar at bottom
let layout = Layout::default()
.direction(Direction::Vertical)
.margin(2)
.constraints([
Constraint::Percentage(40),
Constraint::Length(9), // main box
Constraint::Percentage(40),
Constraint::Length(1), // status bar
]);
let chunks = layout.split(size);
// Centered main status box
let block = Block::default().title(Span::styled(
format!(" {} Prawn Web Scraper {} ", spinner, spinner),
tui::style::Style::default().fg(tui::style::Color::Cyan).add_modifier(tui::style::Modifier::BOLD),
)).borders(Borders::ALL);
let text = vec![
Spans::from(vec![Span::styled(
format!("Pages visited: {}", s.pages_visited),
tui::style::Style::default().fg(tui::style::Color::Green).add_modifier(tui::style::Modifier::BOLD),
)]),
Spans::from(vec![Span::raw("")]),
Spans::from(vec![Span::styled(
"Current URL:",
tui::style::Style::default().fg(tui::style::Color::Yellow),
)]),
Spans::from(vec![Span::styled(
format!("{}", s.current_url),
tui::style::Style::default().fg(tui::style::Color::White),
)]),
Spans::from(vec![Span::raw("")]),
Spans::from(vec![Span::styled(
if s.done {"Done! Press Ctrl+C to exit."} else {"Scraping..."},
tui::style::Style::default().fg(if s.done {tui::style::Color::Green} else {tui::style::Color::Blue}).add_modifier(tui::style::Modifier::BOLD),
)]),
];
let paragraph = Paragraph::new(text).block(block).alignment(tui::layout::Alignment::Center);
f.render_widget(paragraph, chunks[1]);
// Status bar at the bottom
let status_bar = Paragraph::new(Spans::from(vec![
Span::styled(
format!(" {} Prawn scraping in progress... ", spinner),
tui::style::Style::default().fg(tui::style::Color::Magenta).add_modifier(tui::style::Modifier::BOLD),
),
Span::raw(" | Press Ctrl+C to quit "),
]))
.block(Block::default().borders(Borders::NONE))
.alignment(tui::layout::Alignment::Center);
f.render_widget(status_bar, chunks[3]);
}).unwrap();
if s.done {
break;
}
drop(s);
std::thread::sleep(std::time::Duration::from_millis(80));
}
disable_raw_mode().unwrap();
let mut stdout = std::io::stdout();
execute!(stdout, LeaveAlternateScreen, DisableMouseCapture).unwrap();
}
#[tokio::main]
async fn main() {
let args: Vec<String> = std::env::args().collect();
if args.len() != 2 {
eprintln!("Usage: cargo run -- <URL>");
std::process::exit(1);
}
let mut url = args[1].clone();
if !url.starts_with("http://") && !url.starts_with("https://") {
url = format!("https://{}", url);
}
let base_url = Url::parse(&url).expect("Invalid URL");
let client = Client::builder().build().unwrap();
// Prepare output directories
fs::create_dir_all("saved_site/css").unwrap();
fs::create_dir_all("saved_site/js").unwrap();
// Shared status for TUI
let status = Arc::new(Mutex::new(Status::default()));
let status_tui = status.clone();
// Start TUI in a separate thread
let tui_handle = thread::spawn(move || {
run_tui(status_tui);
});
// Start recursive scraping
let mut visited = HashSet::new();
scrape_page(&client, &base_url, &base_url, "saved_site", &mut visited, 0, 2, &status).await;
// Wait for TUI to finish (send quit signal)
{
let mut s = status.lock().unwrap();
s.done = true;
}
tui_handle.join().unwrap();
println!("Site saved to ./saved_site/");
}
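// scrape_page calls itself for same-domain links, so it cannot be a plain `async fn`;
// returning a boxed future keeps the recursive future type finite.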
fn scrape_page<'a>(
client: &'a Client,
base_url: &'a Url,
page_url: &'a Url,
save_dir: &'a str,
visited: &'a mut HashSet<String>,
depth: usize,
max_depth: usize,
status: &'a Arc<Mutex<Status>>,
) -> Pin<Box<dyn Future<Output = ()> + 'a>> {
Box::pin(async move {
{
let mut s = status.lock().unwrap();
s.current_url = page_url.as_str().to_string();
}
if depth > max_depth {
return;
}
let url_str = page_url.as_str().to_string();
if visited.contains(&url_str) {
return;
}
visited.insert(url_str.clone());
{
let mut s = status.lock().unwrap();
s.pages_visited += 1;
}
let html = match client.get(page_url.clone()).send().await {
Ok(resp) => match resp.text().await {
Ok(text) => text,
Err(_) => return,
},
Err(_) => return,
};
let document = Html::parse_document(&html);
// Extract CSS and JS links
let css_selector = Selector::parse("link[rel=stylesheet]").unwrap();
let js_selector = Selector::parse("script[src]").unwrap();
let mut css_links = Vec::new();
for element in document.select(&css_selector) {
if let Some(href) = element.value().attr("href") {
if let Ok(abs_url) = page_url.join(href) {
css_links.push((href.to_string(), abs_url.to_string()));
}
}
}
let mut js_links = Vec::new();
for element in document.select(&js_selector) {
if let Some(src) = element.value().attr("src") {
if let Ok(abs_url) = page_url.join(src) {
js_links.push((src.to_string(), abs_url.to_string()));
}
}
}
// Download CSS and JS concurrently
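// Each resource becomes its own boxed future; `futures::future::join_all` below awaits them all at once.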
let css_results: Vec<(String, String)> = css_links.iter().map(|(orig, _abs)| {
let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("style.css")).to_string_lossy().to_string();
(filename, orig.clone())
}).collect();
let css_futures: Vec<Pin<Box<dyn Future<Output = ()>>>> = css_links.iter().map(|(orig, abs)| {
let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("style.css")).to_string_lossy().to_string();
let save_path = format!("{}/css/{}", save_dir, filename);
let abs = abs.clone();
Box::pin(async move {
let client = Client::new();
match client.get(&abs).send().await {
Ok(resp) => {
let bytes = resp.bytes().await.unwrap_or_default();
let mut file = File::create(&save_path).unwrap();
file.write_all(&bytes).unwrap();
},
Err(_) => {}
}
}) as Pin<Box<dyn Future<Output = ()>>>
}).collect();
let js_results: Vec<(String, String)> = js_links.iter().map(|(orig, _abs)| {
let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("script.js")).to_string_lossy().to_string();
(filename, orig.clone())
}).collect();
let js_futures: Vec<Pin<Box<dyn Future<Output = ()>>>> = js_links.iter().map(|(orig, abs)| {
let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("script.js")).to_string_lossy().to_string();
let save_path = format!("{}/js/{}", save_dir, filename);
let abs = abs.clone();
Box::pin(async move {
let client = Client::new();
match client.get(&abs).send().await {
Ok(resp) => {
let bytes = resp.bytes().await.unwrap_or_default();
let mut file = File::create(&save_path).unwrap();
file.write_all(&bytes).unwrap();
},
Err(_) => {}
}
}) as Pin<Box<dyn Future<Output = ()>>>
}).collect();
futures::future::join_all(css_futures).await;
futures::future::join_all(js_futures).await;
// Rewrite HTML to point to local files
let mut new_html = html.clone();
for (filename, orig) in &css_results {
let local_path = format!("css/{}", filename);
new_html = new_html.replace(orig, &local_path);
}
for (filename, orig) in &js_results {
let local_path = format!("js/{}", filename);
new_html = new_html.replace(orig, &local_path);
}
// Save rewritten HTML
let page_path = if page_url.path() == "/" || page_url.path().is_empty() {
format!("{}/index.html", save_dir)
} else {
let mut path = page_url.path().trim_start_matches('/').replace('/', "_");
if !path.ends_with(".html") {
path.push_str(".html");
}
format!("{}/{}", save_dir, path)
};
if let Some(parent) = Path::new(&page_path).parent() {
fs::create_dir_all(parent).unwrap();
}
let mut file = File::create(&page_path).unwrap();
file.write_all(new_html.as_bytes()).unwrap();
// Extract and recursively scrape <a href> links
let a_selector = Selector::parse("a[href]").unwrap();
for element in document.select(&a_selector) {
if let Some(href) = element.value().attr("href") {
if let Ok(link_url) = page_url.join(href) {
// Only follow links within the same domain
if link_url.domain() == base_url.domain() {
scrape_page(client, base_url, &link_url, save_dir, visited, depth + 1, max_depth, status).await;
}
}
}
}
})
}