this works fine for now

2025-07-03 20:39:41 +02:00
parent efb93878ff
commit 014f7a873d
5 changed files with 392 additions and 9 deletions

40
.gitignore vendored

@@ -3,9 +3,6 @@
# will have compiled files and executables
debug/
target/
# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
Cargo.lock
# These are backup files generated by rustfmt
@@ -15,16 +12,43 @@ Cargo.lock
*.pdb
# ---> JupyterNotebooks
# gitignore template for Jupyter Notebooks
# website: http://jupyter.org/
.ipynb_checkpoints
*/.ipynb_checkpoints/*
# IPython
profile_default/
ipython_config.py
# Remove previous ipynb_checkpoints
# git rm -r .ipynb_checkpoints/
# === Cursor IDE ===
.cursor.rules.yaml
.cursor/
cursor.log
cursor.db
cursor.history
cursor.workspace
.cursor_tmp/
# === Saved scraped sites ===
saved_site/
saved_site/*
# === Editor/system junk ===
*.swp
*.swo
*.bak
*.tmp
*.log
.DS_Store
*~
Thumbs.db
ehthumbs.db
Desktop.ini
# === VSCode (if used; we only use AI in this household) ===
.vscode/
# === Node/npm junk (just in case frontend is added) ===
node_modules/
dist/
cursor.rules.yaml

14
Cargo.toml Normal file

@@ -0,0 +1,14 @@
[package]
name = "prawn"
version = "0.1.0"
edition = "2024"
[dependencies]
reqwest = { version = "0.12", features = ["json", "blocking", "cookies", "gzip", "brotli", "deflate", "stream"] }
tokio = { version = "1.37", features = ["full"] }
scraper = "0.18"
rayon = "1.10"
url = "2.5"
futures = "0.3"
tui = "0.19"
crossterm = "0.27"

README.md

@@ -1,3 +1,47 @@
# prawn
prawncrawler
<!--
Logo Image
Sadly I can't do my cool Styling for the Div :C
-->
<div
style = "
display: flex;
justify-content: center;
">
<img
src = "assets/logo.png"
alt = "logo"
style = "width:50%"
/>
</div>
prawn is an extremely fast Rust web scraper that downloads a webpage's HTML and all linked CSS and JS resources, saving them into a local folder for offline use.
## Features
- High-performance: Uses `reqwest` (with connection pooling), `tokio`, and `rayon` for parallelism.
- CLI tool: Accepts a URL as an argument.
- Downloads and parses HTML as fast as possible.
- Extracts and concurrently downloads all `<link rel="stylesheet">` and `<script src="...">` resources (see the extraction sketch after this list).
- Rewrites HTML to point to local files and saves it as `saved_site/index.html`.
- All CSS and JS files are saved into `saved_site/css/` and `saved_site/js/` respectively.
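A minimal, hedged sketch of that extraction step, using the same `scraper` and `url` crates as `src/main.rs` below (the `stylesheet_and_script_urls` helper is illustrative, not part of the crate):

```
use scraper::{Html, Selector};
use url::Url;

/// Collect absolute stylesheet and script URLs referenced by a page.
fn stylesheet_and_script_urls(html: &str, page_url: &Url) -> (Vec<Url>, Vec<Url>) {
    let document = Html::parse_document(html);
    // CSS arrives via <link rel="stylesheet" href="...">, JS via <script src="...">.
    let css_selector = Selector::parse("link[rel=stylesheet]").unwrap();
    let js_selector = Selector::parse("script[src]").unwrap();
    // Relative href/src values are resolved against the page URL.
    let css = document
        .select(&css_selector)
        .filter_map(|el| el.value().attr("href"))
        .filter_map(|href| page_url.join(href).ok())
        .collect();
    let js = document
        .select(&js_selector)
        .filter_map(|el| el.value().attr("src"))
        .filter_map(|src| page_url.join(src).ok())
        .collect();
    (css, js)
}
```

The committed `src/main.rs` does the same thing with explicit `for` loops and keeps the original attribute strings so the HTML can be rewritten to local paths afterwards.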
## Usage
```
cargo run -- https://example.com
```
This will download the HTML, CSS, and JS concurrently and save them to `./saved_site/` within seconds.
## Constraints
- Uses async Rust (`tokio`) for HTTP I/O and `rayon` or `futures` for concurrent downloads (see the fan-out sketch after this list).
- Uses `scraper` for fast DOM-like parsing.
- No GUI dependencies or headless browsers (pure HTTP and HTML/CSS/JS).
- Avoids unsafe code unless absolutely justified and documented.
- Minimizes unnecessary allocations or cloning.
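As a rough illustration of the concurrency constraint, here is a sketch of the fan-out pattern `src/main.rs` uses via `futures::future::join_all` (the `fetch_all` helper and its error handling are illustrative, not part of this commit):

```
use futures::future::join_all;
use reqwest::Client;
use url::Url;

/// Fetch every URL concurrently; downloads that fail are simply skipped.
async fn fetch_all(client: &Client, urls: &[Url]) -> Vec<(Url, Vec<u8>)> {
    let fetches = urls.iter().cloned().map(|url| {
        // Cloning a reqwest::Client only clones a handle to its shared connection pool.
        let client = client.clone();
        async move {
            let bytes = client.get(url.clone()).send().await.ok()?.bytes().await.ok()?;
            Some((url, bytes.to_vec()))
        }
    });
    join_all(fetches).await.into_iter().flatten().collect()
}
```

The committed code builds one such future per CSS/JS file and awaits them all together with `join_all`.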
## License
MIT

BIN
assets/logo.png Normal file

Binary file not shown.

Size: 1.8 MiB

301
src/main.rs Normal file

@@ -0,0 +1,301 @@
use std::fs::{self, File};
use std::io::Write;
use std::path::Path;
use std::collections::HashSet;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio::task;
use rayon::prelude::*;
use url::{Url};
use futures;
use std::pin::Pin;
use futures::Future;
use std::sync::{Arc, Mutex};
use std::thread;
use tui::Terminal;
use tui::backend::CrosstermBackend;
use tui::widgets::{Block, Borders, Paragraph};
use tui::layout::{Layout, Constraint, Direction};
use tui::text::{Span, Spans};
use crossterm::event::{self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode};
use crossterm::terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen};
use crossterm::execute;
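// Braille spinner frames cycled by the TUI while scraping is in progress.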
const SPINNER_FRAMES: &[&str] = &["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];
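// Progress shared between the scraper and the TUI thread behind an Arc<Mutex<..>>.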
#[derive(Default)]
struct Status {
current_url: String,
pages_visited: usize,
done: bool,
spinner_idx: usize,
}
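// Runs the status UI on its own thread: enters an alternate screen, redraws until
// `status.done` is set, then restores the terminal.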
fn run_tui(status: Arc<Mutex<Status>>) {
enable_raw_mode().unwrap();
let mut stdout = std::io::stdout();
execute!(stdout, EnterAlternateScreen, EnableMouseCapture).unwrap();
let backend = CrosstermBackend::new(stdout);
let mut terminal = Terminal::new(backend).unwrap();
let mut frame_count = 0;
loop {
let mut s = status.lock().unwrap();
// Animate spinner
if !s.done {
s.spinner_idx = (s.spinner_idx + 1) % SPINNER_FRAMES.len();
}
let spinner = SPINNER_FRAMES[s.spinner_idx];
terminal.draw(|f| {
let size = f.size();
// Layout: center box and status bar at bottom
let layout = Layout::default()
.direction(Direction::Vertical)
.margin(2)
.constraints([
Constraint::Percentage(40),
Constraint::Length(9), // main box
Constraint::Percentage(40),
Constraint::Length(1), // status bar
]);
let chunks = layout.split(size);
// Centered main status box
let block = Block::default().title(Span::styled(
format!(" {} Prawn Web Scraper {} ", spinner, spinner),
tui::style::Style::default().fg(tui::style::Color::Cyan).add_modifier(tui::style::Modifier::BOLD),
)).borders(Borders::ALL);
let text = vec![
Spans::from(vec![Span::styled(
format!("Pages visited: {}", s.pages_visited),
tui::style::Style::default().fg(tui::style::Color::Green).add_modifier(tui::style::Modifier::BOLD),
)]),
Spans::from(vec![Span::raw("")]),
Spans::from(vec![Span::styled(
"Current URL:",
tui::style::Style::default().fg(tui::style::Color::Yellow),
)]),
Spans::from(vec![Span::styled(
format!("{}", s.current_url),
tui::style::Style::default().fg(tui::style::Color::White),
)]),
Spans::from(vec![Span::raw("")]),
Spans::from(vec![Span::styled(
if s.done {"Done! Press Ctrl+C to exit."} else {"Scraping..."},
tui::style::Style::default().fg(if s.done {tui::style::Color::Green} else {tui::style::Color::Blue}).add_modifier(tui::style::Modifier::BOLD),
)]),
];
let paragraph = Paragraph::new(text).block(block).alignment(tui::layout::Alignment::Center);
f.render_widget(paragraph, chunks[1]);
// Status bar at the bottom
let status_bar = Paragraph::new(Spans::from(vec![
Span::styled(
format!(" {} Prawn scraping in progress... ", spinner),
tui::style::Style::default().fg(tui::style::Color::Magenta).add_modifier(tui::style::Modifier::BOLD),
),
Span::raw(" | Press Ctrl+C to quit "),
]))
.block(Block::default().borders(Borders::NONE))
.alignment(tui::layout::Alignment::Center);
f.render_widget(status_bar, chunks[3]);
}).unwrap();
if s.done {
break;
}
drop(s);
std::thread::sleep(std::time::Duration::from_millis(80));
}
disable_raw_mode().unwrap();
let mut stdout = std::io::stdout();
execute!(stdout, LeaveAlternateScreen, DisableMouseCapture).unwrap();
}
#[tokio::main]
async fn main() {
let args: Vec<String> = std::env::args().collect();
if args.len() != 2 {
eprintln!("Usage: cargo run -- <URL>");
std::process::exit(1);
}
let mut url = args[1].clone();
if !url.starts_with("http://") && !url.starts_with("https://") {
url = format!("https://{}", url);
}
let base_url = Url::parse(&url).expect("Invalid URL");
let client = Client::builder().build().unwrap();
// Prepare output directories
fs::create_dir_all("saved_site/css").unwrap();
fs::create_dir_all("saved_site/js").unwrap();
// Shared status for TUI
let status = Arc::new(Mutex::new(Status::default()));
let status_tui = status.clone();
// Start TUI in a separate thread
let tui_handle = thread::spawn(move || {
run_tui(status_tui);
});
// Start recursive scraping
let mut visited = HashSet::new();
scrape_page(&client, &base_url, &base_url, "saved_site", &mut visited, 0, 2, &status).await;
// Wait for TUI to finish (send quit signal)
{
let mut s = status.lock().unwrap();
s.done = true;
}
tui_handle.join().unwrap();
println!("Site saved to ./saved_site/");
}
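// scrape_page calls itself for same-domain links, so it cannot be a plain `async fn`;
// returning a boxed future keeps the recursive future type finite.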
fn scrape_page<'a>(
client: &'a Client,
base_url: &'a Url,
page_url: &'a Url,
save_dir: &'a str,
visited: &'a mut HashSet<String>,
depth: usize,
max_depth: usize,
status: &'a Arc<Mutex<Status>>,
) -> Pin<Box<dyn Future<Output = ()> + 'a>> {
Box::pin(async move {
{
let mut s = status.lock().unwrap();
s.current_url = page_url.as_str().to_string();
}
if depth > max_depth {
return;
}
let url_str = page_url.as_str().to_string();
if visited.contains(&url_str) {
return;
}
visited.insert(url_str.clone());
{
let mut s = status.lock().unwrap();
s.pages_visited += 1;
}
let html = match client.get(page_url.clone()).send().await {
Ok(resp) => match resp.text().await {
Ok(text) => text,
Err(_) => return,
},
Err(_) => return,
};
let document = Html::parse_document(&html);
// Extract CSS and JS links
let css_selector = Selector::parse("link[rel=stylesheet]").unwrap();
let js_selector = Selector::parse("script[src]").unwrap();
let mut css_links = Vec::new();
for element in document.select(&css_selector) {
if let Some(href) = element.value().attr("href") {
if let Ok(abs_url) = page_url.join(href) {
css_links.push((href.to_string(), abs_url.to_string()));
}
}
}
let mut js_links = Vec::new();
for element in document.select(&js_selector) {
if let Some(src) = element.value().attr("src") {
if let Ok(abs_url) = page_url.join(src) {
js_links.push((src.to_string(), abs_url.to_string()));
}
}
}
// Download CSS and JS concurrently
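// Each resource becomes its own boxed future; `futures::future::join_all` below awaits them all at once.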
let css_results: Vec<(String, String)> = css_links.iter().map(|(orig, _abs)| {
let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("style.css")).to_string_lossy().to_string();
(filename, orig.clone())
}).collect();
let css_futures: Vec<Pin<Box<dyn Future<Output = ()>>>> = css_links.iter().map(|(orig, abs)| {
let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("style.css")).to_string_lossy().to_string();
let save_path = format!("{}/css/{}", save_dir, filename);
let abs = abs.clone();
Box::pin(async move {
let client = Client::new();
match client.get(&abs).send().await {
Ok(resp) => {
let bytes = resp.bytes().await.unwrap_or_default();
let mut file = File::create(&save_path).unwrap();
file.write_all(&bytes).unwrap();
},
Err(_) => {}
}
}) as Pin<Box<dyn Future<Output = ()>>>
}).collect();
let js_results: Vec<(String, String)> = js_links.iter().map(|(orig, _abs)| {
let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("script.js")).to_string_lossy().to_string();
(filename, orig.clone())
}).collect();
let js_futures: Vec<Pin<Box<dyn Future<Output = ()>>>> = js_links.iter().map(|(orig, abs)| {
let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("script.js")).to_string_lossy().to_string();
let save_path = format!("{}/js/{}", save_dir, filename);
let abs = abs.clone();
Box::pin(async move {
let client = Client::new();
match client.get(&abs).send().await {
Ok(resp) => {
let bytes = resp.bytes().await.unwrap_or_default();
let mut file = File::create(&save_path).unwrap();
file.write_all(&bytes).unwrap();
},
Err(_) => {}
}
}) as Pin<Box<dyn Future<Output = ()>>>
}).collect();
futures::future::join_all(css_futures).await;
futures::future::join_all(js_futures).await;
// Rewrite HTML to point to local files
let mut new_html = html.clone();
for (filename, orig) in &css_results {
let local_path = format!("css/{}", filename);
new_html = new_html.replace(orig, &local_path);
}
for (filename, orig) in &js_results {
let local_path = format!("js/{}", filename);
new_html = new_html.replace(orig, &local_path);
}
// Save rewritten HTML
let page_path = if page_url.path() == "/" || page_url.path().is_empty() {
format!("{}/index.html", save_dir)
} else {
let mut path = page_url.path().trim_start_matches('/').replace('/', "_");
if !path.ends_with(".html") {
path.push_str(".html");
}
format!("{}/{}", save_dir, path)
};
if let Some(parent) = Path::new(&page_path).parent() {
fs::create_dir_all(parent).unwrap();
}
let mut file = File::create(&page_path).unwrap();
file.write_all(new_html.as_bytes()).unwrap();
// Extract and recursively scrape <a href> links
let a_selector = Selector::parse("a[href]").unwrap();
for element in document.select(&a_selector) {
if let Some(href) = element.value().attr("href") {
if let Ok(link_url) = page_url.join(href) {
// Only follow links within the same domain
if link_url.domain() == base_url.domain() {
scrape_page(client, base_url, &link_url, save_dir, visited, depth + 1, max_depth, status).await;
}
}
}
}
})
}