this works fine for now
.gitignore (vendored), 40 lines changed

@@ -3,9 +3,6 @@
 # will have compiled files and executables
 debug/
 target/
-
-# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries
-# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html
 Cargo.lock
 
 # These are backup files generated by rustfmt
@@ -15,16 +12,43 @@ Cargo.lock
 *.pdb
 
 # ---> JupyterNotebooks
-# gitignore template for Jupyter Notebooks
-# website: http://jupyter.org/
 
 .ipynb_checkpoints
 */.ipynb_checkpoints/*
 
-# IPython
 profile_default/
 ipython_config.py
 
 # Remove previous ipynb_checkpoints
 # git rm -r .ipynb_checkpoints/
 
+# === Cursor IDE ===
+.cursor.rules.yaml
+.cursor/
+cursor.log
+cursor.db
+cursor.history
+cursor.workspace
+.cursor_tmp/
+
+# === Saved scraped sites ===
+saved_site/
+saved_site/*
+
+# === Editor/system junk ===
+*.swp
+*.swo
+*.bak
+*.tmp
+*.log
+.DS_Store
+*~
+Thumbs.db
+ehthumbs.db
+Desktop.ini
+
+# === VSCode (if used ; we only use AI in this household) ===
+.vscode/
+
+# === Node/npm junk (just in case frontend is added) ===
+node_modules/
+dist/
+cursor.rules.yaml

Cargo.toml (new file), 14 lines

@@ -0,0 +1,14 @@
[package]
name = "prawn"
version = "0.1.0"
edition = "2024"

[dependencies]
reqwest = { version = "0.12", features = ["json", "blocking", "cookies", "gzip", "brotli", "deflate", "stream"] }
tokio = { version = "1.37", features = ["full"] }
scraper = "0.18"
rayon = "1.10"
url = "2.5"
futures = "0.3"
tui = "0.19"
crossterm = "0.27"
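
The `reqwest` feature list above enables cookie handling and gzip/brotli/deflate decompression. As a rough illustration of how those features surface at the call site, a client can be configured through `reqwest::ClientBuilder`; `build_client` and the exact settings here are illustrative assumptions, not code from this commit:

```
use std::time::Duration;
use reqwest::Client;

// Illustrative only: a client wired up to the Cargo.toml features above.
// `build_client` is a hypothetical helper, not part of src/main.rs.
fn build_client() -> reqwest::Result<Client> {
    Client::builder()
        .cookie_store(true)               // needs the "cookies" feature
        .gzip(true)                       // transparent gzip decoding
        .brotli(true)                     // transparent brotli decoding
        .deflate(true)                    // transparent deflate decoding
        .timeout(Duration::from_secs(30)) // not feature-gated; just a sensible default
        .build()
}
```

The commit's own main.rs builds its client with a bare `Client::builder().build()`; with these features compiled in, automatic decompression is available out of the box, while the cookie store has to be opted into explicitly as shown.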

README.md, 46 lines changed

@@ -1,3 +1,47 @@
 # prawn
 
-prawncrawler
+<!--
+Logo Image
+Sadly I can't do my cool styling for the div :C
+-->
+<div
+style = "
+display: flex;
+justify-content: center;
+">
+<img
+src = "assets/logo.png"
+alt = "logo"
+style = "width:50%"
+/>
+</div>
+
+
+prawn is an extremely fast Rust web scraper that downloads a webpage's HTML and all linked CSS and JS resources, saving them into a local folder for offline use.
+
+## Features
+- High-performance: Uses `reqwest` (with connection pooling), `tokio`, and `rayon` for parallelism.
+- CLI tool: Accepts a URL as an argument.
+- Downloads and parses HTML as fast as possible.
+- Extracts and concurrently downloads all `<link rel="stylesheet">` and `<script src="...">` resources.
+- Rewrites the HTML to point to the local files and saves it as `saved_site/index.html`.
+- All CSS and JS files are saved into `saved_site/css/` and `saved_site/js/` respectively.
+
+## Usage
+
+```
+cargo run -- https://example.com
+```
+
+This will download the HTML, CSS, and JS concurrently and save them to `./saved_site/` within seconds.
+
+## Constraints
+- Uses async Rust (`tokio`) for HTTP I/O and `rayon` or `futures` for concurrent downloads.
+- Uses `scraper` for fast DOM-like parsing.
+- No GUI dependencies or headless browsers (pure HTTP and HTML/CSS/JS).
+- Avoids unsafe code unless absolutely justified and documented.
+- Minimizes unnecessary allocations or cloning.
+
+## License
+MIT
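
The Features and Constraints sections above describe the core flow: fetch a page, pull out the `<link rel="stylesheet">` and `<script src>` URLs, and download them concurrently over async I/O. Before the full src/main.rs listing below, here is a minimal sketch of that flow against the same crates (`reqwest`, `scraper`, `url`, `futures`); the function name `fetch_assets` and its return type are illustrative, and file writing and error reporting are deliberately left out:

```
use futures::future::join_all;
use reqwest::Client;
use scraper::{Html, Selector};
use url::Url;

// Sketch only: fetch one page, resolve its stylesheet/script URLs against
// the page URL, and download every asset concurrently on a shared client.
async fn fetch_assets(client: &Client, page: &Url) -> reqwest::Result<Vec<Vec<u8>>> {
    let html = client.get(page.clone()).send().await?.text().await?;

    // Collect URLs in an inner block so the non-Send `Html` document is
    // dropped before the downloads are awaited.
    let urls: Vec<Url> = {
        let doc = Html::parse_document(&html);
        let css = Selector::parse("link[rel=stylesheet]").unwrap();
        let js = Selector::parse("script[src]").unwrap();
        doc.select(&css).filter_map(|e| e.value().attr("href"))
            .chain(doc.select(&js).filter_map(|e| e.value().attr("src")))
            .filter_map(|link| page.join(link).ok())
            .collect()
    };

    // One future per asset; `join_all` drives them to completion together.
    let downloads = urls.into_iter().map(|u| {
        let client = client.clone();
        async move { client.get(u).send().await?.bytes().await.map(|b| b.to_vec()) }
    });
    join_all(downloads).await.into_iter().collect()
}
```

Note that the actual implementation below constructs a fresh `Client::new()` inside each download future; reusing one cloned client, as in this sketch, is what lets reqwest's connection pooling help.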

assets/logo.png (new file, binary, not shown; size 1.8 MiB)

src/main.rs (new file), 301 lines

@@ -0,0 +1,301 @@
use std::fs::{self, File};
use std::io::Write;
use std::path::Path;
use std::collections::HashSet;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio::task;
use rayon::prelude::*;
use url::Url;
use futures;
use std::pin::Pin;
use futures::Future;
use std::sync::{Arc, Mutex};
use std::thread;
use tui::Terminal;
use tui::backend::CrosstermBackend;
use tui::widgets::{Block, Borders, Paragraph};
use tui::layout::{Layout, Constraint, Direction};
use tui::text::{Span, Spans};
use crossterm::event::{self, DisableMouseCapture, EnableMouseCapture, Event, KeyCode};
use crossterm::terminal::{disable_raw_mode, enable_raw_mode, EnterAlternateScreen, LeaveAlternateScreen};
use crossterm::execute;

const SPINNER_FRAMES: &[&str] = &["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"];

#[derive(Default)]
struct Status {
    current_url: String,
    pages_visited: usize,
    done: bool,
    spinner_idx: usize,
}

fn run_tui(status: Arc<Mutex<Status>>) {
    enable_raw_mode().unwrap();
    let mut stdout = std::io::stdout();
    execute!(stdout, EnterAlternateScreen, EnableMouseCapture).unwrap();
    let backend = CrosstermBackend::new(stdout);
    let mut terminal = Terminal::new(backend).unwrap();

    let mut frame_count = 0;

    loop {
        let mut s = status.lock().unwrap();
        // Animate spinner
        if !s.done {
            s.spinner_idx = (s.spinner_idx + 1) % SPINNER_FRAMES.len();
        }
        let spinner = SPINNER_FRAMES[s.spinner_idx];
        terminal.draw(|f| {
            let size = f.size();
            // Layout: center box and status bar at bottom
            let layout = Layout::default()
                .direction(Direction::Vertical)
                .margin(2)
                .constraints([
                    Constraint::Percentage(40),
                    Constraint::Length(9), // main box
                    Constraint::Percentage(40),
                    Constraint::Length(1), // status bar
                ]);
            let chunks = layout.split(size);

            // Centered main status box
            let block = Block::default().title(Span::styled(
                format!(" {} Prawn Web Scraper {} ", spinner, spinner),
                tui::style::Style::default().fg(tui::style::Color::Cyan).add_modifier(tui::style::Modifier::BOLD),
            )).borders(Borders::ALL);
            let text = vec![
                Spans::from(vec![Span::styled(
                    format!("Pages visited: {}", s.pages_visited),
                    tui::style::Style::default().fg(tui::style::Color::Green).add_modifier(tui::style::Modifier::BOLD),
                )]),
                Spans::from(vec![Span::raw("")]),
                Spans::from(vec![Span::styled(
                    "Current URL:",
                    tui::style::Style::default().fg(tui::style::Color::Yellow),
                )]),
                Spans::from(vec![Span::styled(
                    format!("{}", s.current_url),
                    tui::style::Style::default().fg(tui::style::Color::White),
                )]),
                Spans::from(vec![Span::raw("")]),
                Spans::from(vec![Span::styled(
                    if s.done {"Done! Press Ctrl+C to exit."} else {"Scraping..."},
                    tui::style::Style::default().fg(if s.done {tui::style::Color::Green} else {tui::style::Color::Blue}).add_modifier(tui::style::Modifier::BOLD),
                )]),
            ];
            let paragraph = Paragraph::new(text).block(block).alignment(tui::layout::Alignment::Center);
            f.render_widget(paragraph, chunks[1]);

            // Status bar at the bottom
            let status_bar = Paragraph::new(Spans::from(vec![
                Span::styled(
                    format!(" {} Prawn scraping in progress... ", spinner),
                    tui::style::Style::default().fg(tui::style::Color::Magenta).add_modifier(tui::style::Modifier::BOLD),
                ),
                Span::raw(" | Press Ctrl+C to quit "),
            ]))
            .block(Block::default().borders(Borders::NONE))
            .alignment(tui::layout::Alignment::Center);
            f.render_widget(status_bar, chunks[3]);
        }).unwrap();
        if s.done {
            break;
        }
        drop(s);
        std::thread::sleep(std::time::Duration::from_millis(80));
    }
    disable_raw_mode().unwrap();
    let mut stdout = std::io::stdout();
    execute!(stdout, LeaveAlternateScreen, DisableMouseCapture).unwrap();
}

#[tokio::main]
async fn main() {
    let args: Vec<String> = std::env::args().collect();
    if args.len() != 2 {
        eprintln!("Usage: cargo run -- <URL>");
        std::process::exit(1);
    }
    let mut url = args[1].clone();
    if !url.starts_with("http://") && !url.starts_with("https://") {
        url = format!("https://{}", url);
    }
    let base_url = Url::parse(&url).expect("Invalid URL");
    let client = Client::builder().build().unwrap();

    // Prepare output directories
    fs::create_dir_all("saved_site/css").unwrap();
    fs::create_dir_all("saved_site/js").unwrap();

    // Shared status for TUI
    let status = Arc::new(Mutex::new(Status::default()));
    let status_tui = status.clone();

    // Start TUI in a separate thread
    let tui_handle = thread::spawn(move || {
        run_tui(status_tui);
    });

    // Start recursive scraping
    let mut visited = HashSet::new();
    scrape_page(&client, &base_url, &base_url, "saved_site", &mut visited, 0, 2, &status).await;

    // Wait for TUI to finish (send quit signal)
    {
        let mut s = status.lock().unwrap();
        s.done = true;
    }
    tui_handle.join().unwrap();
    println!("Site saved to ./saved_site/");
}

fn scrape_page<'a>(
    client: &'a Client,
    base_url: &'a Url,
    page_url: &'a Url,
    save_dir: &'a str,
    visited: &'a mut HashSet<String>,
    depth: usize,
    max_depth: usize,
    status: &'a Arc<Mutex<Status>>,
) -> Pin<Box<dyn Future<Output = ()> + 'a>> {
    Box::pin(async move {
        {
            let mut s = status.lock().unwrap();
            s.current_url = page_url.as_str().to_string();
        }
        if depth > max_depth {
            return;
        }
        let url_str = page_url.as_str().to_string();
        if visited.contains(&url_str) {
            return;
        }
        visited.insert(url_str.clone());

        {
            let mut s = status.lock().unwrap();
            s.pages_visited += 1;
        }

        let html = match client.get(page_url.clone()).send().await {
            Ok(resp) => match resp.text().await {
                Ok(text) => text,
                Err(_) => return,
            },
            Err(_) => return,
        };
        let document = Html::parse_document(&html);

        // Extract CSS and JS links
        let css_selector = Selector::parse("link[rel=stylesheet]").unwrap();
        let js_selector = Selector::parse("script[src]").unwrap();

        let mut css_links = Vec::new();
        for element in document.select(&css_selector) {
            if let Some(href) = element.value().attr("href") {
                if let Ok(abs_url) = page_url.join(href) {
                    css_links.push((href.to_string(), abs_url.to_string()));
                }
            }
        }

        let mut js_links = Vec::new();
        for element in document.select(&js_selector) {
            if let Some(src) = element.value().attr("src") {
                if let Ok(abs_url) = page_url.join(src) {
                    js_links.push((src.to_string(), abs_url.to_string()));
                }
            }
        }

        // Download CSS and JS concurrently
        let css_results: Vec<(String, String)> = css_links.iter().map(|(orig, _abs)| {
            let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("style.css")).to_string_lossy().to_string();
            (filename, orig.clone())
        }).collect();
        let css_futures: Vec<Pin<Box<dyn Future<Output = ()>>>> = css_links.iter().map(|(orig, abs)| {
            let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("style.css")).to_string_lossy().to_string();
            let save_path = format!("{}/css/{}", save_dir, filename);
            let abs = abs.clone();
            Box::pin(async move {
                let client = Client::new();
                match client.get(&abs).send().await {
                    Ok(resp) => {
                        let bytes = resp.bytes().await.unwrap_or_default();
                        let mut file = File::create(&save_path).unwrap();
                        file.write_all(&bytes).unwrap();
                    },
                    Err(_) => {}
                }
            }) as Pin<Box<dyn Future<Output = ()>>>
        }).collect();

        let js_results: Vec<(String, String)> = js_links.iter().map(|(orig, _abs)| {
            let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("script.js")).to_string_lossy().to_string();
            (filename, orig.clone())
        }).collect();
        let js_futures: Vec<Pin<Box<dyn Future<Output = ()>>>> = js_links.iter().map(|(orig, abs)| {
            let filename = Path::new(orig).file_name().unwrap_or_else(|| std::ffi::OsStr::new("script.js")).to_string_lossy().to_string();
            let save_path = format!("{}/js/{}", save_dir, filename);
            let abs = abs.clone();
            Box::pin(async move {
                let client = Client::new();
                match client.get(&abs).send().await {
                    Ok(resp) => {
                        let bytes = resp.bytes().await.unwrap_or_default();
                        let mut file = File::create(&save_path).unwrap();
                        file.write_all(&bytes).unwrap();
                    },
                    Err(_) => {}
                }
            }) as Pin<Box<dyn Future<Output = ()>>>
        }).collect();

        futures::future::join_all(css_futures).await;
        futures::future::join_all(js_futures).await;

        // Rewrite HTML to point to local files
        let mut new_html = html.clone();
        for (filename, orig) in &css_results {
            let local_path = format!("css/{}", filename);
            new_html = new_html.replace(orig, &local_path);
        }
        for (filename, orig) in &js_results {
            let local_path = format!("js/{}", filename);
            new_html = new_html.replace(orig, &local_path);
        }

        // Save rewritten HTML
        let page_path = if page_url.path() == "/" || page_url.path().is_empty() {
            format!("{}/index.html", save_dir)
        } else {
            let mut path = page_url.path().trim_start_matches('/').replace('/', "_");
            if !path.ends_with(".html") {
                path.push_str(".html");
            }
            format!("{}/{}", save_dir, path)
        };
        if let Some(parent) = Path::new(&page_path).parent() {
            fs::create_dir_all(parent).unwrap();
        }
        let mut file = File::create(&page_path).unwrap();
        file.write_all(new_html.as_bytes()).unwrap();

        // Extract and recursively scrape <a href> links
        let a_selector = Selector::parse("a[href]").unwrap();
        for element in document.select(&a_selector) {
            if let Some(href) = element.value().attr("href") {
                if let Ok(link_url) = page_url.join(href) {
                    // Only follow links within the same domain
                    if link_url.domain() == base_url.domain() {
                        scrape_page(client, base_url, &link_url, save_dir, visited, depth + 1, max_depth, status).await;
                    }
                }
            }
        }
    })
}