From 995741ecad01730e70a8e0384da7d24115643e27 Mon Sep 17 00:00:00 2001
From: rattatwinko
Date: Fri, 2 May 2025 17:53:24 +0200
Subject: [PATCH] some more bullshit 2

---
 src/main.rs | 81 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 58 insertions(+), 23 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 5da37a8..e4bae70 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,5 @@
 use std::collections::{HashSet, VecDeque};
+use std::path::Path;
 use std::sync::{Arc, Mutex};
 use std::sync::atomic::{AtomicUsize, Ordering};
 
@@ -8,10 +9,12 @@ use regex::Regex;
 use reqwest::Client;
 use scraper::{Html, Selector};
 use tokio;
+use url::Url;
 
 #[derive(Debug, Clone)]
 struct CrawlResult {
     url: String,
+    html: String,
     title: Option<String>,
     text: Option<String>,
     links: Vec<String>,
@@ -53,6 +56,7 @@
         }
     };
 
+    let html_content = res.clone();
     let url_clone = url.clone();
     let pattern_clone = pattern_regex.clone();
 
@@ -60,21 +64,18 @@
     let (title, text, links, images, pattern_matches) =
         tokio::task::spawn_blocking(move || {
             let document = Html::parse_document(&res);
-            // Title extraction
             let title_selector = Selector::parse("title").unwrap();
             let title = document
                 .select(&title_selector)
                 .next()
                 .map(|e| e.inner_html());
 
-            // Body text extraction
             let body_selector = Selector::parse("body").unwrap();
             let text = document
                 .select(&body_selector)
                 .next()
                 .map(|e| e.text().collect::<Vec<_>>().join(" "));
 
-            // Link extraction
             let link_selector = Selector::parse("a[href]").unwrap();
             let links: Vec<String> = document
                 .select(&link_selector)
@@ -82,7 +83,6 @@
                 .map(|s| s.to_string())
                 .collect();
 
-            // Image extraction
             let img_selector = Selector::parse("img[src]").unwrap();
             let images: Vec<String> = document
                 .select(&img_selector)
@@ -90,7 +90,6 @@
                 .map(|s| s.to_string())
                 .collect();
 
-            // Pattern matching
             let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
                 regex
                     .captures_iter(text)
@@ -105,7 +104,6 @@
         .await
         .ok()?;
 
-    // Queue new links
     {
         let mut queue = to_visit.lock().unwrap();
         let mut new_links = 0;
@@ -122,6 +120,7 @@
 
     Some(CrawlResult {
         url: url_clone,
+        html: html_content,
        title,
         text,
         links,
@@ -130,11 +129,30 @@
     })
 }
 
+fn sanitize_filename(url: &str) -> String {
+    let parsed = Url::parse(url).unwrap();
+    let mut filename = parsed.path().replace('/', "__");
+
+    if filename.is_empty() || filename == "__" {
+        filename = "index".to_string();
+    }
+
+    filename = filename.replace(|c: char| !c.is_ascii_alphanumeric(), "_");
+
+    if filename.len() > 50 {
+        filename = filename[..50].to_string();
+    }
+
+    format!("{}.html", filename.trim_matches('_'))
+}
+
 #[tokio::main]
 async fn main() {
+    println!("[🚀] Starting web crawler...");
+
     let matches = App::new("Web Crawler")
         .version("1.0")
-        .about("Multi-threaded web crawler with pattern matching")
+        .about("Multi-threaded web crawler with pattern matching and website storage")
         .arg(
             Arg::with_name("url")
                 .help("Starting URL")
@@ -167,12 +185,19 @@
     let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
     let client = Arc::new(Client::new());
 
+    let parsed_url = Url::parse(&start_url).expect("Invalid URL");
+    let domain = parsed_url.host_str().unwrap().replace('.', "_");
+    let output_dir = format!("./{}", domain);
+    std::fs::create_dir_all(&output_dir).expect("Failed to create output directory");
+    println!("[📂] Created output directory: {}", output_dir);
+
     let visited = Arc::new(Mutex::new(HashSet::new()));
     let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
     let active_count = Arc::new(AtomicUsize::new(0));
     let mut tasks = FuturesUnordered::new();
 
+    println!("[🔄] Initializing {} worker tasks...", concurrency);
     for _ in 0..concurrency {
         let client = client.clone();
         let visited = visited.clone();
@@ -189,6 +214,7 @@
             };
 
             if let Some(url) = next_url {
+                println!("[📥] Processing: {}", url);
                 if let Some(result) = crawl_url(
                     url,
                     client.clone(),
@@ -202,9 +228,12 @@
                         results.push(result);
                     }
                 } else {
-                    if active_count.load(Ordering::SeqCst) == 0 {
+                    let active = active_count.load(Ordering::SeqCst);
+                    if active == 0 {
+                        println!("[🛑] Worker detected completion");
                         break;
                     }
+                    println!("[⏳] Worker waiting (active: {})...", active);
                     tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
                 }
             }
@@ -212,26 +241,32 @@
         }));
     }
 
+    println!("[📊] Crawling in progress...");
     let mut all_results = vec![];
     while let Some(result_set) = tasks.next().await {
-        if let Ok(results) = result_set {
-            all_results.extend(results);
+        match result_set {
+            Ok(results) => {
+                println!("[✔️] Worker completed with {} results", results.len());
+                all_results.extend(results);
+            }
+            Err(e) => println!("[⚠️] Worker error: {}", e),
         }
     }
 
+    println!("\n[🏁] Crawling completed!");
+    println!("[📋] Total pages crawled: {}", all_results.len());
+    println!("[💾] Saving results to {}...", output_dir);
+
     for res in all_results {
-        println!("URL: {}", res.url);
-        if let Some(title) = res.title {
-            println!("Title: {}", title);
-        }
-        if let Some(text) = res.text {
-            println!("Text (truncated): {:.100}...", text);
-        }
-        println!("Links: {}", res.links.len());
-        println!("Images: {}", res.images.len());
-        if !res.pattern_matches.is_empty() {
-            println!("Pattern Matches: {:?}", res.pattern_matches);
-        }
-        println!("---");
+        let filename = sanitize_filename(&res.url);
+        let file_path = Path::new(&output_dir).join(filename);
+
+        tokio::fs::write(&file_path, &res.html)
+            .await
+            .unwrap_or_else(|_| panic!("Failed to write: {}", file_path.display()));
+
+        println!("[💾] Saved: {}", file_path.display());
     }
+
+    println!("[✅] Done! Website stored in: {}", output_dir);
 }