use std::collections::{HashSet, VecDeque};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};

use clap::{App, Arg};
use futures::stream::{FuturesUnordered, StreamExt};
use regex::Regex;
use reqwest::Client;
use scraper::{Html, Selector};

#[derive(Debug, Clone)]
struct CrawlResult {
    url: String,
    title: Option<String>,
    text: Option<String>,
    links: Vec<String>,
    images: Vec<String>,
    pattern_matches: Vec<String>,
}

async fn crawl_url(
    url: String,
    client: Arc<Client>,
    visited: Arc<Mutex<HashSet<String>>>,
    to_visit: Arc<Mutex<VecDeque<String>>>,
    pattern_regex: Option<Regex>,
    active_count: Arc<AtomicUsize>,
) -> Option<CrawlResult> {
    // Claim the URL: skip it if another worker already crawled it, otherwise
    // record it and mark one crawl as in flight.
    {
        let mut visited = visited.lock().unwrap();
        if visited.contains(&url) {
            return None;
        }
        visited.insert(url.clone());
        active_count.fetch_add(1, Ordering::SeqCst);
    }

    // Run the fallible fetch/parse work in an inner block so the in-flight
    // counter is always decremented afterwards, even on early failures.
    let outcome = async {
        let res = client.get(&url).send().await.ok()?.text().await.ok()?;
        let url_clone = url.clone();
        let pattern_clone = pattern_regex.clone();

        // Move heavy HTML parsing and regex matching to the blocking thread pool.
        let (title, text, links, images, pattern_matches) =
            tokio::task::spawn_blocking(move || {
                let document = Html::parse_document(&res);

                let title_selector = Selector::parse("title").unwrap();
                let title = document
                    .select(&title_selector)
                    .next()
                    .map(|e| e.inner_html());

                let body_selector = Selector::parse("body").unwrap();
                let text = document
                    .select(&body_selector)
                    .next()
                    .map(|e| e.text().collect::<Vec<_>>().join(" "));

                let link_selector = Selector::parse("a[href]").unwrap();
                let links: Vec<String> = document
                    .select(&link_selector)
                    .filter_map(|e| e.value().attr("href"))
                    .map(|s| s.to_string())
                    .collect();

                let img_selector = Selector::parse("img[src]").unwrap();
                let images: Vec<String> = document
                    .select(&img_selector)
                    .filter_map(|e| e.value().attr("src"))
                    .map(|s| s.to_string())
                    .collect();

                let pattern_matches: Vec<String> =
                    if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
                        regex
                            .captures_iter(text)
                            .filter_map(|cap| cap.get(0).map(|m| m.as_str().to_string()))
                            .collect()
                    } else {
                        vec![]
                    };

                (title, text, links, images, pattern_matches)
            })
            .await
            .ok()?;

        // Enqueue absolute links for other workers to pick up.
        {
            let mut queue = to_visit.lock().unwrap();
            for link in &links {
                if link.starts_with("http") {
                    queue.push_back(link.clone());
                }
            }
        }

        Some(CrawlResult {
            url: url_clone,
            title,
            text,
            links,
            images,
            pattern_matches,
        })
    }
    .await;

    // Always mark the crawl as finished so idle workers can eventually observe
    // `active_count == 0` and shut down instead of polling forever.
    active_count.fetch_sub(1, Ordering::SeqCst);
    outcome
}

#[tokio::main]
async fn main() {
    let matches = App::new("Web Crawler")
        .version("1.0")
        .about("Multi-threaded web crawler with pattern matching")
        .arg(Arg::with_name("url").help("Starting URL").required(true))
        .arg(
            Arg::with_name("pattern")
                .short("p")
                .long("pattern")
                .help("Regex pattern to match in page text")
                .takes_value(true),
        )
        .arg(
            Arg::with_name("concurrency")
                .short("c")
                .long("concurrency")
                .help("Max concurrent requests")
                .takes_value(true),
        )
        .get_matches();

    let start_url = matches.value_of("url").unwrap().to_string();
    let pattern = matches.value_of("pattern");
    let concurrency: usize = matches
        .value_of("concurrency")
        .unwrap_or("20")
        .parse()
        .unwrap_or(20);
    let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));

    let client = Arc::new(Client::new());
    let visited = Arc::new(Mutex::new(HashSet::new()));
    let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
    let active_count = Arc::new(AtomicUsize::new(0));

    // Spawn a fixed pool of worker tasks that pull URLs from the shared queue.
    let mut tasks = FuturesUnordered::new();
    for _ in 0..concurrency {
        let client = client.clone();
        let visited = visited.clone();
        let to_visit = to_visit.clone();
        let pattern_regex = pattern_regex.clone();
        let active_count = active_count.clone();
        tasks.push(tokio::spawn(async move {
            let mut results = vec![];
            loop {
                // Pop the next URL in a short scope so the lock is not held
                // across an await point.
                let next_url = {
                    let mut queue = to_visit.lock().unwrap();
                    queue.pop_front()
                };
                if let Some(url) = next_url {
                    if let Some(result) = crawl_url(
                        url,
                        client.clone(),
                        visited.clone(),
                        to_visit.clone(),
                        pattern_regex.clone(),
                        active_count.clone(),
                    )
                    .await
                    {
                        results.push(result);
                    }
                } else if active_count.load(Ordering::SeqCst) == 0 {
                    // Queue is empty and no crawl is in flight: we are done.
                    break;
                } else {
                    // Queue is empty but in-flight crawls may still add links; back off briefly.
                    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
                }
            }
            results
        }));
    }

    // Gather results from all workers as they finish.
    let mut all_results = vec![];
    while let Some(result_set) = tasks.next().await {
        if let Ok(results) = result_set {
            all_results.extend(results);
        }
    }

    for res in all_results {
        println!("URL: {}", res.url);
        if let Some(title) = res.title {
            println!("Title: {}", title);
        }
        if let Some(text) = res.text {
            println!("Text (truncated): {:.100}...", text);
        }
        println!("Links: {}", res.links.len());
        println!("Images: {}", res.images.len());
        if !res.pattern_matches.is_empty() {
            println!("Pattern Matches: {:?}", res.pattern_matches);
        }
        println!("---");
    }
}
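// Build/usage notes (a sketch, not part of the original source): the code above
// targets the clap 2.x `App`/`Arg::with_name` API and the async reqwest client,
// so Cargo.toml needs roughly the dependencies below. The exact crate versions
// listed here are assumptions, not taken from this repository:
//
//   [dependencies]
//   clap = "2"
//   futures = "0.3"
//   regex = "1"
//   reqwest = "0.11"
//   scraper = "0.17"
//   tokio = { version = "1", features = ["macros", "rt-multi-thread", "time"] }
//
// Example run (hypothetical arguments):
//
//   cargo run --release -- https://example.com --pattern "error" --concurrency 10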