crawler/src/main.rs

use std::collections::{HashSet, VecDeque};
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicUsize, Ordering};
use clap::{App, Arg};
use futures::stream::{FuturesUnordered, StreamExt};
use regex::Regex;
use reqwest::Client;
use scraper::{Html, Selector};
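
// Usage sketch (the binary name "crawler" is assumed from the crate path
// above):
//
//     cargo run --release -- https://example.com -p "TODO|FIXME" -c 10
//
// Assumed Cargo.toml dependencies; versions are a guess inferred from the
// APIs in use (e.g. clap's `App`/`with_name` builder is the 2.x interface):
//
//     clap = "2"
//     futures = "0.3"
//     regex = "1"
//     reqwest = "0.11"
//     scraper = "0.13"
//     tokio = { version = "1", features = ["full"] }

/// Everything extracted from a single crawled page.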
#[derive(Debug, Clone)]
struct CrawlResult {
url: String,
title: Option<String>,
text: Option<String>,
links: Vec<String>,
images: Vec<String>,
pattern_matches: Vec<String>,
}
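
/// Fetches `url`, parses it off the async runtime, enqueues every absolute
/// link found on the page, and returns the extracted data; returns `None`
/// for already-visited URLs and on fetch or parse failure.
///
/// `active_count` tracks in-flight crawls so the workers in `main` can
/// distinguish "queue momentarily empty" from "crawl finished".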
async fn crawl_url(
url: String,
client: Arc<Client>,
visited: Arc<Mutex<HashSet<String>>>,
to_visit: Arc<Mutex<VecDeque<String>>>,
pattern_regex: Option<Regex>,
active_count: Arc<AtomicUsize>,
) -> Option<CrawlResult> {
    // Claim the URL: bail out if another worker already visited it,
    // otherwise mark it visited and register this crawl as in-flight.
    {
        let mut visited = visited.lock().unwrap();
        if visited.contains(&url) {
            return None;
        }
        visited.insert(url.clone());
        active_count.fetch_add(1, Ordering::SeqCst);
    }
    // Fetch the page body. Any failure must release the in-flight counter
    // before bailing out, or the workers in `main` would never observe
    // `active_count == 0` and the crawl would hang on the first dead link.
    let res = match client.get(&url).send().await {
        Ok(resp) => resp.text().await.ok(),
        Err(_) => None,
    };
    let res = match res {
        Some(body) => body,
        None => {
            active_count.fetch_sub(1, Ordering::SeqCst);
            return None;
        }
    };
let url_clone = url.clone();
let pattern_clone = pattern_regex.clone();
    // HTML parsing is CPU-bound, so run it on the blocking thread pool
    // rather than stalling the async runtime.
    let parsed = tokio::task::spawn_blocking(move || {
let document = Html::parse_document(&res);
let title_selector = Selector::parse("title").unwrap();
let title = document
.select(&title_selector)
.next()
.map(|e| e.inner_html());
let body_selector = Selector::parse("body").unwrap();
let text = document
.select(&body_selector)
.next()
.map(|e| e.text().collect::<Vec<_>>().join(" "));
let link_selector = Selector::parse("a[href]").unwrap();
let links: Vec<String> = document
.select(&link_selector)
.filter_map(|e| e.value().attr("href"))
.map(|s| s.to_string())
.collect();
let img_selector = Selector::parse("img[src]").unwrap();
let images: Vec<String> = document
.select(&img_selector)
.filter_map(|e| e.value().attr("src"))
.map(|s| s.to_string())
.collect();
let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
regex
.captures_iter(text)
.filter_map(|cap| cap.get(0).map(|m| m.as_str().to_string()))
.collect()
} else {
vec![]
};
(title, text, links, images, pattern_matches)
    })
    .await;
    // A panicked blocking task must also release the in-flight counter,
    // for the same reason as the fetch error path above.
    let (title, text, links, images, pattern_matches) = match parsed {
        Ok(parts) => parts,
        Err(_) => {
            active_count.fetch_sub(1, Ordering::SeqCst);
            return None;
        }
    };
{
let mut queue = to_visit.lock().unwrap();
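        // Only links that are already absolute http(s) URLs are enqueued;
        // relative links are skipped, since nothing here resolves them
        // against the page's base URL.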
for link in &links {
if link.starts_with("http") {
queue.push_back(link.clone());
}
}
}
active_count.fetch_sub(1, Ordering::SeqCst);
Some(CrawlResult {
url: url_clone,
title,
text,
links,
images,
pattern_matches,
})
}
#[tokio::main]
async fn main() {
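    // Note: this is the clap 2.x builder API (`App`, `Arg::with_name`,
    // string-based `short`); clap 3+ renamed these.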
let matches = App::new("Web Crawler")
.version("1.0")
.about("Multi-threaded web crawler with pattern matching")
.arg(
Arg::with_name("url")
.help("Starting URL")
.required(true),
)
.arg(
Arg::with_name("pattern")
.short("p")
.long("pattern")
.help("Regex pattern to match in page text")
.takes_value(true),
)
.arg(
Arg::with_name("concurrency")
.short("c")
.long("concurrency")
.help("Max concurrent requests")
.takes_value(true),
)
.get_matches();
let start_url = matches.value_of("url").unwrap().to_string();
let pattern = matches.value_of("pattern");
let concurrency: usize = matches
.value_of("concurrency")
.unwrap_or("20")
.parse()
.unwrap_or(20);
let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
let client = Arc::new(Client::new());
let visited = Arc::new(Mutex::new(HashSet::new()));
let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
let active_count = Arc::new(AtomicUsize::new(0));
let mut tasks = FuturesUnordered::new();
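    // Spawn a fixed pool of `concurrency` workers. An empty queue alone is
    // not a stop signal, because in-flight crawls may still push new links.
    // There is no depth or domain limit, so the crawl only ends once no new
    // absolute links turn up.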
for _ in 0..concurrency {
let client = client.clone();
let visited = visited.clone();
let to_visit = to_visit.clone();
let pattern_regex = pattern_regex.clone();
let active_count = active_count.clone();
tasks.push(tokio::spawn(async move {
let mut results = vec![];
loop {
let next_url = {
let mut queue = to_visit.lock().unwrap();
queue.pop_front()
};
if let Some(url) = next_url {
if let Some(result) = crawl_url(
url,
client.clone(),
visited.clone(),
to_visit.clone(),
pattern_regex.clone(),
active_count.clone(),
)
.await
{
results.push(result);
}
} else {
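                // Queue is empty: if nothing is in flight either, the
                // crawl is done; otherwise wait briefly for other workers
                // to push the links they are still extracting.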
if active_count.load(Ordering::SeqCst) == 0 {
break;
}
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
}
}
results
}));
}
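    // Drain the worker tasks as they finish and merge their per-worker
    // result lists.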
let mut all_results = vec![];
while let Some(result_set) = tasks.next().await {
if let Ok(results) = result_set {
all_results.extend(results);
}
}
for res in all_results {
println!("URL: {}", res.url);
if let Some(title) = res.title {
println!("Title: {}", title);
}
if let Some(text) = res.text {
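            // The `{:.100}` precision caps the printed body text at 100
            // characters.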
println!("Text (truncated): {:.100}...", text);
}
println!("Links: {}", res.links.len());
println!("Images: {}", res.images.len());
if !res.pattern_matches.is_empty() {
println!("Pattern Matches: {:?}", res.pattern_matches);
}
println!("---");
}
}