Add status logging to crawl_url: attempt, visited-skip, fetch success/failure, processing, and new-link discovery count

This commit is contained in:
rattatwinko
2025-05-02 17:43:33 +02:00
parent 1bfa72db56
commit 28bbbb6e7d

View File

@@ -27,9 +27,12 @@ async fn crawl_url(
pattern_regex: Option<Regex>,
active_count: Arc<AtomicUsize>,
) -> Option<CrawlResult> {
println!("[🌐] Attempting: {}", url);
if {
let mut visited = visited.lock().unwrap();
if visited.contains(&url) {
println!("[⏭️] Already visited: {}", url);
return None;
}
visited.insert(url.clone());
@@ -39,26 +42,39 @@ async fn crawl_url(
return None;
}
let res = client.get(&url).send().await.ok()?.text().await.ok()?;
let res = match client.get(&url).send().await {
Ok(response) => {
println!("[✅] Success: {}", url);
response.text().await.ok()?
}
Err(e) => {
println!("[❌] Failed: {} - {}", url, e);
return None;
}
};
let url_clone = url.clone();
let pattern_clone = pattern_regex.clone();
// Move heavy processing to blocking thread pool
println!("[🔍] Processing: {}", url);
let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
let document = Html::parse_document(&res);
// Title extraction
let title_selector = Selector::parse("title").unwrap();
let title = document
.select(&title_selector)
.next()
.map(|e| e.inner_html());
// Body text extraction
let body_selector = Selector::parse("body").unwrap();
let text = document
.select(&body_selector)
.next()
.map(|e| e.text().collect::<Vec<_>>().join(" "));
// Link extraction
let link_selector = Selector::parse("a[href]").unwrap();
let links: Vec<String> = document
.select(&link_selector)
@@ -66,6 +82,7 @@ async fn crawl_url(
.map(|s| s.to_string())
.collect();
// Image extraction
let img_selector = Selector::parse("img[src]").unwrap();
let images: Vec<String> = document
.select(&img_selector)
@@ -73,6 +90,7 @@ async fn crawl_url(
.map(|s| s.to_string())
.collect();
// Pattern matching
let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
regex
.captures_iter(text)
@@ -87,13 +105,17 @@ async fn crawl_url(
.await
.ok()?;
// Queue new links
{
let mut queue = to_visit.lock().unwrap();
let mut new_links = 0;
for link in &links {
if link.starts_with("http") {
queue.push_back(link.clone());
new_links += 1;
}
}
println!("[🔄] Discovered {} new links from {}", new_links, url);
}
active_count.fetch_sub(1, Ordering::SeqCst);