Add progress logging and explicit request-error reporting to crawl_url

This commit is contained in:
rattatwinko
2025-05-02 17:43:33 +02:00
parent 1bfa72db56
commit 28bbbb6e7d

View File

@@ -27,9 +27,12 @@ async fn crawl_url(
pattern_regex: Option<Regex>, pattern_regex: Option<Regex>,
active_count: Arc<AtomicUsize>, active_count: Arc<AtomicUsize>,
) -> Option<CrawlResult> { ) -> Option<CrawlResult> {
println!("[🌐] Attempting: {}", url);
if { if {
let mut visited = visited.lock().unwrap(); let mut visited = visited.lock().unwrap();
if visited.contains(&url) { if visited.contains(&url) {
println!("[⏭️] Already visited: {}", url);
return None; return None;
} }
visited.insert(url.clone()); visited.insert(url.clone());
@@ -39,26 +42,39 @@ async fn crawl_url(
return None; return None;
} }
let res = client.get(&url).send().await.ok()?.text().await.ok()?; let res = match client.get(&url).send().await {
Ok(response) => {
println!("[✅] Success: {}", url);
response.text().await.ok()?
}
Err(e) => {
println!("[❌] Failed: {} - {}", url, e);
return None;
}
};
let url_clone = url.clone(); let url_clone = url.clone();
let pattern_clone = pattern_regex.clone(); let pattern_clone = pattern_regex.clone();
// Move heavy processing to blocking thread pool println!("[🔍] Processing: {}", url);
let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || { let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
let document = Html::parse_document(&res); let document = Html::parse_document(&res);
// Title extraction
let title_selector = Selector::parse("title").unwrap(); let title_selector = Selector::parse("title").unwrap();
let title = document let title = document
.select(&title_selector) .select(&title_selector)
.next() .next()
.map(|e| e.inner_html()); .map(|e| e.inner_html());
// Body text extraction
let body_selector = Selector::parse("body").unwrap(); let body_selector = Selector::parse("body").unwrap();
let text = document let text = document
.select(&body_selector) .select(&body_selector)
.next() .next()
.map(|e| e.text().collect::<Vec<_>>().join(" ")); .map(|e| e.text().collect::<Vec<_>>().join(" "));
// Link extraction
let link_selector = Selector::parse("a[href]").unwrap(); let link_selector = Selector::parse("a[href]").unwrap();
let links: Vec<String> = document let links: Vec<String> = document
.select(&link_selector) .select(&link_selector)
@@ -66,6 +82,7 @@ async fn crawl_url(
.map(|s| s.to_string()) .map(|s| s.to_string())
.collect(); .collect();
// Image extraction
let img_selector = Selector::parse("img[src]").unwrap(); let img_selector = Selector::parse("img[src]").unwrap();
let images: Vec<String> = document let images: Vec<String> = document
.select(&img_selector) .select(&img_selector)
@@ -73,6 +90,7 @@ async fn crawl_url(
.map(|s| s.to_string()) .map(|s| s.to_string())
.collect(); .collect();
// Pattern matching
let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) { let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
regex regex
.captures_iter(text) .captures_iter(text)
@@ -87,13 +105,17 @@ async fn crawl_url(
.await .await
.ok()?; .ok()?;
// Queue new links
{ {
let mut queue = to_visit.lock().unwrap(); let mut queue = to_visit.lock().unwrap();
let mut new_links = 0;
for link in &links { for link in &links {
if link.starts_with("http") { if link.starts_with("http") {
queue.push_back(link.clone()); queue.push_back(link.clone());
new_links += 1;
} }
} }
println!("[🔄] Discovered {} new links from {}", new_links, url);
} }
active_count.fetch_sub(1, Ordering::SeqCst); active_count.fetch_sub(1, Ordering::SeqCst);