From 28bbbb6e7dcd6e959a52ca98e5035c75a3af1e8e Mon Sep 17 00:00:00 2001 From: rattatwinko Date: Fri, 2 May 2025 17:43:33 +0200 Subject: [PATCH] some more bullshit --- src/main.rs | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 399bf6e..5da37a8 100644 --- a/src/main.rs +++ b/src/main.rs @@ -27,9 +27,12 @@ async fn crawl_url( pattern_regex: Option, active_count: Arc, ) -> Option { + println!("[🌐] Attempting: {}", url); + if { let mut visited = visited.lock().unwrap(); if visited.contains(&url) { + println!("[⏭️] Already visited: {}", url); return None; } visited.insert(url.clone()); @@ -39,26 +42,39 @@ async fn crawl_url( return None; } - let res = client.get(&url).send().await.ok()?.text().await.ok()?; + let res = match client.get(&url).send().await { + Ok(response) => { + println!("[✅] Success: {}", url); + response.text().await.ok()? + } + Err(e) => { + println!("[❌] Failed: {} - {}", url, e); + return None; + } + }; + let url_clone = url.clone(); let pattern_clone = pattern_regex.clone(); - // Move heavy processing to blocking thread pool + println!("[🔍] Processing: {}", url); let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || { let document = Html::parse_document(&res); + // Title extraction let title_selector = Selector::parse("title").unwrap(); let title = document .select(&title_selector) .next() .map(|e| e.inner_html()); + // Body text extraction let body_selector = Selector::parse("body").unwrap(); let text = document .select(&body_selector) .next() .map(|e| e.text().collect::>().join(" ")); + // Link extraction let link_selector = Selector::parse("a[href]").unwrap(); let links: Vec = document .select(&link_selector) @@ -66,6 +82,7 @@ async fn crawl_url( .map(|s| s.to_string()) .collect(); + // Image extraction let img_selector = Selector::parse("img[src]").unwrap(); let images: Vec = document .select(&img_selector) @@ -73,6 +90,7 @@ async fn crawl_url( .map(|s| s.to_string()) .collect(); + // Pattern matching let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) { regex .captures_iter(text) @@ -87,13 +105,17 @@ async fn crawl_url( .await .ok()?; + // Queue new links { let mut queue = to_visit.lock().unwrap(); + let mut new_links = 0; for link in &links { if link.starts_with("http") { queue.push_back(link.clone()); + new_links += 1; } } + println!("[🔄] Discovered {} new links from {}", new_links, url); } active_count.fetch_sub(1, Ordering::SeqCst);