Add progress logging to crawl_url

2025-05-02 17:43:33 +02:00
parent 1bfa72db56
commit 28bbbb6e7d
1 changed files with 24 additions and 2 deletions
--- a/src/main.rs
+++ b/src/main.rs
@@ -27,9 +27,12 @@ async fn crawl_url(
    pattern_regex: Option<Regex>,
    active_count: Arc<AtomicUsize>,
 ) -> Option<CrawlResult> {
    println!("[🌐] Attempting: {}", url);
    if {
        let mut visited = visited.lock().unwrap();
        if visited.contains(&url) {
            println!("[⏭️] Already visited: {}", url);
            return None;
        }
        visited.insert(url.clone());
@@ -39,26 +42,39 @@ async fn crawl_url(
        return None;
    }
-    let res = client.get(&url).send().await.ok()?.text().await.ok()?;
+    let res = match client.get(&url).send().await {
        Ok(response) => {
            println!("[✅] Success: {}", url);
            response.text().await.ok()?
        }
        Err(e) => {
            println!("[❌] Failed: {} - {}", url, e);
            return None;
        }
    };
    let url_clone = url.clone();
    let pattern_clone = pattern_regex.clone();
-    // Move heavy processing to blocking thread pool
+    println!("[🔍] Processing: {}", url);
    let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
        let document = Html::parse_document(&res);
        // Title extraction
        let title_selector = Selector::parse("title").unwrap();
        let title = document
            .select(&title_selector)
            .next()
            .map(|e| e.inner_html());
        // Body text extraction
        let body_selector = Selector::parse("body").unwrap();
        let text = document
            .select(&body_selector)
            .next()
            .map(|e| e.text().collect::<Vec<_>>().join(" "));
        // Link extraction
        let link_selector = Selector::parse("a[href]").unwrap();
        let links: Vec<String> = document
            .select(&link_selector)
@@ -66,6 +82,7 @@ async fn crawl_url(
            .map(|s| s.to_string())
            .collect();
        // Image extraction
        let img_selector = Selector::parse("img[src]").unwrap();
        let images: Vec<String> = document
            .select(&img_selector)
@@ -73,6 +90,7 @@ async fn crawl_url(
            .map(|s| s.to_string())
            .collect();
        // Pattern matching
        let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
            regex
                .captures_iter(text)
@@ -87,13 +105,17 @@ async fn crawl_url(
        .await
        .ok()?;
    // Queue new links
    {
        let mut queue = to_visit.lock().unwrap();
        let mut new_links = 0;
        for link in &links {
            if link.starts_with("http") {
                queue.push_back(link.clone());
                new_links += 1;
            }
        }
        println!("[🔄] Discovered {} new links from {}", new_links, url);
    }
    active_count.fetch_sub(1, Ordering::SeqCst);