From 995741ecad01730e70a8e0384da7d24115643e27 Mon Sep 17 00:00:00 2001
From: rattatwinko
Date: Fri, 2 May 2025 17:53:24 +0200
Subject: [PATCH] some more bullshit 2

---
 src/main.rs | 81 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 58 insertions(+), 23 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index 5da37a8..e4bae70 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,4 +1,5 @@
 use std::collections::{HashSet, VecDeque};
+use std::path::Path;
 use std::sync::{Arc, Mutex};
 use std::sync::atomic::{AtomicUsize, Ordering};
 
@@ -8,10 +9,12 @@ use regex::Regex;
 use reqwest::Client;
 use scraper::{Html, Selector};
 use tokio;
+use url::Url;
 
 #[derive(Debug, Clone)]
 struct CrawlResult {
     url: String,
+    html: String,
     title: Option<String>,
     text: Option<String>,
     links: Vec<String>,
@@ -53,6 +56,7 @@
         }
     };
 
+    let html_content = res.clone();
     let url_clone = url.clone();
     let pattern_clone = pattern_regex.clone();
 
@@ -60,21 +64,18 @@
     let (title, text, links, images, pattern_matches) =
         tokio::task::spawn_blocking(move || {
             let document = Html::parse_document(&res);
-            // Title extraction
             let title_selector = Selector::parse("title").unwrap();
             let title = document
                 .select(&title_selector)
                 .next()
                 .map(|e| e.inner_html());
 
-            // Body text extraction
             let body_selector = Selector::parse("body").unwrap();
             let text = document
                 .select(&body_selector)
                 .next()
                 .map(|e| e.text().collect::<Vec<_>>().join(" "));
 
-            // Link extraction
             let link_selector = Selector::parse("a[href]").unwrap();
             let links: Vec<String> = document
                 .select(&link_selector)
@@ -82,7 +83,6 @@
                 .map(|s| s.to_string())
                 .collect();
 
-            // Image extraction
             let img_selector = Selector::parse("img[src]").unwrap();
             let images: Vec<String> = document
                 .select(&img_selector)
@@ -90,7 +90,6 @@
                 .map(|s| s.to_string())
                 .collect();
 
-            // Pattern matching
             let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
                 regex
                     .captures_iter(text)
@@ -105,7 +104,6 @@
         .await
         .ok()?;
 
-    // Queue new links
     {
         let mut queue = to_visit.lock().unwrap();
         let mut new_links = 0;
@@ -122,6 +120,7 @@
 
     Some(CrawlResult {
         url: url_clone,
+        html: html_content,
        title,
         text,
         links,
@@ -130,11 +129,30 @@
     })
 }
 
+fn sanitize_filename(url: &str) -> String {
+    let parsed = Url::parse(url).unwrap();
+    let mut filename = parsed.path().replace('/', "__");
+
+    if filename.is_empty() || filename == "__" {
+        filename = "index".to_string();
+    }
+
+    filename = filename.replace(|c: char| !c.is_ascii_alphanumeric(), "_");
+
+    if filename.len() > 50 {
+        filename = filename[..50].to_string();
+    }
+
+    format!("{}.html", filename.trim_matches('_'))
+}
+
 #[tokio::main]
 async fn main() {
+    println!("[🚀] Starting web crawler...");
+
     let matches = App::new("Web Crawler")
         .version("1.0")
-        .about("Multi-threaded web crawler with pattern matching")
+        .about("Multi-threaded web crawler with pattern matching and website storage")
         .arg(
             Arg::with_name("url")
                 .help("Starting URL")
@@ -167,12 +185,19 @@
     let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
     let client = Arc::new(Client::new());
 
+    let parsed_url = Url::parse(&start_url).expect("Invalid URL");
+    let domain = parsed_url.host_str().unwrap().replace('.', "_");
+    let output_dir = format!("./{}", domain);
+    std::fs::create_dir_all(&output_dir).expect("Failed to create output directory");
+    println!("[📂] Created output directory: {}", output_dir);
+
     let visited = Arc::new(Mutex::new(HashSet::new()));
     let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
     let active_count = Arc::new(AtomicUsize::new(0));
     let mut tasks = FuturesUnordered::new();
 
+    println!("[🔄] Initializing {} worker tasks...", concurrency);
     for _ in 0..concurrency {
         let client = client.clone();
         let visited = visited.clone();
@@ -189,6 +214,7 @@
             };
 
             if let Some(url) = next_url {
+                println!("[📥] Processing: {}", url);
                 if let Some(result) = crawl_url(
                     url,
                     client.clone(),
@@ -202,9 +228,12 @@
                         results.push(result);
                     }
                 } else {
-                    if active_count.load(Ordering::SeqCst) == 0 {
+                    let active = active_count.load(Ordering::SeqCst);
+                    if active == 0 {
+                        println!("[🛑] Worker detected completion");
                         break;
                     }
+                    println!("[⏳] Worker waiting (active: {})...", active);
                     tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
                 }
             }
@@ -212,26 +241,32 @@
         }));
     }
 
+    println!("[📊] Crawling in progress...");
     let mut all_results = vec![];
     while let Some(result_set) = tasks.next().await {
-        if let Ok(results) = result_set {
-            all_results.extend(results);
+        match result_set {
+            Ok(results) => {
+                println!("[✔️] Worker completed with {} results", results.len());
+                all_results.extend(results);
+            }
+            Err(e) => println!("[⚠️] Worker error: {}", e),
         }
     }
 
+    println!("\n[🏁] Crawling completed!");
+    println!("[📋] Total pages crawled: {}", all_results.len());
+    println!("[💾] Saving results to {}...", output_dir);
+
     for res in all_results {
-        println!("URL: {}", res.url);
-        if let Some(title) = res.title {
-            println!("Title: {}", title);
-        }
-        if let Some(text) = res.text {
-            println!("Text (truncated): {:.100}...", text);
-        }
-        println!("Links: {}", res.links.len());
-        println!("Images: {}", res.images.len());
-        if !res.pattern_matches.is_empty() {
-            println!("Pattern Matches: {:?}", res.pattern_matches);
-        }
-        println!("---");
+        let filename = sanitize_filename(&res.url);
+        let file_path = Path::new(&output_dir).join(filename);
+
+        tokio::fs::write(&file_path, &res.html)
+            .await
+            .unwrap_or_else(|_| panic!("Failed to write: {}", file_path.display()));
+
+        println!("[💾] Saved: {}", file_path.display());
     }
+
+    println!("[✅] Done! Website stored in: {}", output_dir);
 }