some more bullshit 2

rattatwinko
2025-05-02 17:53:24 +02:00
parent 28bbbb6e7d
commit 995741ecad


@@ -1,4 +1,5 @@
use std::collections::{HashSet, VecDeque};
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicUsize, Ordering};
@@ -8,10 +9,12 @@ use regex::Regex;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio;
use url::Url;
#[derive(Debug, Clone)]
struct CrawlResult {
    url: String,
    html: String,
    title: Option<String>,
    text: Option<String>,
    links: Vec<String>,
@@ -53,6 +56,7 @@ async fn crawl_url(
        }
    };
    let html_content = res.clone();
    let url_clone = url.clone();
    let pattern_clone = pattern_regex.clone();
@@ -60,21 +64,18 @@ async fn crawl_url(
    let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
        let document = Html::parse_document(&res);
        // Title extraction
        let title_selector = Selector::parse("title").unwrap();
        let title = document
            .select(&title_selector)
            .next()
            .map(|e| e.inner_html());
        // Body text extraction
        let body_selector = Selector::parse("body").unwrap();
        let text = document
            .select(&body_selector)
            .next()
            .map(|e| e.text().collect::<Vec<_>>().join(" "));
        // Link extraction
        let link_selector = Selector::parse("a[href]").unwrap();
        let links: Vec<String> = document
            .select(&link_selector)
@@ -82,7 +83,6 @@ async fn crawl_url(
            .map(|s| s.to_string())
            .collect();
        // Image extraction
        let img_selector = Selector::parse("img[src]").unwrap();
        let images: Vec<String> = document
            .select(&img_selector)
@@ -90,7 +90,6 @@ async fn crawl_url(
            .map(|s| s.to_string())
            .collect();
        // Pattern matching
        let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
            regex
                .captures_iter(text)
@@ -105,7 +104,6 @@ async fn crawl_url(
    .await
    .ok()?;
    // Queue new links
    {
        let mut queue = to_visit.lock().unwrap();
        let mut new_links = 0;
@@ -122,6 +120,7 @@ async fn crawl_url(
    Some(CrawlResult {
        url: url_clone,
        html: html_content,
        title,
        text,
        links,
@@ -130,11 +129,30 @@ async fn crawl_url(
    })
}

fn sanitize_filename(url: &str) -> String {
    let parsed = Url::parse(url).unwrap();
    let mut filename = parsed.path().replace('/', "__");
    if filename.is_empty() || filename == "__" {
        filename = "index".to_string();
    }
    filename = filename.replace(|c: char| !c.is_ascii_alphanumeric(), "_");
    if filename.len() > 50 {
        filename = filename[..50].to_string();
    }
    format!("{}.html", filename.trim_matches('_'))
}

#[tokio::main]
async fn main() {
    println!("[🚀] Starting web crawler...");
    let matches = App::new("Web Crawler")
        .version("1.0")
        .about("Multi-threaded web crawler with pattern matching")
        .about("Multi-threaded web crawler with pattern matching and website storage")
        .arg(
            Arg::with_name("url")
                .help("Starting URL")
@@ -167,12 +185,19 @@ async fn main() {
    let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
    let client = Arc::new(Client::new());
    let parsed_url = Url::parse(&start_url).expect("Invalid URL");
    let domain = parsed_url.host_str().unwrap().replace('.', "_");
    let output_dir = format!("./{}", domain);
    std::fs::create_dir_all(&output_dir).expect("Failed to create output directory");
    println!("[📂] Created output directory: {}", output_dir);
    let visited = Arc::new(Mutex::new(HashSet::new()));
    let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
    let active_count = Arc::new(AtomicUsize::new(0));
    let mut tasks = FuturesUnordered::new();
    println!("[🔄] Initializing {} worker tasks...", concurrency);
    for _ in 0..concurrency {
        let client = client.clone();
        let visited = visited.clone();
@@ -189,6 +214,7 @@ async fn main() {
                };
                if let Some(url) = next_url {
                    println!("[📥] Processing: {}", url);
                    if let Some(result) = crawl_url(
                        url,
                        client.clone(),
@@ -202,9 +228,12 @@ async fn main() {
                        results.push(result);
                    }
                } else {
                    if active_count.load(Ordering::SeqCst) == 0 {
                    let active = active_count.load(Ordering::SeqCst);
                    if active == 0 {
                        println!("[🛑] Worker detected completion");
                        break;
                    }
                    println!("[⏳] Worker waiting (active: {})...", active);
                    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
                }
            }
@@ -212,26 +241,32 @@ async fn main() {
        }));
    }

    println!("[📊] Crawling in progress...");
    let mut all_results = vec![];
    while let Some(result_set) = tasks.next().await {
        if let Ok(results) = result_set {
            all_results.extend(results);
        match result_set {
            Ok(results) => {
                println!("[✔️] Worker completed with {} results", results.len());
                all_results.extend(results);
            }
            Err(e) => println!("[⚠️] Worker error: {}", e),
        }
    }

    println!("\n[🏁] Crawling completed!");
    println!("[📋] Total pages crawled: {}", all_results.len());
    println!("[💾] Saving results to {}...", output_dir);
    for res in all_results {
        println!("URL: {}", res.url);
        if let Some(title) = res.title {
            println!("Title: {}", title);
        }
        if let Some(text) = res.text {
            println!("Text (truncated): {:.100}...", text);
        }
        println!("Links: {}", res.links.len());
        println!("Images: {}", res.images.len());
        if !res.pattern_matches.is_empty() {
            println!("Pattern Matches: {:?}", res.pattern_matches);
        }
        println!("---");
        let filename = sanitize_filename(&res.url);
        let file_path = Path::new(&output_dir).join(filename);
        tokio::fs::write(&file_path, &res.html)
            .await
            .unwrap_or_else(|_| panic!("Failed to write: {}", file_path.display()));
        println!("[💾] Saved: {}", file_path.display());
    }
    println!("[✅] Done! Website stored in: {}", output_dir);
}
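
For reference, a small test sketch of how the new sanitize_filename helper maps URLs to output filenames, based only on the implementation shown in this diff. It is not part of the commit: the example URLs are made up, and the test assumes it sits in the same module as the helper (hence use super::*).

// Hypothetical spot-check of sanitize_filename, assuming the implementation above:
// "/" (or an empty path) falls back to "index", each '/' becomes "__", any other
// non-alphanumeric ASCII becomes '_', the name is capped at 50 chars, and
// leading/trailing underscores are trimmed before ".html" is appended.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sanitize_filename_flattens_url_paths() {
        assert_eq!(sanitize_filename("https://example.com/"), "index.html");
        assert_eq!(sanitize_filename("https://example.com/about/team"), "about__team.html");
        assert_eq!(sanitize_filename("https://example.com/blog/post-1"), "blog__post_1.html");
    }
}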