some more bullshit 2

rattatwinko
2025-05-02 17:53:24 +02:00
parent 28bbbb6e7d
commit 995741ecad


@@ -1,4 +1,5 @@
 use std::collections::{HashSet, VecDeque};
+use std::path::Path;
 use std::sync::{Arc, Mutex};
 use std::sync::atomic::{AtomicUsize, Ordering};
@@ -8,10 +9,12 @@ use regex::Regex;
 use reqwest::Client;
 use scraper::{Html, Selector};
 use tokio;
+use url::Url;
 #[derive(Debug, Clone)]
 struct CrawlResult {
     url: String,
+    html: String,
     title: Option<String>,
     text: Option<String>,
     links: Vec<String>,
@@ -53,6 +56,7 @@ async fn crawl_url(
         }
     };
+    let html_content = res.clone();
     let url_clone = url.clone();
     let pattern_clone = pattern_regex.clone();
@@ -60,21 +64,18 @@ async fn crawl_url(
     let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
         let document = Html::parse_document(&res);
-        // Title extraction
         let title_selector = Selector::parse("title").unwrap();
         let title = document
             .select(&title_selector)
             .next()
             .map(|e| e.inner_html());
-        // Body text extraction
         let body_selector = Selector::parse("body").unwrap();
         let text = document
             .select(&body_selector)
             .next()
             .map(|e| e.text().collect::<Vec<_>>().join(" "));
-        // Link extraction
         let link_selector = Selector::parse("a[href]").unwrap();
         let links: Vec<String> = document
             .select(&link_selector)
@@ -82,7 +83,6 @@ async fn crawl_url(
             .map(|s| s.to_string())
             .collect();
-        // Image extraction
         let img_selector = Selector::parse("img[src]").unwrap();
         let images: Vec<String> = document
             .select(&img_selector)
@@ -90,7 +90,6 @@ async fn crawl_url(
             .map(|s| s.to_string())
             .collect();
-        // Pattern matching
         let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
             regex
                 .captures_iter(text)
@@ -105,7 +104,6 @@ async fn crawl_url(
     .await
     .ok()?;
-    // Queue new links
     {
         let mut queue = to_visit.lock().unwrap();
         let mut new_links = 0;
@@ -122,6 +120,7 @@ async fn crawl_url(
     Some(CrawlResult {
         url: url_clone,
+        html: html_content,
         title,
         text,
         links,
@@ -130,11 +129,30 @@ async fn crawl_url(
     })
 }
+fn sanitize_filename(url: &str) -> String {
+    let parsed = Url::parse(url).unwrap();
+    let mut filename = parsed.path().replace('/', "__");
+    if filename.is_empty() || filename == "__" {
+        filename = "index".to_string();
+    }
+    filename = filename.replace(|c: char| !c.is_ascii_alphanumeric(), "_");
+    if filename.len() > 50 {
+        filename = filename[..50].to_string();
+    }
+    format!("{}.html", filename.trim_matches('_'))
+}
 #[tokio::main]
 async fn main() {
+    println!("[🚀] Starting web crawler...");
     let matches = App::new("Web Crawler")
         .version("1.0")
-        .about("Multi-threaded web crawler with pattern matching")
+        .about("Multi-threaded web crawler with pattern matching and website storage")
         .arg(
             Arg::with_name("url")
                 .help("Starting URL")
@@ -167,12 +185,19 @@ async fn main() {
     let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
     let client = Arc::new(Client::new());
+    let parsed_url = Url::parse(&start_url).expect("Invalid URL");
+    let domain = parsed_url.host_str().unwrap().replace('.', "_");
+    let output_dir = format!("./{}", domain);
+    std::fs::create_dir_all(&output_dir).expect("Failed to create output directory");
+    println!("[📂] Created output directory: {}", output_dir);
     let visited = Arc::new(Mutex::new(HashSet::new()));
     let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
     let active_count = Arc::new(AtomicUsize::new(0));
     let mut tasks = FuturesUnordered::new();
+    println!("[🔄] Initializing {} worker tasks...", concurrency);
     for _ in 0..concurrency {
         let client = client.clone();
         let visited = visited.clone();
@@ -189,6 +214,7 @@ async fn main() {
            };
            if let Some(url) = next_url {
+               println!("[📥] Processing: {}", url);
                if let Some(result) = crawl_url(
                    url,
                    client.clone(),
@@ -202,9 +228,12 @@ async fn crawl_url(
                    results.push(result);
                }
            } else {
-               if active_count.load(Ordering::SeqCst) == 0 {
+               let active = active_count.load(Ordering::SeqCst);
+               if active == 0 {
+                   println!("[🛑] Worker detected completion");
                    break;
                }
+               println!("[⏳] Worker waiting (active: {})...", active);
                tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
            }
        }
@@ -212,26 +241,32 @@ async fn main() {
        }));
    }
+   println!("[📊] Crawling in progress...");
    let mut all_results = vec![];
    while let Some(result_set) = tasks.next().await {
-       if let Ok(results) = result_set {
-           all_results.extend(results);
+       match result_set {
+           Ok(results) => {
+               println!("[✔️] Worker completed with {} results", results.len());
+               all_results.extend(results);
+           }
+           Err(e) => println!("[⚠️] Worker error: {}", e),
        }
    }
+   println!("\n[🏁] Crawling completed!");
+   println!("[📋] Total pages crawled: {}", all_results.len());
+   println!("[💾] Saving results to {}...", output_dir);
    for res in all_results {
-       println!("URL: {}", res.url);
-       if let Some(title) = res.title {
-           println!("Title: {}", title);
-       }
-       if let Some(text) = res.text {
-           println!("Text (truncated): {:.100}...", text);
-       }
-       println!("Links: {}", res.links.len());
-       println!("Images: {}", res.images.len());
-       if !res.pattern_matches.is_empty() {
-           println!("Pattern Matches: {:?}", res.pattern_matches);
-       }
-       println!("---");
+       let filename = sanitize_filename(&res.url);
+       let file_path = Path::new(&output_dir).join(filename);
+       tokio::fs::write(&file_path, &res.html)
+           .await
+           .unwrap_or_else(|_| panic!("Failed to write: {}", file_path.display()));
+       println!("[💾] Saved: {}", file_path.display());
    }
+   println!("[✅] Done! Website stored in: {}", output_dir);
 }
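For context, a minimal sketch of how the new sanitize_filename helper maps crawled URLs to output file names. It is an illustration only, not part of the commit; it assumes the helper above (and the url crate it uses) are in scope, and the example.com URLs are placeholders. The expected values follow from the code itself: a bare "/" path collapses to "index", path separators become "__", any other non-alphanumeric character becomes "_", and leading/trailing underscores are trimmed before ".html" is appended.

// Illustrative test of sanitize_filename's URL-to-filename mapping.
#[test]
fn sanitize_filename_examples() {
    // Root path: "/" becomes "__", which is special-cased to "index".
    assert_eq!(sanitize_filename("https://example.com/"), "index.html");
    // Path separators turn into "__"; ASCII letters and digits pass through.
    assert_eq!(sanitize_filename("https://example.com/about/team"), "about__team.html");
    // Any other non-alphanumeric character (here '-') is replaced by '_'.
    assert_eq!(sanitize_filename("https://example.com/blog/post-1"), "blog__post_1.html");
}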