some more bullshit 2

rattatwinko
2025-05-02 17:53:24 +02:00
parent 28bbbb6e7d
commit 995741ecad


@@ -1,4 +1,5 @@
use std::collections::{HashSet, VecDeque};
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicUsize, Ordering};
@@ -8,10 +9,12 @@ use regex::Regex;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio;
use url::Url;
#[derive(Debug, Clone)]
struct CrawlResult {
    url: String,
    html: String,
    title: Option<String>,
    text: Option<String>,
    links: Vec<String>,
@@ -53,6 +56,7 @@ async fn crawl_url(
        }
    };
    let html_content = res.clone();
    let url_clone = url.clone();
    let pattern_clone = pattern_regex.clone();
@@ -60,21 +64,18 @@ async fn crawl_url(
    let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
        let document = Html::parse_document(&res);
        // Title extraction
        let title_selector = Selector::parse("title").unwrap();
        let title = document
            .select(&title_selector)
            .next()
            .map(|e| e.inner_html());
        // Body text extraction
        let body_selector = Selector::parse("body").unwrap();
        let text = document
            .select(&body_selector)
            .next()
            .map(|e| e.text().collect::<Vec<_>>().join(" "));
        // Link extraction
        let link_selector = Selector::parse("a[href]").unwrap();
        let links: Vec<String> = document
            .select(&link_selector)
@@ -82,7 +83,6 @@ async fn crawl_url(
            .map(|s| s.to_string())
            .collect();
        // Image extraction
        let img_selector = Selector::parse("img[src]").unwrap();
        let images: Vec<String> = document
            .select(&img_selector)
@@ -90,7 +90,6 @@ async fn crawl_url(
            .map(|s| s.to_string())
            .collect();
        // Pattern matching
        let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
            regex
                .captures_iter(text)
@@ -105,7 +104,6 @@ async fn crawl_url(
    .await
    .ok()?;
    // Queue new links
    {
        let mut queue = to_visit.lock().unwrap();
        let mut new_links = 0;
@@ -122,6 +120,7 @@ async fn crawl_url(
    Some(CrawlResult {
        url: url_clone,
        html: html_content,
        title,
        text,
        links,
@@ -130,11 +129,30 @@ async fn crawl_url(
    })
}

fn sanitize_filename(url: &str) -> String {
    let parsed = Url::parse(url).unwrap();
    let mut filename = parsed.path().replace('/', "__");
    if filename.is_empty() || filename == "__" {
        filename = "index".to_string();
    }
    filename = filename.replace(|c: char| !c.is_ascii_alphanumeric(), "_");
    if filename.len() > 50 {
        filename = filename[..50].to_string();
    }
    format!("{}.html", filename.trim_matches('_'))
}

#[tokio::main]
async fn main() {
    println!("[🚀] Starting web crawler...");
    let matches = App::new("Web Crawler")
        .version("1.0")
        .about("Multi-threaded web crawler with pattern matching")
        .about("Multi-threaded web crawler with pattern matching and website storage")
        .arg(
            Arg::with_name("url")
                .help("Starting URL")
@@ -167,12 +185,19 @@ async fn main() {
    let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
    let client = Arc::new(Client::new());
    let parsed_url = Url::parse(&start_url).expect("Invalid URL");
    let domain = parsed_url.host_str().unwrap().replace('.', "_");
    let output_dir = format!("./{}", domain);
    std::fs::create_dir_all(&output_dir).expect("Failed to create output directory");
    println!("[📂] Created output directory: {}", output_dir);
    let visited = Arc::new(Mutex::new(HashSet::new()));
    let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
    let active_count = Arc::new(AtomicUsize::new(0));
    let mut tasks = FuturesUnordered::new();
    println!("[🔄] Initializing {} worker tasks...", concurrency);
    for _ in 0..concurrency {
        let client = client.clone();
        let visited = visited.clone();
@@ -189,6 +214,7 @@ async fn main() {
                };
                if let Some(url) = next_url {
                    println!("[📥] Processing: {}", url);
                    if let Some(result) = crawl_url(
                        url,
                        client.clone(),
@@ -202,9 +228,12 @@ async fn main() {
                        results.push(result);
                    }
                } else {
                    if active_count.load(Ordering::SeqCst) == 0 {
                    let active = active_count.load(Ordering::SeqCst);
                    if active == 0 {
                        println!("[🛑] Worker detected completion");
                        break;
                    }
                    println!("[⏳] Worker waiting (active: {})...", active);
                    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
                }
            }
@@ -212,26 +241,32 @@ async fn main() {
        }));
    }

    println!("[📊] Crawling in progress...");
    let mut all_results = vec![];
    while let Some(result_set) = tasks.next().await {
        if let Ok(results) = result_set {
            all_results.extend(results);
        match result_set {
            Ok(results) => {
                println!("[✔️] Worker completed with {} results", results.len());
                all_results.extend(results);
            }
            Err(e) => println!("[⚠️] Worker error: {}", e),
        }
    }

    println!("\n[🏁] Crawling completed!");
    println!("[📋] Total pages crawled: {}", all_results.len());
    println!("[💾] Saving results to {}...", output_dir);
    for res in all_results {
        println!("URL: {}", res.url);
        if let Some(title) = res.title {
            println!("Title: {}", title);
        }
        if let Some(text) = res.text {
            println!("Text (truncated): {:.100}...", text);
        }
        println!("Links: {}", res.links.len());
        println!("Images: {}", res.images.len());
        if !res.pattern_matches.is_empty() {
            println!("Pattern Matches: {:?}", res.pattern_matches);
        }
        println!("---");
        let filename = sanitize_filename(&res.url);
        let file_path = Path::new(&output_dir).join(filename);
        tokio::fs::write(&file_path, &res.html)
            .await
            .unwrap_or_else(|_| panic!("Failed to write: {}", file_path.display()));
        println!("[💾] Saved: {}", file_path.display());
    }
    println!("[✅] Done! Website stored in: {}", output_dir);
}
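
For reference, a small test sketch of how the new sanitize_filename helper maps URLs to output filenames, based only on the implementation shown in this diff. It is not part of the commit: the example URLs are made up, and the test assumes it sits in the same module as the helper (hence use super::*).

// Hypothetical spot-check of sanitize_filename, assuming the implementation above:
// "/" (or an empty path) falls back to "index", each '/' becomes "__", any other
// non-alphanumeric ASCII becomes '_', the name is capped at 50 chars, and
// leading/trailing underscores are trimmed before ".html" is appended.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sanitize_filename_flattens_url_paths() {
        assert_eq!(sanitize_filename("https://example.com/"), "index.html");
        assert_eq!(sanitize_filename("https://example.com/about/team"), "about__team.html");
        assert_eq!(sanitize_filename("https://example.com/blog/post-1"), "blog__post_1.html");
    }
}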