some more bullshit 2
src/main.rs | 81
@@ -1,4 +1,5 @@
use std::collections::{HashSet, VecDeque};
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicUsize, Ordering};

@@ -8,10 +9,12 @@ use regex::Regex;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio;
use url::Url;

#[derive(Debug, Clone)]
struct CrawlResult {
    url: String,
    html: String,
    title: Option<String>,
    text: Option<String>,
    links: Vec<String>,
@@ -53,6 +56,7 @@ async fn crawl_url(
        }
    };

    let html_content = res.clone();
    let url_clone = url.clone();
    let pattern_clone = pattern_regex.clone();

@@ -60,21 +64,18 @@ async fn crawl_url(
    let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
        let document = Html::parse_document(&res);

        // Title extraction
        let title_selector = Selector::parse("title").unwrap();
        let title = document
            .select(&title_selector)
            .next()
            .map(|e| e.inner_html());

        // Body text extraction
        let body_selector = Selector::parse("body").unwrap();
        let text = document
            .select(&body_selector)
            .next()
            .map(|e| e.text().collect::<Vec<_>>().join(" "));

        // Link extraction
        let link_selector = Selector::parse("a[href]").unwrap();
        let links: Vec<String> = document
            .select(&link_selector)
@@ -82,7 +83,6 @@ async fn crawl_url(
            .map(|s| s.to_string())
            .collect();

        // Image extraction
        let img_selector = Selector::parse("img[src]").unwrap();
        let images: Vec<String> = document
            .select(&img_selector)
@@ -90,7 +90,6 @@ async fn crawl_url(
            .map(|s| s.to_string())
            .collect();

        // Pattern matching
        let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
            regex
                .captures_iter(text)
@@ -105,7 +104,6 @@ async fn crawl_url(
    .await
    .ok()?;

    // Queue new links
    {
        let mut queue = to_visit.lock().unwrap();
        let mut new_links = 0;
@@ -122,6 +120,7 @@ async fn crawl_url(

    Some(CrawlResult {
        url: url_clone,
        html: html_content,
        title,
        text,
        links,
@@ -130,11 +129,30 @@ async fn crawl_url(
    })
}

fn sanitize_filename(url: &str) -> String {
    let parsed = Url::parse(url).unwrap();
    let mut filename = parsed.path().replace('/', "__");

    if filename.is_empty() || filename == "__" {
        filename = "index".to_string();
    }

    filename = filename.replace(|c: char| !c.is_ascii_alphanumeric(), "_");

    if filename.len() > 50 {
        filename = filename[..50].to_string();
    }

    format!("{}.html", filename.trim_matches('_'))
}

#[tokio::main]
async fn main() {
    println!("[🚀] Starting web crawler...");

    let matches = App::new("Web Crawler")
        .version("1.0")
        .about("Multi-threaded web crawler with pattern matching")
        .about("Multi-threaded web crawler with pattern matching and website storage")
        .arg(
            Arg::with_name("url")
                .help("Starting URL")
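Illustrative note (not part of the commit): with the sanitize_filename added above, hypothetical inputs map as follows.

    // Hypothetical examples; the example.com URLs are assumptions, not taken from the commit.
    assert_eq!(sanitize_filename("https://example.com/docs/intro"), "docs__intro.html");
    assert_eq!(sanitize_filename("https://example.com/"), "index.html");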
@@ -167,12 +185,19 @@ async fn main() {
    let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
    let client = Arc::new(Client::new());

    let parsed_url = Url::parse(&start_url).expect("Invalid URL");
    let domain = parsed_url.host_str().unwrap().replace('.', "_");
    let output_dir = format!("./{}", domain);
    std::fs::create_dir_all(&output_dir).expect("Failed to create output directory");
    println!("[📂] Created output directory: {}", output_dir);

    let visited = Arc::new(Mutex::new(HashSet::new()));
    let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
    let active_count = Arc::new(AtomicUsize::new(0));

    let mut tasks = FuturesUnordered::new();

    println!("[🔄] Initializing {} worker tasks...", concurrency);
    for _ in 0..concurrency {
        let client = client.clone();
        let visited = visited.clone();
@@ -189,6 +214,7 @@ async fn main() {
            };

            if let Some(url) = next_url {
                println!("[📥] Processing: {}", url);
                if let Some(result) = crawl_url(
                    url,
                    client.clone(),
@@ -202,9 +228,12 @@ async fn main() {
                    results.push(result);
                }
            } else {
                if active_count.load(Ordering::SeqCst) == 0 {
                let active = active_count.load(Ordering::SeqCst);
                if active == 0 {
                    println!("[🛑] Worker detected completion");
                    break;
                }
                println!("[⏳] Worker waiting (active: {})...", active);
                tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
            }
        }
@@ -212,26 +241,32 @@ async fn main() {
        }));
    }

    println!("[📊] Crawling in progress...");
    let mut all_results = vec![];
    while let Some(result_set) = tasks.next().await {
        if let Ok(results) = result_set {
            all_results.extend(results);
        match result_set {
            Ok(results) => {
                println!("[✔️] Worker completed with {} results", results.len());
                all_results.extend(results);
            }
            Err(e) => println!("[⚠️] Worker error: {}", e),
        }
    }

    println!("\n[🏁] Crawling completed!");
    println!("[📋] Total pages crawled: {}", all_results.len());
    println!("[💾] Saving results to {}...", output_dir);

    for res in all_results {
        println!("URL: {}", res.url);
        if let Some(title) = res.title {
            println!("Title: {}", title);
        }
        if let Some(text) = res.text {
            println!("Text (truncated): {:.100}...", text);
        }
        println!("Links: {}", res.links.len());
        println!("Images: {}", res.images.len());
        if !res.pattern_matches.is_empty() {
            println!("Pattern Matches: {:?}", res.pattern_matches);
        }
        println!("---");
        let filename = sanitize_filename(&res.url);
        let file_path = Path::new(&output_dir).join(filename);

        tokio::fs::write(&file_path, &res.html)
            .await
            .unwrap_or_else(|_| panic!("Failed to write: {}", file_path.display()));

        println!("[💾] Saved: {}", file_path.display());
    }

    println!("[✅] Done! Website stored in: {}", output_dir);
}
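For context, a minimal, self-contained sketch of the shared-queue / active-counter worker pattern that the waiting logic in the hunks above relies on. This is illustrative only and not part of the commit; it assumes tokio with the multi-thread runtime, macros, and time features, and uses a placeholder println! where the real crawler fetches and parses a page.

    use std::collections::VecDeque;
    use std::sync::{Arc, Mutex};
    use std::sync::atomic::{AtomicUsize, Ordering};

    #[tokio::main]
    async fn main() {
        // Shared work queue and a counter of workers currently mid-task.
        let queue = Arc::new(Mutex::new(VecDeque::from(["a".to_string(), "b".to_string()])));
        let active = Arc::new(AtomicUsize::new(0));

        let mut handles = Vec::new();
        for _ in 0..4 {
            let queue = queue.clone();
            let active = active.clone();
            handles.push(tokio::spawn(async move {
                loop {
                    // Pop one item while holding the lock, and bump the active counter
                    // before releasing it so other workers see us as busy.
                    let next = {
                        let mut q = queue.lock().unwrap();
                        let item = q.pop_front();
                        if item.is_some() {
                            active.fetch_add(1, Ordering::SeqCst);
                        }
                        item
                    };

                    if let Some(item) = next {
                        // Placeholder for the real work (fetch + parse in the crawler).
                        println!("processing {}", item);
                        active.fetch_sub(1, Ordering::SeqCst);
                    } else {
                        // Empty queue: exit only when no other worker is mid-task,
                        // otherwise back off briefly and re-check, as in the diff.
                        let active_now = active.load(Ordering::SeqCst);
                        if active_now == 0 {
                            break;
                        }
                        tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
                    }
                }
            }));
        }

        for h in handles {
            let _ = h.await;
        }
    }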