some more bullshit 2
src/main.rs (79 changed lines)
@@ -1,4 +1,5 @@
 use std::collections::{HashSet, VecDeque};
+use std::path::Path;
 use std::sync::{Arc, Mutex};
 use std::sync::atomic::{AtomicUsize, Ordering};
 
@@ -8,10 +9,12 @@ use regex::Regex;
 use reqwest::Client;
 use scraper::{Html, Selector};
 use tokio;
+use url::Url;
 
 #[derive(Debug, Clone)]
 struct CrawlResult {
     url: String,
+    html: String,
     title: Option<String>,
     text: Option<String>,
     links: Vec<String>,
@@ -53,6 +56,7 @@ async fn crawl_url(
         }
     };
 
+    let html_content = res.clone();
     let url_clone = url.clone();
     let pattern_clone = pattern_regex.clone();
 
@@ -60,21 +64,18 @@ async fn crawl_url(
     let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
         let document = Html::parse_document(&res);
 
-        // Title extraction
         let title_selector = Selector::parse("title").unwrap();
         let title = document
             .select(&title_selector)
             .next()
             .map(|e| e.inner_html());
 
-        // Body text extraction
         let body_selector = Selector::parse("body").unwrap();
         let text = document
             .select(&body_selector)
             .next()
             .map(|e| e.text().collect::<Vec<_>>().join(" "));
 
-        // Link extraction
         let link_selector = Selector::parse("a[href]").unwrap();
         let links: Vec<String> = document
             .select(&link_selector)
@@ -82,7 +83,6 @@ async fn crawl_url(
             .map(|s| s.to_string())
             .collect();
 
-        // Image extraction
         let img_selector = Selector::parse("img[src]").unwrap();
         let images: Vec<String> = document
             .select(&img_selector)
@@ -90,7 +90,6 @@ async fn crawl_url(
             .map(|s| s.to_string())
             .collect();
 
-        // Pattern matching
         let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
             regex
                 .captures_iter(text)
@@ -105,7 +104,6 @@ async fn crawl_url(
     .await
     .ok()?;
 
-    // Queue new links
     {
         let mut queue = to_visit.lock().unwrap();
         let mut new_links = 0;
@@ -122,6 +120,7 @@ async fn crawl_url(
 
     Some(CrawlResult {
         url: url_clone,
+        html: html_content,
        title,
         text,
         links,
@@ -130,11 +129,30 @@ async fn crawl_url(
     })
 }
 
+fn sanitize_filename(url: &str) -> String {
+    let parsed = Url::parse(url).unwrap();
+    let mut filename = parsed.path().replace('/', "__");
+
+    if filename.is_empty() || filename == "__" {
+        filename = "index".to_string();
+    }
+
+    filename = filename.replace(|c: char| !c.is_ascii_alphanumeric(), "_");
+
+    if filename.len() > 50 {
+        filename = filename[..50].to_string();
+    }
+
+    format!("{}.html", filename.trim_matches('_'))
+}
+
 #[tokio::main]
 async fn main() {
+    println!("[🚀] Starting web crawler...");
+
     let matches = App::new("Web Crawler")
         .version("1.0")
-        .about("Multi-threaded web crawler with pattern matching")
+        .about("Multi-threaded web crawler with pattern matching and website storage")
         .arg(
             Arg::with_name("url")
                 .help("Starting URL")
@@ -167,12 +185,19 @@ async fn main() {
     let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
     let client = Arc::new(Client::new());
 
+    let parsed_url = Url::parse(&start_url).expect("Invalid URL");
+    let domain = parsed_url.host_str().unwrap().replace('.', "_");
+    let output_dir = format!("./{}", domain);
+    std::fs::create_dir_all(&output_dir).expect("Failed to create output directory");
+    println!("[📂] Created output directory: {}", output_dir);
+
     let visited = Arc::new(Mutex::new(HashSet::new()));
     let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
     let active_count = Arc::new(AtomicUsize::new(0));
 
     let mut tasks = FuturesUnordered::new();
 
+    println!("[🔄] Initializing {} worker tasks...", concurrency);
     for _ in 0..concurrency {
         let client = client.clone();
         let visited = visited.clone();
@@ -189,6 +214,7 @@ async fn main() {
                 };
 
                 if let Some(url) = next_url {
+                    println!("[📥] Processing: {}", url);
                     if let Some(result) = crawl_url(
                         url,
                         client.clone(),
@@ -202,9 +228,12 @@ async fn main() {
                         results.push(result);
                     }
                 } else {
-                    if active_count.load(Ordering::SeqCst) == 0 {
+                    let active = active_count.load(Ordering::SeqCst);
+                    if active == 0 {
+                        println!("[🛑] Worker detected completion");
                         break;
                     }
+                    println!("[⏳] Worker waiting (active: {})...", active);
                     tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
                 }
             }
@@ -212,26 +241,32 @@ async fn main() {
         }));
     }
 
+    println!("[📊] Crawling in progress...");
     let mut all_results = vec![];
     while let Some(result_set) = tasks.next().await {
-        if let Ok(results) = result_set {
+        match result_set {
+            Ok(results) => {
+                println!("[✔️] Worker completed with {} results", results.len());
             all_results.extend(results);
         }
+            Err(e) => println!("[⚠️] Worker error: {}", e),
+        }
     }
 
+    println!("\n[🏁] Crawling completed!");
+    println!("[📋] Total pages crawled: {}", all_results.len());
+    println!("[💾] Saving results to {}...", output_dir);
+
     for res in all_results {
-        println!("URL: {}", res.url);
-        if let Some(title) = res.title {
-            println!("Title: {}", title);
-        }
-        if let Some(text) = res.text {
-            println!("Text (truncated): {:.100}...", text);
-        }
-        println!("Links: {}", res.links.len());
-        println!("Images: {}", res.images.len());
-        if !res.pattern_matches.is_empty() {
-            println!("Pattern Matches: {:?}", res.pattern_matches);
-        }
-        println!("---");
+        let filename = sanitize_filename(&res.url);
+        let file_path = Path::new(&output_dir).join(filename);
+
+        tokio::fs::write(&file_path, &res.html)
+            .await
+            .unwrap_or_else(|_| panic!("Failed to write: {}", file_path.display()));
+
+        println!("[💾] Saved: {}", file_path.display());
     }
+
+    println!("[✅] Done! Website stored in: {}", output_dir);
 }
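
As a quick reference (not part of the commit), here is a minimal standalone sketch of what the new sanitize_filename helper would produce for a few example URLs. The function body is copied exactly from the diff above; the main wrapper and the example URLs are made up purely for illustration.

// Standalone sketch: exercises sanitize_filename exactly as added in this commit.
// Requires the `url` crate, which the crawler already depends on.
use url::Url;

fn sanitize_filename(url: &str) -> String {
    let parsed = Url::parse(url).unwrap();
    let mut filename = parsed.path().replace('/', "__");

    if filename.is_empty() || filename == "__" {
        filename = "index".to_string();
    }

    filename = filename.replace(|c: char| !c.is_ascii_alphanumeric(), "_");

    if filename.len() > 50 {
        filename = filename[..50].to_string();
    }

    format!("{}.html", filename.trim_matches('_'))
}

fn main() {
    // Hypothetical inputs, chosen only to illustrate the URL-to-filename mapping.
    for url in [
        "https://example.com/",
        "https://example.com/blog/post-1",
        "https://example.com/a/very/deep/path/with-many-segments/and-a-long-final-component.html",
    ] {
        println!("{} -> {}", url, sanitize_filename(url));
    }
}

Run as-is, this should print index.html for the root URL and blog__post_1.html for the second, while the long third path is truncated to 50 characters before the .html suffix is appended. Note that only the URL path feeds the name, so two URLs that differ only in their query string would be written to the same file.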