some more bullshit
src/main.rs (26 changed lines)
@@ -27,9 +27,12 @@ async fn crawl_url(
     pattern_regex: Option<Regex>,
     active_count: Arc<AtomicUsize>,
 ) -> Option<CrawlResult> {
+    println!("[🌐] Attempting: {}", url);
+
     {
         let mut visited = visited.lock().unwrap();
         if visited.contains(&url) {
+            println!("[⏭️] Already visited: {}", url);
             return None;
         }
         visited.insert(url.clone());
@@ -39,26 +42,39 @@ async fn crawl_url(
         return None;
     }
 
-    let res = client.get(&url).send().await.ok()?.text().await.ok()?;
+    let res = match client.get(&url).send().await {
+        Ok(response) => {
+            println!("[✅] Success: {}", url);
+            response.text().await.ok()?
+        }
+        Err(e) => {
+            println!("[❌] Failed: {} - {}", url, e);
+            return None;
+        }
+    };
 
+    let url_clone = url.clone();
+    let pattern_clone = pattern_regex.clone();
 
     // Move heavy processing to blocking thread pool
+    println!("[🔍] Processing: {}", url);
     let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
         let document = Html::parse_document(&res);
 
         // Title extraction
         let title_selector = Selector::parse("title").unwrap();
         let title = document
             .select(&title_selector)
             .next()
             .map(|e| e.inner_html());
 
         // Body text extraction
         let body_selector = Selector::parse("body").unwrap();
         let text = document
             .select(&body_selector)
             .next()
             .map(|e| e.text().collect::<Vec<_>>().join(" "));
 
         // Link extraction
         let link_selector = Selector::parse("a[href]").unwrap();
         let links: Vec<String> = document
             .select(&link_selector)
@@ -66,6 +82,7 @@ async fn crawl_url(
             .map(|s| s.to_string())
             .collect();
 
         // Image extraction
         let img_selector = Selector::parse("img[src]").unwrap();
         let images: Vec<String> = document
             .select(&img_selector)
@@ -73,6 +90,7 @@ async fn crawl_url(
             .map(|s| s.to_string())
             .collect();
 
         // Pattern matching
         let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
             regex
                 .captures_iter(text)
@@ -87,13 +105,17 @@ async fn crawl_url(
     .await
     .ok()?;
 
+    // Queue new links
     {
         let mut queue = to_visit.lock().unwrap();
+        let mut new_links = 0;
         for link in &links {
             if link.starts_with("http") {
                 queue.push_back(link.clone());
+                new_links += 1;
             }
         }
+        println!("[🔄] Discovered {} new links from {}", new_links, url);
     }
 
     active_count.fetch_sub(1, Ordering::SeqCst);
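
A note on the spawn_blocking call above: scraper's Html::parse_document and selector matching are synchronous, CPU-bound work, so moving them onto Tokio's blocking thread pool keeps the async worker threads free for network I/O. A minimal self-contained sketch of the same fetch-then-parse pattern; the fetch_title helper is illustrative, not part of this commit:

use scraper::{Html, Selector};

// Illustrative helper (not from this commit): fetch on the async
// runtime, then hand the HTML parse to the blocking thread pool.
async fn fetch_title(client: &reqwest::Client, url: &str) -> Option<String> {
    let body = client.get(url).send().await.ok()?.text().await.ok()?;
    tokio::task::spawn_blocking(move || {
        // The document is built and dropped inside the closure, so only
        // the Send-able String goes in and Option<String> comes out.
        let document = Html::parse_document(&body);
        let selector = Selector::parse("title").ok()?;
        document.select(&selector).next().map(|e| e.inner_html())
    })
    .await
    .ok()?
}

As in the commit, the trailing .await / .ok()? pair on the JoinHandle turns a panicked or cancelled blocking task into None instead of unwinding the caller.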
Reference in New Issue
Block a user
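
The hunks also lean on shared state declared outside this diff: visited is used with contains/insert (so a HashSet is a fair guess), to_visit with push_back (a VecDeque), and active_count is an AtomicUsize that crawl_url decrements on exit. A hedged sketch of the dispatching side, with the function name and concrete types inferred rather than taken from the commit:

use std::collections::VecDeque;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};

// Hypothetical dispatcher (not shown in this commit's hunks): pop the
// next URL and bump the counter that crawl_url later fetch_sub's.
fn next_url(
    to_visit: &Arc<Mutex<VecDeque<String>>>,
    active_count: &Arc<AtomicUsize>,
) -> Option<String> {
    // Lock is released at the end of this statement, before any await
    // the caller might do, mirroring the short block scopes in crawl_url.
    let url = to_visit.lock().unwrap().pop_front()?;
    active_count.fetch_add(1, Ordering::SeqCst);
    Some(url)
}

Keeping each lock scope this short is what lets the std Mutex coexist with async code here: the guard is never held across an .await point.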