Add progress logging and explicit fetch error handling to crawl_url
 src/main.rs | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)
@@ -27,9 +27,12 @@ async fn crawl_url(
     pattern_regex: Option<Regex>,
     active_count: Arc<AtomicUsize>,
 ) -> Option<CrawlResult> {
+    println!("[🌐] Attempting: {}", url);
+
     {
         let mut visited = visited.lock().unwrap();
         if visited.contains(&url) {
+            println!("[⏭️] Already visited: {}", url);
             return None;
         }
         visited.insert(url.clone());
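The visited-set check sits in its own block so the `std::sync::MutexGuard` is dropped before the function's next `.await`; holding a std mutex guard across an await point would make the future non-`Send`. A minimal sketch of that claim-before-crawl pattern, assuming `visited` is the `Arc<Mutex<HashSet<String>>>` the code implies (the helper name `claim_url` is illustrative, not part of the commit):

```rust
use std::collections::HashSet;
use std::sync::{Arc, Mutex};

/// Returns true if `url` was not seen before and has now been claimed.
/// The guard lives only for this call, so it is never held across an
/// `.await` point in the calling async fn.
fn claim_url(visited: &Arc<Mutex<HashSet<String>>>, url: &str) -> bool {
    let mut set = visited.lock().unwrap();
    // `HashSet::insert` returns false if the value was already present,
    // folding the contains/insert pair into a single call.
    set.insert(url.to_string())
}
```

In the committed code the `contains` check and the `insert` stay separate so the "already visited" message can be printed in between; the fold above is just the tighter variant.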
@@ -39,26 +42,39 @@ async fn crawl_url(
         return None;
     }
 
-    let res = client.get(&url).send().await.ok()?.text().await.ok()?;
+    let res = match client.get(&url).send().await {
+        Ok(response) => {
+            println!("[✅] Success: {}", url);
+            response.text().await.ok()?
+        }
+        Err(e) => {
+            println!("[❌] Failed: {} - {}", url, e);
+            return None;
+        }
+    };
 
     let url_clone = url.clone();
     let pattern_clone = pattern_regex.clone();
 
-    // Move heavy processing to blocking thread pool
+    println!("[🔍] Processing: {}", url);
     let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
         let document = Html::parse_document(&res);
+
+        // Title extraction
         let title_selector = Selector::parse("title").unwrap();
         let title = document
             .select(&title_selector)
             .next()
             .map(|e| e.inner_html());
 
+        // Body text extraction
         let body_selector = Selector::parse("body").unwrap();
         let text = document
             .select(&body_selector)
             .next()
             .map(|e| e.text().collect::<Vec<_>>().join(" "));
 
+        // Link extraction
         let link_selector = Selector::parse("a[href]").unwrap();
         let links: Vec<String> = document
             .select(&link_selector)
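The `match` replaces a silent `.ok()?` chain so failures get logged before bailing out. If the terser chained style is preferred, the same logging can ride on `Result::inspect`/`inspect_err` (stable since Rust 1.76); a hypothetical standalone helper, not the commit's code:

```rust
use reqwest::Client;

// Sketch of the same fetch-and-log step without the explicit match.
// `fetch_body` is an illustrative name, not part of the commit.
async fn fetch_body(client: &Client, url: &str) -> Option<String> {
    let response = client
        .get(url)
        .send()
        .await
        .inspect(|_| println!("[✅] Success: {}", url))
        .inspect_err(|e| println!("[❌] Failed: {} - {}", url, e))
        .ok()?;
    response.text().await.ok()
}
```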
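Parsing stays inside `spawn_blocking` because building and querying an HTML DOM is CPU-bound, and `scraper::Html` (at least with default tendril features) is not `Send`, so the document itself cannot cross an `.await`; only owned `String`s come back out of the closure. A self-contained sketch of that shape, with a hypothetical `extract_title` helper:

```rust
use scraper::{Html, Selector};

// The non-Send `Html` document lives and dies inside the blocking
// closure; only the owned Option<String> crosses the await boundary.
async fn extract_title(body: String) -> Option<String> {
    tokio::task::spawn_blocking(move || {
        let document = Html::parse_document(&body);
        let selector = Selector::parse("title").ok()?;
        document.select(&selector).next().map(|e| e.inner_html())
    })
    .await
    .ok()?
}
```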
@@ -66,6 +82,7 @@ async fn crawl_url(
             .map(|s| s.to_string())
             .collect();
 
+        // Image extraction
         let img_selector = Selector::parse("img[src]").unwrap();
         let images: Vec<String> = document
             .select(&img_selector)
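The hunk only shows the head and tail of the link/image pipelines; the `.map(|s| s.to_string())` context implies an attribute lookup in the elided middle. A standalone sketch of that pattern under that assumption (`image_sources` is an illustrative name, not the commit's code):

```rust
use scraper::{Html, Selector};

// Pull the `src` attribute from every `img[src]` element; elements
// matching the selector are guaranteed to carry the attribute, but
// `filter_map` keeps the pipeline total either way.
fn image_sources(html: &str) -> Vec<String> {
    let document = Html::parse_document(html);
    let selector = Selector::parse("img[src]").unwrap();
    document
        .select(&selector)
        .filter_map(|el| el.value().attr("src"))
        .map(|s| s.to_string())
        .collect()
}
```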
@@ -73,6 +90,7 @@ async fn crawl_url(
             .map(|s| s.to_string())
             .collect();
 
+        // Pattern matching
         let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
             regex
                 .captures_iter(text)
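Pattern matching runs the caller-supplied regex over the extracted body text, and only when both the regex and the text are present. A minimal sketch of the `captures_iter` collection step with illustrative names:

```rust
use regex::Regex;

// Collect every whole-match string (capture group 0) from the text.
fn find_matches(pattern: &Regex, text: &str) -> Vec<String> {
    pattern
        .captures_iter(text)
        .filter_map(|cap| cap.get(0).map(|m| m.as_str().to_string()))
        .collect()
}
```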
@@ -87,13 +105,17 @@ async fn crawl_url(
     .await
     .ok()?;
 
+    // Queue new links
     {
         let mut queue = to_visit.lock().unwrap();
+        let mut new_links = 0;
         for link in &links {
             if link.starts_with("http") {
                 queue.push_back(link.clone());
+                new_links += 1;
             }
         }
+        println!("[🔄] Discovered {} new links from {}", new_links, url);
     }
 
     active_count.fetch_sub(1, Ordering::SeqCst);
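The new counter only tallies links that pass the `starts_with("http")` filter, so the log reflects what was actually queued rather than everything scraped, and the `fetch_sub` on `active_count` marks this crawl as finished. The scattered additions, gathered into one standalone sketch (the helper name is illustrative):

```rust
use std::collections::VecDeque;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};

// Mirrors the diff's `to_visit` and `active_count` bindings: push the
// absolute links onto the shared queue, report the count, then drop
// this task from the in-flight counter so the main loop can detect
// quiescence.
fn enqueue_links(
    to_visit: &Arc<Mutex<VecDeque<String>>>,
    active_count: &Arc<AtomicUsize>,
    links: &[String],
    url: &str,
) {
    let mut queue = to_visit.lock().unwrap();
    let mut new_links = 0;
    for link in links {
        if link.starts_with("http") {
            queue.push_back(link.clone());
            new_links += 1;
        }
    }
    println!("[🔄] Discovered {} new links from {}", new_links, url);
    active_count.fetch_sub(1, Ordering::SeqCst);
}
```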