rattatwinko
2025-05-02 17:33:21 +02:00
commit 1bfa72db56
10 changed files with 2556 additions and 0 deletions

1
.gitignore vendored Normal file

@@ -0,0 +1 @@
/target

8
.idea/.gitignore generated vendored Normal file

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

11
.idea/crawler.iml generated Normal file

@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="EMPTY_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
      <excludeFolder url="file://$MODULE_DIR$/target" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

16
.idea/inspectionProfiles/Project_Default.xml generated Normal file

@@ -0,0 +1,16 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="NonAsciiCharacters" enabled="false" level="WARNING" enabled_by_default="false" />
    <inspection_tool class="PyBroadExceptionInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N806" />
          <option value="N803" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyTypeCheckerInspection" enabled="false" level="WARNING" enabled_by_default="false" />
  </profile>
</component>

12
.idea/material_theme_project_new.xml generated Normal file

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="MaterialThemeProjectNewConfig">
    <option name="metadata">
      <MTProjectMetadataState>
        <option name="migrated" value="true" />
        <option name="pristineConfig" value="false" />
        <option name="userId" value="-482e1190:19649c22859:-7ffe" />
      </MTProjectMetadataState>
    </option>
  </component>
</project>

8
.idea/modules.xml generated Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/crawler.iml" filepath="$PROJECT_DIR$/.idea/crawler.iml" />
    </modules>
  </component>
</project>

6
.idea/vcs.xml generated Normal file

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
</project>

2260
Cargo.lock generated Normal file

File diff suppressed because it is too large

19
Cargo.toml Normal file

@@ -0,0 +1,19 @@
[package]
name = "crawler"
version = "0.1.0"
edition = "2024"

[dependencies]
tokio = { version = "1.28", features = ["full"] }
reqwest = { version = "0.11", features = ["json"] }
futures = "0.3"
scraper = "0.17"
structopt = "0.3"
indicatif = "0.17"
colored = "2.0"
regex = "1.10"
url = "2.4"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
csv = "1.2"
clap = "2.34.0"
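
src/main.rs below constructs a bare Client::new(). A minimal sketch of how the same reqwest 0.11 client could instead be given a request timeout and a user agent, so a slow host cannot stall a worker indefinitely; the 10-second timeout and the user-agent string are illustrative assumptions, not part of this commit:

use std::sync::Arc;
use std::time::Duration;
use reqwest::Client;

// Sketch only: a Client with explicit limits instead of the bare Client::new() used in main().
// The timeout and user-agent values are placeholders.
fn build_client() -> Arc<Client> {
    let client = Client::builder()
        .timeout(Duration::from_secs(10))
        .user_agent("crawler/0.1")
        .build()
        .expect("static client configuration should not fail to build");
    Arc::new(client)
}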

215
src/main.rs Normal file

@@ -0,0 +1,215 @@
use std::collections::{HashSet, VecDeque};
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicUsize, Ordering};
use clap::{App, Arg};
use futures::stream::{FuturesUnordered, StreamExt};
use regex::Regex;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio;
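
/// Everything extracted from one fetched page: its title, visible text, outgoing links,
/// image sources, and any regex matches found in the text.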
#[derive(Debug, Clone)]
struct CrawlResult {
    url: String,
    title: Option<String>,
    text: Option<String>,
    links: Vec<String>,
    images: Vec<String>,
    pattern_matches: Vec<String>,
}

async fn crawl_url(
    url: String,
    client: Arc<Client>,
    visited: Arc<Mutex<HashSet<String>>>,
    to_visit: Arc<Mutex<VecDeque<String>>>,
    pattern_regex: Option<Regex>,
    active_count: Arc<AtomicUsize>,
) -> Option<CrawlResult> {
    // Skip URLs that were already visited; otherwise record this one and mark the worker as busy.
    {
        let mut visited = visited.lock().unwrap();
        if visited.contains(&url) {
            return None;
        }
        visited.insert(url.clone());
        active_count.fetch_add(1, Ordering::SeqCst);
    }

    // Fetch the page body; on failure, release the busy slot before bailing out so idle
    // workers can still detect completion.
    let res = match client.get(&url).send().await {
        Ok(resp) => resp.text().await.ok(),
        Err(_) => None,
    };
    let Some(res) = res else {
        active_count.fetch_sub(1, Ordering::SeqCst);
        return None;
    };
    let url_clone = url.clone();
    let pattern_clone = pattern_regex.clone();

    // Move heavy processing to blocking thread pool
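    // (scraper's Html is not Send, and parsing large pages is CPU-bound, so all of it stays inside this closure.)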
    let parsed = tokio::task::spawn_blocking(move || {
        let document = Html::parse_document(&res);

        let title_selector = Selector::parse("title").unwrap();
        let title = document
            .select(&title_selector)
            .next()
            .map(|e| e.inner_html());

        let body_selector = Selector::parse("body").unwrap();
        let text = document
            .select(&body_selector)
            .next()
            .map(|e| e.text().collect::<Vec<_>>().join(" "));

        let link_selector = Selector::parse("a[href]").unwrap();
        let links: Vec<String> = document
            .select(&link_selector)
            .filter_map(|e| e.value().attr("href"))
            .map(|s| s.to_string())
            .collect();

        let img_selector = Selector::parse("img[src]").unwrap();
        let images: Vec<String> = document
            .select(&img_selector)
            .filter_map(|e| e.value().attr("src"))
            .map(|s| s.to_string())
            .collect();

        let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
            regex
                .captures_iter(text)
                .filter_map(|cap| cap.get(0).map(|m| m.as_str().to_string()))
                .collect()
        } else {
            vec![]
        };

        (title, text, links, images, pattern_matches)
    })
    .await;

    // A panicked parse task must not leave the busy count permanently raised, or idle workers would never exit.
    let Some((title, text, links, images, pattern_matches)) = parsed.ok() else {
        active_count.fetch_sub(1, Ordering::SeqCst);
        return None;
    };

    {
        let mut queue = to_visit.lock().unwrap();
        for link in &links {
            if link.starts_with("http") {
                queue.push_back(link.clone());
            }
        }
    }
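    // Everything from this page is parsed and its links are queued, so release the busy slot.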
    active_count.fetch_sub(1, Ordering::SeqCst);

    Some(CrawlResult {
        url: url_clone,
        title,
        text,
        links,
        images,
        pattern_matches,
    })

#[tokio::main]
async fn main() {
    let matches = App::new("Web Crawler")
        .version("1.0")
        .about("Multi-threaded web crawler with pattern matching")
        .arg(
            Arg::with_name("url")
                .help("Starting URL")
                .required(true),
        )
        .arg(
            Arg::with_name("pattern")
                .short("p")
                .long("pattern")
                .help("Regex pattern to match in page text")
                .takes_value(true),
        )
        .arg(
            Arg::with_name("concurrency")
                .short("c")
                .long("concurrency")
                .help("Max concurrent requests")
                .takes_value(true),
        )
        .get_matches();

    let start_url = matches.value_of("url").unwrap().to_string();
    let pattern = matches.value_of("pattern");
    let concurrency: usize = matches
        .value_of("concurrency")
        .unwrap_or("20")
        .parse()
        .unwrap_or(20);

    let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
    let client = Arc::new(Client::new());
    let visited = Arc::new(Mutex::new(HashSet::new()));
    let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
    let active_count = Arc::new(AtomicUsize::new(0));

    let mut tasks = FuturesUnordered::new();
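
    // Each of the `concurrency` workers drains the shared queue; a worker only exits once the
    // queue is empty and no peer is still mid-crawl (active_count == 0).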
    for _ in 0..concurrency {
        let client = client.clone();
        let visited = visited.clone();
        let to_visit = to_visit.clone();
        let pattern_regex = pattern_regex.clone();
        let active_count = active_count.clone();

        tasks.push(tokio::spawn(async move {
            let mut results = vec![];
            loop {
                let next_url = {
                    let mut queue = to_visit.lock().unwrap();
                    queue.pop_front()
                };

                if let Some(url) = next_url {
                    if let Some(result) = crawl_url(
                        url,
                        client.clone(),
                        visited.clone(),
                        to_visit.clone(),
                        pattern_regex.clone(),
                        active_count.clone(),
                    )
                    .await
                    {
                        results.push(result);
                    }
                } else {
                    if active_count.load(Ordering::SeqCst) == 0 {
                        break;
                    }
                    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
                }
            }
            results
        }));
    }

    let mut all_results = vec![];
    while let Some(result_set) = tasks.next().await {
        if let Ok(results) = result_set {
            all_results.extend(results);
        }
    }

    for res in all_results {
        println!("URL: {}", res.url);
        if let Some(title) = res.title {
            println!("Title: {}", title);
        }
        if let Some(text) = res.text {
            println!("Text (truncated): {:.100}...", text);
        }
        println!("Links: {}", res.links.len());
        println!("Images: {}", res.images.len());
        if !res.pattern_matches.is_empty() {
            println!("Pattern Matches: {:?}", res.pattern_matches);
        }
        println!("---");
    }
}
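
The crawler above only queues hrefs that already start with "http", so relative links are dropped even though the url crate is declared in Cargo.toml. A minimal sketch of how such links could be resolved against the page URL before queueing; the absolutize helper is hypothetical and not part of this commit:

use url::Url;

// Hypothetical helper: resolve an href (absolute or relative) against the page it came from.
// Returns None when the base URL or the joined result cannot be parsed.
fn absolutize(page_url: &str, href: &str) -> Option<String> {
    let base = Url::parse(page_url).ok()?;
    base.join(href).ok().map(|joined| joined.to_string())
}

With the clap definition in main(), an invocation might look like: crawler https://example.com --pattern "\d{4}" --concurrency 10 (the URL and pattern here are placeholders; only the positional URL argument is required).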