initial
.gitignore  vendored  Normal file  (1 line added)
@@ -0,0 +1 @@
+/target
.idea/.gitignore  generated, vendored  Normal file  (8 lines added)
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
.idea/crawler.iml  generated  Normal file  (11 lines added)
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="EMPTY_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
+      <excludeFolder url="file://$MODULE_DIR$/target" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+</module>
.idea/inspectionProfiles/Project_Default.xml  generated  Normal file  (16 lines added)
@@ -0,0 +1,16 @@
+<component name="InspectionProjectProfileManager">
+  <profile version="1.0">
+    <option name="myName" value="Project Default" />
+    <inspection_tool class="NonAsciiCharacters" enabled="false" level="WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyBroadExceptionInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
+    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+      <option name="ignoredErrors">
+        <list>
+          <option value="N806" />
+          <option value="N803" />
+        </list>
+      </option>
+    </inspection_tool>
+    <inspection_tool class="PyTypeCheckerInspection" enabled="false" level="WARNING" enabled_by_default="false" />
+  </profile>
+</component>
.idea/material_theme_project_new.xml  generated  Normal file  (12 lines added)
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="MaterialThemeProjectNewConfig">
+    <option name="metadata">
+      <MTProjectMetadataState>
+        <option name="migrated" value="true" />
+        <option name="pristineConfig" value="false" />
+        <option name="userId" value="-482e1190:19649c22859:-7ffe" />
+      </MTProjectMetadataState>
+    </option>
+  </component>
+</project>
.idea/modules.xml  generated  Normal file  (8 lines added)
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/crawler.iml" filepath="$PROJECT_DIR$/.idea/crawler.iml" />
+    </modules>
+  </component>
+</project>
.idea/vcs.xml  generated  Normal file  (6 lines added)
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="" vcs="Git" />
+  </component>
+</project>
Cargo.lock  generated  Normal file  (2260 lines added)
File diff suppressed because it is too large.
Cargo.toml  Normal file  (19 lines added)
@@ -0,0 +1,19 @@
+[package]
+name = "crawler"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+tokio = { version = "1.28", features = ["full"] }
+reqwest = { version = "0.11", features = ["json"] }
+futures = "0.3"
+scraper = "0.17"
+structopt = "0.3"
+indicatif = "0.17"
+colored = "2.0"
+regex = "1.10"
+url = "2.4"
+serde = { version = "1.0", features = ["derive"] }
+serde_json = "1.0"
+csv = "1.2"
+clap = "2.34.0"
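
Note on the dependency list: src/main.rs below builds its CLI with clap 2.x's App/Arg builder, while structopt 0.3 is declared here but never imported in this commit. Purely as a hypothetical illustration (not part of the commit; the struct and field names are made up), the same three options could be written with structopt's derive form:

    use structopt::StructOpt;

    // Hypothetical equivalent of the clap builder in src/main.rs below.
    #[derive(StructOpt, Debug)]
    #[structopt(name = "Web Crawler", about = "Multi-threaded web crawler with pattern matching")]
    struct Opts {
        /// Starting URL
        url: String,
        /// Regex pattern to match in page text
        #[structopt(short = "p", long = "pattern")]
        pattern: Option<String>,
        /// Max concurrent requests
        #[structopt(short = "c", long = "concurrency", default_value = "20")]
        concurrency: usize,
    }

    fn main() {
        let opts = Opts::from_args();
        println!("{:?}", opts);
    }

Either form parses the same command line; the derive version simply trades the builder calls for field attributes.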
src/main.rs  Normal file  (215 lines added)
@@ -0,0 +1,215 @@
+use std::collections::{HashSet, VecDeque};
+use std::sync::{Arc, Mutex};
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use clap::{App, Arg};
+use futures::stream::{FuturesUnordered, StreamExt};
+use regex::Regex;
+use reqwest::Client;
+use scraper::{Html, Selector};
+use tokio;
+
+#[derive(Debug, Clone)]
+struct CrawlResult {
+    url: String,
+    title: Option<String>,
+    text: Option<String>,
+    links: Vec<String>,
+    images: Vec<String>,
+    pattern_matches: Vec<String>,
+}
+
+async fn crawl_url(
+    url: String,
+    client: Arc<Client>,
+    visited: Arc<Mutex<HashSet<String>>>,
+    to_visit: Arc<Mutex<VecDeque<String>>>,
+    pattern_regex: Option<Regex>,
+    active_count: Arc<AtomicUsize>,
+) -> Option<CrawlResult> {
+    if {
+        let mut visited = visited.lock().unwrap();
+        if visited.contains(&url) {
+            return None;
+        }
+        visited.insert(url.clone());
+        active_count.fetch_add(1, Ordering::SeqCst);
+        true
+    } == false {
+        return None;
+    }
+
+    let res = client.get(&url).send().await.ok()?.text().await.ok()?;
+    let url_clone = url.clone();
+    let pattern_clone = pattern_regex.clone();
+
+    // Move heavy processing to blocking thread pool
+    let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
+        let document = Html::parse_document(&res);
+
+        let title_selector = Selector::parse("title").unwrap();
+        let title = document
+            .select(&title_selector)
+            .next()
+            .map(|e| e.inner_html());
+
+        let body_selector = Selector::parse("body").unwrap();
+        let text = document
+            .select(&body_selector)
+            .next()
+            .map(|e| e.text().collect::<Vec<_>>().join(" "));
+
+        let link_selector = Selector::parse("a[href]").unwrap();
+        let links: Vec<String> = document
+            .select(&link_selector)
+            .filter_map(|e| e.value().attr("href"))
+            .map(|s| s.to_string())
+            .collect();
+
+        let img_selector = Selector::parse("img[src]").unwrap();
+        let images: Vec<String> = document
+            .select(&img_selector)
+            .filter_map(|e| e.value().attr("src"))
+            .map(|s| s.to_string())
+            .collect();
+
+        let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
+            regex
+                .captures_iter(text)
+                .filter_map(|cap| cap.get(0).map(|m| m.as_str().to_string()))
+                .collect()
+        } else {
+            vec![]
+        };
+
+        (title, text, links, images, pattern_matches)
+    })
+    .await
+    .ok()?;
+
+    {
+        let mut queue = to_visit.lock().unwrap();
+        for link in &links {
+            if link.starts_with("http") {
+                queue.push_back(link.clone());
+            }
+        }
+    }
+
+    active_count.fetch_sub(1, Ordering::SeqCst);
+
+    Some(CrawlResult {
+        url: url_clone,
+        title,
+        text,
+        links,
+        images,
+        pattern_matches,
+    })
+}
+
+#[tokio::main]
+async fn main() {
+    let matches = App::new("Web Crawler")
+        .version("1.0")
+        .about("Multi-threaded web crawler with pattern matching")
+        .arg(
+            Arg::with_name("url")
+                .help("Starting URL")
+                .required(true),
+        )
+        .arg(
+            Arg::with_name("pattern")
+                .short("p")
+                .long("pattern")
+                .help("Regex pattern to match in page text")
+                .takes_value(true),
+        )
+        .arg(
+            Arg::with_name("concurrency")
+                .short("c")
+                .long("concurrency")
+                .help("Max concurrent requests")
+                .takes_value(true),
+        )
+        .get_matches();
+
+    let start_url = matches.value_of("url").unwrap().to_string();
+    let pattern = matches.value_of("pattern");
+    let concurrency: usize = matches
+        .value_of("concurrency")
+        .unwrap_or("20")
+        .parse()
+        .unwrap_or(20);
+
+    let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
+    let client = Arc::new(Client::new());
+
+    let visited = Arc::new(Mutex::new(HashSet::new()));
+    let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
+    let active_count = Arc::new(AtomicUsize::new(0));
+
+    let mut tasks = FuturesUnordered::new();
+
+    for _ in 0..concurrency {
+        let client = client.clone();
+        let visited = visited.clone();
+        let to_visit = to_visit.clone();
+        let pattern_regex = pattern_regex.clone();
+        let active_count = active_count.clone();
+
+        tasks.push(tokio::spawn(async move {
+            let mut results = vec![];
+            loop {
+                let next_url = {
+                    let mut queue = to_visit.lock().unwrap();
+                    queue.pop_front()
+                };
+
+                if let Some(url) = next_url {
+                    if let Some(result) = crawl_url(
+                        url,
+                        client.clone(),
+                        visited.clone(),
+                        to_visit.clone(),
+                        pattern_regex.clone(),
+                        active_count.clone(),
+                    )
+                    .await
+                    {
+                        results.push(result);
+                    }
+                } else {
+                    if active_count.load(Ordering::SeqCst) == 0 {
+                        break;
+                    }
+                    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
+                }
+            }
+            results
+        }));
+    }
+
+    let mut all_results = vec![];
+    while let Some(result_set) = tasks.next().await {
+        if let Ok(results) = result_set {
+            all_results.extend(results);
+        }
+    }
+
+    for res in all_results {
+        println!("URL: {}", res.url);
+        if let Some(title) = res.title {
+            println!("Title: {}", title);
+        }
+        if let Some(text) = res.text {
+            println!("Text (truncated): {:.100}...", text);
+        }
+        println!("Links: {}", res.links.len());
+        println!("Images: {}", res.images.len());
+        if !res.pattern_matches.is_empty() {
+            println!("Pattern Matches: {:?}", res.pattern_matches);
+        }
+        println!("---");
+    }
+}
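
Note on the link handling above: crawl_url only queues hrefs that already start with "http", so relative links are silently dropped, and the url = "2.4" crate listed in Cargo.toml is never used. A minimal, hypothetical sketch (not part of the commit; the helper name and placement are illustrative) of resolving hrefs against the page they came from with that crate:

    use url::Url;

    // Hypothetical helper: resolve an href (absolute or relative) against the URL
    // of the page it was found on, keeping only http/https results.
    fn resolve_link(page_url: &str, href: &str) -> Option<String> {
        let base = Url::parse(page_url).ok()?;
        let joined = base.join(href).ok()?;
        match joined.scheme() {
            "http" | "https" => Some(joined.to_string()),
            _ => None,
        }
    }

    fn main() {
        // Example: a relative href found on https://example.com/docs/
        assert_eq!(
            resolve_link("https://example.com/docs/", "guide.html").as_deref(),
            Some("https://example.com/docs/guide.html")
        );
    }

Url::join accepts absolute hrefs as well, so every collected link could be passed through such a helper before being pushed onto to_visit.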