rattatwinko
2025-05-02 17:33:21 +02:00
commit 1bfa72db56
10 changed files with 2556 additions and 0 deletions

1
.gitignore vendored Normal file

@@ -0,0 +1 @@
/target

8
.idea/.gitignore generated vendored Normal file

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

11
.idea/crawler.iml generated Normal file

@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="EMPTY_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
      <excludeFolder url="file://$MODULE_DIR$/target" />
    </content>
    <orderEntry type="inheritedJdk" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

16
.idea/inspectionProfiles/Project_Default.xml generated Normal file

@@ -0,0 +1,16 @@
<component name="InspectionProjectProfileManager">
  <profile version="1.0">
    <option name="myName" value="Project Default" />
    <inspection_tool class="NonAsciiCharacters" enabled="false" level="WARNING" enabled_by_default="false" />
    <inspection_tool class="PyBroadExceptionInspection" enabled="false" level="WEAK WARNING" enabled_by_default="false" />
    <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
      <option name="ignoredErrors">
        <list>
          <option value="N806" />
          <option value="N803" />
        </list>
      </option>
    </inspection_tool>
    <inspection_tool class="PyTypeCheckerInspection" enabled="false" level="WARNING" enabled_by_default="false" />
  </profile>
</component>

12
.idea/material_theme_project_new.xml generated Normal file

@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="MaterialThemeProjectNewConfig">
    <option name="metadata">
      <MTProjectMetadataState>
        <option name="migrated" value="true" />
        <option name="pristineConfig" value="false" />
        <option name="userId" value="-482e1190:19649c22859:-7ffe" />
      </MTProjectMetadataState>
    </option>
  </component>
</project>

8
.idea/modules.xml generated Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/crawler.iml" filepath="$PROJECT_DIR$/.idea/crawler.iml" />
    </modules>
  </component>
</project>

6
.idea/vcs.xml generated Normal file

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="" vcs="Git" />
  </component>
</project>

2260
Cargo.lock generated Normal file

File diff suppressed because it is too large

19
Cargo.toml Normal file

@@ -0,0 +1,19 @@
[package]
name = "crawler"
version = "0.1.0"
edition = "2024"

[dependencies]
tokio = { version = "1.28", features = ["full"] }
reqwest = { version = "0.11", features = ["json"] }
futures = "0.3"
scraper = "0.17"
structopt = "0.3"
indicatif = "0.17"
colored = "2.0"
regex = "1.10"
url = "2.4"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
csv = "1.2"
clap = "2.34.0"
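
src/main.rs below constructs a bare Client::new(). A minimal sketch of how the same reqwest 0.11 client could instead be given a request timeout and a user agent, so a slow host cannot stall a worker indefinitely; the 10-second timeout and the user-agent string are illustrative assumptions, not part of this commit:

use std::sync::Arc;
use std::time::Duration;
use reqwest::Client;

// Sketch only: a Client with explicit limits instead of the bare Client::new() used in main().
// The timeout and user-agent values are placeholders.
fn build_client() -> Arc<Client> {
    let client = Client::builder()
        .timeout(Duration::from_secs(10))
        .user_agent("crawler/0.1")
        .build()
        .expect("static client configuration should not fail to build");
    Arc::new(client)
}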

215
src/main.rs Normal file

@@ -0,0 +1,215 @@
use std::collections::{HashSet, VecDeque};
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicUsize, Ordering};
use clap::{App, Arg};
use futures::stream::{FuturesUnordered, StreamExt};
use regex::Regex;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio;
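
/// Everything extracted from one fetched page: its title, visible text, outgoing links,
/// image sources, and any regex matches found in the text.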
#[derive(Debug, Clone)]
struct CrawlResult {
    url: String,
    title: Option<String>,
    text: Option<String>,
    links: Vec<String>,
    images: Vec<String>,
    pattern_matches: Vec<String>,
}

async fn crawl_url(
    url: String,
    client: Arc<Client>,
    visited: Arc<Mutex<HashSet<String>>>,
    to_visit: Arc<Mutex<VecDeque<String>>>,
    pattern_regex: Option<Regex>,
    active_count: Arc<AtomicUsize>,
) -> Option<CrawlResult> {
    // Skip URLs that were already visited; otherwise record this one and mark the worker as busy.
    {
        let mut visited = visited.lock().unwrap();
        if visited.contains(&url) {
            return None;
        }
        visited.insert(url.clone());
        active_count.fetch_add(1, Ordering::SeqCst);
    }

    // Fetch the page body; on failure, release the busy slot before bailing out so idle
    // workers can still detect completion.
    let res = match client.get(&url).send().await {
        Ok(resp) => resp.text().await.ok(),
        Err(_) => None,
    };
    let Some(res) = res else {
        active_count.fetch_sub(1, Ordering::SeqCst);
        return None;
    };
    let url_clone = url.clone();
    let pattern_clone = pattern_regex.clone();

    // Move heavy processing to blocking thread pool
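    // (scraper's Html is not Send, and parsing large pages is CPU-bound, so all of it stays inside this closure.)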
    let parsed = tokio::task::spawn_blocking(move || {
        let document = Html::parse_document(&res);

        let title_selector = Selector::parse("title").unwrap();
        let title = document
            .select(&title_selector)
            .next()
            .map(|e| e.inner_html());

        let body_selector = Selector::parse("body").unwrap();
        let text = document
            .select(&body_selector)
            .next()
            .map(|e| e.text().collect::<Vec<_>>().join(" "));

        let link_selector = Selector::parse("a[href]").unwrap();
        let links: Vec<String> = document
            .select(&link_selector)
            .filter_map(|e| e.value().attr("href"))
            .map(|s| s.to_string())
            .collect();

        let img_selector = Selector::parse("img[src]").unwrap();
        let images: Vec<String> = document
            .select(&img_selector)
            .filter_map(|e| e.value().attr("src"))
            .map(|s| s.to_string())
            .collect();

        let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
            regex
                .captures_iter(text)
                .filter_map(|cap| cap.get(0).map(|m| m.as_str().to_string()))
                .collect()
        } else {
            vec![]
        };

        (title, text, links, images, pattern_matches)
    })
    .await;

    // A panicked parse task must not leave the busy count permanently raised, or idle workers would never exit.
    let Some((title, text, links, images, pattern_matches)) = parsed.ok() else {
        active_count.fetch_sub(1, Ordering::SeqCst);
        return None;
    };

    {
        let mut queue = to_visit.lock().unwrap();
        for link in &links {
            if link.starts_with("http") {
                queue.push_back(link.clone());
            }
        }
    }
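    // Everything from this page is parsed and its links are queued, so release the busy slot.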
    active_count.fetch_sub(1, Ordering::SeqCst);

    Some(CrawlResult {
        url: url_clone,
        title,
        text,
        links,
        images,
        pattern_matches,
    })

#[tokio::main]
async fn main() {
    let matches = App::new("Web Crawler")
        .version("1.0")
        .about("Multi-threaded web crawler with pattern matching")
        .arg(
            Arg::with_name("url")
                .help("Starting URL")
                .required(true),
        )
        .arg(
            Arg::with_name("pattern")
                .short("p")
                .long("pattern")
                .help("Regex pattern to match in page text")
                .takes_value(true),
        )
        .arg(
            Arg::with_name("concurrency")
                .short("c")
                .long("concurrency")
                .help("Max concurrent requests")
                .takes_value(true),
        )
        .get_matches();

    let start_url = matches.value_of("url").unwrap().to_string();
    let pattern = matches.value_of("pattern");
    let concurrency: usize = matches
        .value_of("concurrency")
        .unwrap_or("20")
        .parse()
        .unwrap_or(20);

    let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
    let client = Arc::new(Client::new());
    let visited = Arc::new(Mutex::new(HashSet::new()));
    let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
    let active_count = Arc::new(AtomicUsize::new(0));

    let mut tasks = FuturesUnordered::new();
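
    // Each of the `concurrency` workers drains the shared queue; a worker only exits once the
    // queue is empty and no peer is still mid-crawl (active_count == 0).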
    for _ in 0..concurrency {
        let client = client.clone();
        let visited = visited.clone();
        let to_visit = to_visit.clone();
        let pattern_regex = pattern_regex.clone();
        let active_count = active_count.clone();

        tasks.push(tokio::spawn(async move {
            let mut results = vec![];
            loop {
                let next_url = {
                    let mut queue = to_visit.lock().unwrap();
                    queue.pop_front()
                };

                if let Some(url) = next_url {
                    if let Some(result) = crawl_url(
                        url,
                        client.clone(),
                        visited.clone(),
                        to_visit.clone(),
                        pattern_regex.clone(),
                        active_count.clone(),
                    )
                    .await
                    {
                        results.push(result);
                    }
                } else {
                    if active_count.load(Ordering::SeqCst) == 0 {
                        break;
                    }
                    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
                }
            }
            results
        }));
    }

    let mut all_results = vec![];
    while let Some(result_set) = tasks.next().await {
        if let Ok(results) = result_set {
            all_results.extend(results);
        }
    }

    for res in all_results {
        println!("URL: {}", res.url);
        if let Some(title) = res.title {
            println!("Title: {}", title);
        }
        if let Some(text) = res.text {
            println!("Text (truncated): {:.100}...", text);
        }
        println!("Links: {}", res.links.len());
        println!("Images: {}", res.images.len());
        if !res.pattern_matches.is_empty() {
            println!("Pattern Matches: {:?}", res.pattern_matches);
        }
        println!("---");
    }
}
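
The crawler above only queues hrefs that already start with "http", so relative links are dropped even though the url crate is declared in Cargo.toml. A minimal sketch of how such links could be resolved against the page URL before queueing; the absolutize helper is hypothetical and not part of this commit:

use url::Url;

// Hypothetical helper: resolve an href (absolute or relative) against the page it came from.
// Returns None when the base URL or the joined result cannot be parsed.
fn absolutize(page_url: &str, href: &str) -> Option<String> {
    let base = Url::parse(page_url).ok()?;
    base.join(href).ok().map(|joined| joined.to_string())
}

With the clap definition in main(), an invocation might look like: crawler https://example.com --pattern "\d{4}" --concurrency 10 (the URL and pattern here are placeholders; only the positional URL argument is required).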