Doesn't work very well; fix in future.

This commit is contained in:
rattatwinko
2025-05-02 18:39:57 +02:00
parent 995741ecad
commit ce85f8bdba
3 changed files with 382 additions and 385 deletions

394
Cargo.lock generated
View File

@@ -17,19 +17,6 @@ version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]]
name = "ahash"
version = "0.8.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
dependencies = [
"cfg-if",
"getrandom 0.2.16",
"once_cell",
"version_check",
"zerocopy 0.7.35",
]
[[package]]
name = "aho-corasick"
version = "1.1.3"
@@ -146,16 +133,6 @@ dependencies = [
"vec_map",
]
[[package]]
name = "colored"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c"
dependencies = [
"lazy_static",
"windows-sys 0.59.0",
]
[[package]]
name = "console"
version = "0.15.11"
@@ -169,6 +146,12 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "convert_case"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]]
name = "core-foundation"
version = "0.9.4"
@@ -190,31 +173,55 @@ name = "crawler"
version = "0.1.0"
dependencies = [
"clap",
"colored",
"csv",
"futures",
"indicatif",
"regex",
"reqwest",
"scraper",
"serde",
"serde_json",
"structopt",
"tokio",
"url",
]
[[package]]
name = "cssparser"
version = "0.31.2"
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "cssparser"
version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa",
"itoa 0.4.8",
"matches",
"phf",
"proc-macro2",
"quote",
"smallvec",
"syn 1.0.109",
]
[[package]]
@@ -227,35 +234,16 @@ dependencies = [
"syn 2.0.101",
]
[[package]]
name = "csv"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf"
dependencies = [
"csv-core",
"itoa",
"ryu",
"serde",
]
[[package]]
name = "csv-core"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d"
dependencies = [
"memchr",
]
[[package]]
name = "derive_more"
version = "0.99.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f"
dependencies = [
"convert_case",
"proc-macro2",
"quote",
"rustc_version",
"syn 2.0.101",
]
@@ -291,6 +279,12 @@ version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642"
[[package]]
name = "either"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]]
name = "encode_unicode"
version = "1.0.0"
@@ -477,13 +471,13 @@ dependencies = [
[[package]]
name = "getrandom"
version = "0.2.16"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
dependencies = [
"cfg-if",
"libc",
"wasi 0.11.0+wasi-snapshot-preview1",
"wasi 0.9.0+wasi-snapshot-preview1",
]
[[package]]
@@ -529,15 +523,6 @@ version = "0.15.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3"
[[package]]
name = "heck"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
dependencies = [
"unicode-segmentation",
]
[[package]]
name = "hermit-abi"
version = "0.1.19"
@@ -549,9 +534,9 @@ dependencies = [
[[package]]
name = "html5ever"
version = "0.26.0"
version = "0.25.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148"
dependencies = [
"log",
"mac",
@@ -569,7 +554,7 @@ checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1"
dependencies = [
"bytes",
"fnv",
"itoa",
"itoa 1.0.15",
]
[[package]]
@@ -610,7 +595,7 @@ dependencies = [
"http-body",
"httparse",
"httpdate",
"itoa",
"itoa 1.0.15",
"pin-project-lite",
"socket2",
"tokio",
@@ -790,6 +775,7 @@ dependencies = [
"console",
"number_prefix",
"portable-atomic",
"rayon",
"unicode-width 0.2.0",
"web-time",
]
@@ -800,6 +786,12 @@ version = "2.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
[[package]]
name = "itoa"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
[[package]]
name = "itoa"
version = "1.0.15"
@@ -816,12 +808,6 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "lazy_static"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "libc"
version = "0.2.172"
@@ -864,9 +850,9 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.11.0"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
dependencies = [
"log",
"phf",
@@ -876,6 +862,12 @@ dependencies = [
"tendril",
]
[[package]]
name = "matches"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5"
[[package]]
name = "memchr"
version = "2.7.4"
@@ -931,6 +923,12 @@ version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]]
name = "nodrop"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
[[package]]
name = "number_prefix"
version = "0.4.0"
@@ -1027,33 +1025,33 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "phf"
version = "0.10.1"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_macros",
"phf_shared 0.10.0",
"phf_shared 0.8.0",
"proc-macro-hack",
]
[[package]]
name = "phf_codegen"
version = "0.10.0"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
"phf_generator 0.8.0",
"phf_shared 0.8.0",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared 0.10.0",
"rand",
"phf_shared 0.8.0",
"rand 0.7.3",
]
[[package]]
@@ -1063,17 +1061,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
dependencies = [
"phf_shared 0.11.3",
"rand",
"rand 0.8.5",
]
[[package]]
name = "phf_macros"
version = "0.10.0"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0"
checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
"phf_generator 0.8.0",
"phf_shared 0.8.0",
"proc-macro-hack",
"proc-macro2",
"quote",
@@ -1082,9 +1080,9 @@ dependencies = [
[[package]]
name = "phf_shared"
version = "0.10.0"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher 0.3.11",
]
@@ -1128,7 +1126,7 @@ version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
dependencies = [
"zerocopy 0.8.25",
"zerocopy",
]
[[package]]
@@ -1137,30 +1135,6 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro-error"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2",
"quote",
"syn 1.0.109",
"version_check",
]
[[package]]
name = "proc-macro-error-attr"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2",
"quote",
"version_check",
]
[[package]]
name = "proc-macro-hack"
version = "0.5.20+deprecated"
@@ -1191,25 +1165,46 @@ version = "5.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
[[package]]
name = "rand"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
dependencies = [
"getrandom 0.1.16",
"libc",
"rand_chacha",
"rand_core 0.5.1",
"rand_hc",
"rand_pcg",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha",
"rand_core",
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
dependencies = [
"ppv-lite86",
"rand_core",
"rand_core 0.5.1",
]
[[package]]
name = "rand_core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
dependencies = [
"getrandom 0.1.16",
]
[[package]]
@@ -1217,8 +1212,43 @@ name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
[[package]]
name = "rand_hc"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
dependencies = [
"getrandom 0.2.16",
"rand_core 0.5.1",
]
[[package]]
name = "rand_pcg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
@@ -1305,6 +1335,15 @@ version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
[[package]]
name = "rustc_version"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
dependencies = [
"semver",
]
[[package]]
name = "rustix"
version = "1.0.7"
@@ -1356,16 +1395,15 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "scraper"
version = "0.17.1"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c95a930e03325234c18c7071fd2b60118307e025d6fff3e12745ffbf63a3d29c"
checksum = "48e02aa790c80c2e494130dec6a522033b6a23603ffc06360e9fe6c611ea2c12"
dependencies = [
"ahash",
"cssparser",
"ego-tree",
"getopts",
"html5ever",
"once_cell",
"matches",
"selectors",
"smallvec",
"tendril",
@@ -1396,23 +1434,30 @@ dependencies = [
[[package]]
name = "selectors"
version = "0.25.0"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe"
dependencies = [
"bitflags 2.9.0",
"bitflags 1.3.2",
"cssparser",
"derive_more",
"fxhash",
"log",
"new_debug_unreachable",
"matches",
"phf",
"phf_codegen",
"precomputed-hash",
"servo_arc",
"smallvec",
"thin-slice",
]
[[package]]
name = "semver"
version = "1.0.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"
[[package]]
name = "serde"
version = "1.0.219"
@@ -1439,7 +1484,7 @@ version = "1.0.140"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373"
dependencies = [
"itoa",
"itoa 1.0.15",
"memchr",
"ryu",
"serde",
@@ -1452,17 +1497,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd"
dependencies = [
"form_urlencoded",
"itoa",
"itoa 1.0.15",
"ryu",
"serde",
]
[[package]]
name = "servo_arc"
version = "0.3.0"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44"
checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432"
dependencies = [
"nodrop",
"stable_deref_trait",
]
@@ -1555,30 +1601,6 @@ version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]]
name = "structopt"
version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10"
dependencies = [
"clap",
"lazy_static",
"structopt-derive",
]
[[package]]
name = "structopt-derive"
version = "0.4.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0"
dependencies = [
"heck",
"proc-macro-error",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "syn"
version = "1.0.109"
@@ -1672,6 +1694,12 @@ dependencies = [
"unicode-width 0.1.14",
]
[[package]]
name = "thin-slice"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
[[package]]
name = "tinystr"
version = "0.7.6"
@@ -1771,12 +1799,6 @@ version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "unicode-segmentation"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
[[package]]
name = "unicode-width"
version = "0.1.14"
@@ -1830,12 +1852,6 @@ version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]]
name = "version_check"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "want"
version = "0.3.1"
@@ -1845,6 +1861,12 @@ dependencies = [
"try-lock",
]
[[package]]
name = "wasi"
version = "0.9.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
@@ -2176,33 +2198,13 @@ dependencies = [
"synstructure",
]
[[package]]
name = "zerocopy"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
"zerocopy-derive 0.7.35",
]
[[package]]
name = "zerocopy"
version = "0.8.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb"
dependencies = [
"zerocopy-derive 0.8.25",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.101",
"zerocopy-derive",
]
[[package]]

View File

@@ -4,16 +4,11 @@ version = "0.1.0"
edition = "2024"
[dependencies]
tokio = { version = "1.28", features = ["full"] }
indicatif = { version = "0.17", features = ["rayon"] }
tokio = { version = "1.0", features = ["full"] }
reqwest = { version = "0.11", features = ["json"] }
scraper = "0.12"
regex = "1.5"
url = "2.2"
clap = "2.33"
futures = "0.3"
scraper = "0.17"
structopt = "0.3"
indicatif = "0.17"
colored = "2.0"
regex = "1.10"
url = "2.4"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
csv = "1.2"
clap = "2.34.0"

View File

@@ -1,14 +1,16 @@
use std::collections::{HashSet, VecDeque};
use std::collections::{HashMap, HashSet, VecDeque};
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
use clap::{App, Arg};
use futures::stream::{FuturesUnordered, StreamExt};
use indicatif::{ProgressBar, ProgressStyle};
use regex::Regex;
use reqwest::Client;
use scraper::{Html, Selector};
use tokio;
use tokio::sync::Mutex;
use url::Url;
#[derive(Debug, Clone)]
@@ -22,251 +24,249 @@ struct CrawlResult {
pattern_matches: Vec<String>,
}
/// Crawl limits and options; a clone is handed to every worker task and to
/// each `crawl_url` call.
#[derive(Debug, Clone)]
struct CrawlConfig {
    /// Upper bound on collected pages; also used as the progress bar length.
    max_pages: usize,
    /// Pages whose recorded depth exceeds this value are not fetched.
    max_depth: usize,
    /// Minimum pause, in milliseconds, between two requests to the same host.
    delay_ms: u64,
    /// Optional pattern searched for in each page's body text.
    pattern_regex: Option<Regex>,
}
/// Crawls a single page: fetches `url`, parses the HTML inside
/// `spawn_blocking`, extracts the title, body text, link targets, image
/// sources and regex matches, and pushes discovered `http` links onto
/// `to_visit`.
///
/// Returns `None` when the URL cannot be parsed or has no host, when it is
/// already in `visited`, when its depth exceeds `config.max_depth`, when the
/// request or body read fails, or when the blocking parse task fails to join.
///
/// NOTE(review): this region is a rendered diff. Removed and added lines are
/// interleaved without +/- markers (two `to_visit` parameters, two `let res`
/// bindings, ...), so it does not compile as shown.
async fn crawl_url(
url: String,
client: Arc<Client>,
visited: Arc<Mutex<HashSet<String>>>,
to_visit: Arc<Mutex<VecDeque<String>>>,
pattern_regex: Option<Regex>,
to_visit: Arc<Mutex<VecDeque<(String, usize)>>>,
active_count: Arc<AtomicUsize>,
last_request: Arc<Mutex<HashMap<String, Instant>>>,
config: CrawlConfig,
) -> Option<CrawlResult> {
println!("[🌐] Attempting: {}", url);
let domain = match Url::parse(&url).ok()?.host_str() {
Some(d) => d.to_string(),
None => return None,
};
if {
let mut visited = visited.lock().unwrap();
if visited.contains(&url) {
println!("[⏭️] Already visited: {}", url);
{
// Per-host throttle: wait until `config.delay_ms` has elapsed since the last
// request recorded for this host, then record the new request time.
// NOTE(review): the `last_req` guard stays in scope during the sleep, so other
// tasks locking `last_request` appear to wait for the whole delay — confirm.
let mut last_req = last_request.lock().await;
if let Some(last) = last_req.get(&domain) {
let elapsed = last.elapsed().as_millis() as u64;
if elapsed < config.delay_ms {
tokio::time::sleep(Duration::from_millis(config.delay_ms - elapsed)).await;
}
}
last_req.insert(domain, Instant::now());
}
{
let mut visited_set = visited.lock().await;
if visited_set.contains(&url) {
return None;
}
visited.insert(url.clone());
active_count.fetch_add(1, Ordering::SeqCst);
true
} == false {
visited_set.insert(url.clone());
}
// NOTE(review): the depth is looked up in the pending queue, but the worker
// loop in `main` pops the entry before calling this function; presumably the
// lookup falls back to 0 unless a duplicate entry is still queued — confirm.
let current_depth = {
let queue = to_visit.lock().await;
queue.iter().find(|(u, _)| u == &url).map(|(_, d)| *d).unwrap_or(0)
};
if current_depth > config.max_depth {
return None;
}
let res = match client.get(&url).send().await {
Ok(response) => {
println!("[✅] Success: {}", url);
response.text().await.ok()?
}
// NOTE(review): the matching `fetch_sub` only runs at the end of the function;
// the early `return None` / `?` exits below do not decrement, so a failed
// fetch appears to leave the counter raised — confirm.
active_count.fetch_add(1, Ordering::SeqCst);
let res = match client.get(&url).timeout(Duration::from_secs(5)).send().await {
Ok(response) => match response.text().await {
Ok(text) => text,
Err(e) => {
println!("[❌] Failed to read text: {} - {}", url, e);
return None;
}
},
Err(e) => {
println!("[❌] Failed: {} - {}", url, e);
println!("[❌] Failed to fetch: {} - {}", url, e);
return None;
}
};
// `res` is moved into the blocking closure below, so keep a copy of the raw
// HTML for the returned `CrawlResult`.
let html_content = res.clone();
let url_clone = url.clone();
let pattern_clone = pattern_regex.clone();
let pattern_clone = config.pattern_regex.clone();
println!("[🔍] Processing: {}", url);
let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
let document = Html::parse_document(&res);
let title = document.select(&Selector::parse("title").unwrap()).next().map(|e| e.inner_html());
let text = document.select(&Selector::parse("body").unwrap()).next().map(|e| e.text().collect::<String>());
let title_selector = Selector::parse("title").unwrap();
let title = document
.select(&title_selector)
.next()
.map(|e| e.inner_html());
let body_selector = Selector::parse("body").unwrap();
let text = document
.select(&body_selector)
.next()
.map(|e| e.text().collect::<Vec<_>>().join(" "));
let link_selector = Selector::parse("a[href]").unwrap();
let links: Vec<String> = document
.select(&link_selector)
let links: Vec<_> = document.select(&Selector::parse("a[href]").unwrap())
.filter_map(|e| e.value().attr("href"))
.map(|s| s.to_string())
.filter_map(|s| Url::parse(s)
.or_else(|_| Url::parse(&format!("{}/{}", url_clone, s.trim_start_matches('/'))))
.ok())
.map(|u| u.to_string())
.collect();
let img_selector = Selector::parse("img[src]").unwrap();
let images: Vec<String> = document
.select(&img_selector)
let images = document.select(&Selector::parse("img[src]").unwrap())
.filter_map(|e| e.value().attr("src"))
.map(|s| s.to_string())
.collect();
// Regex matches are collected only when both a pattern and body text exist.
let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
regex
.captures_iter(text)
.filter_map(|cap| cap.get(0).map(|m| m.as_str().to_string()))
.collect()
} else {
vec![]
};
let pattern_matches = pattern_clone.as_ref().zip(text.as_ref())
.map(|(re, txt)| re.captures_iter(txt).filter_map(|c| c.get(0).map(|m| m.as_str().to_string())).collect())
.unwrap_or_default();
(title, text, links, images, pattern_matches)
})
.await
.ok()?;
}).await.ok()?;
{
// Queue links that start with "http" and are not yet visited, one level
// deeper than the current page.
let mut queue = to_visit.lock().unwrap();
let mut new_links = 0;
for link in &links {
if link.starts_with("http") {
queue.push_back(link.clone());
new_links += 1;
let mut queue = to_visit.lock().await;
let visited = visited.lock().await; // Lock once here
for link in links.iter() {
if link.starts_with("http") && !visited.contains(link) {
queue.push_back((link.clone(), current_depth + 1));
}
}
println!("[🔄] Discovered {} new links from {}", new_links, url);
}
} // Locks released here
active_count.fetch_sub(1, Ordering::SeqCst);
Some(CrawlResult {
url: url_clone,
html: html_content,
title,
text,
links,
images,
pattern_matches,
})
Some(CrawlResult { url, html: html_content, title, text, links, images, pattern_matches })
}
/// Derives a filesystem-safe `.html` file name from a page URL.
///
/// The URL's path segments are joined with `_`, every character that is not
/// ASCII alphanumeric is replaced by `_`, the result is capped at 50
/// characters and surrounding underscores are trimmed.
///
/// Falls back to `index` when the URL cannot be parsed, has no path segments,
/// or its path reduces to nothing (e.g. `http://example.com/`), so the
/// function never returns a bare `.html` and never panics on bad input.
fn sanitize_filename(url: &str) -> String {
    const MAX_STEM_LEN: usize = 50;
    const FALLBACK_STEM: &str = "index";

    // An unparsable URL or one without path segments yields an empty stem,
    // which is mapped to the fallback below.
    let joined = Url::parse(url)
        .ok()
        .and_then(|parsed| {
            parsed
                .path_segments()
                .map(|segments| segments.collect::<Vec<_>>().join("_"))
        })
        .unwrap_or_default();

    // Every surviving character is ASCII, so capping by chars equals capping
    // by bytes and cannot split a code point.
    let sanitized: String = joined
        .chars()
        .map(|c| if c.is_ascii_alphanumeric() { c } else { '_' })
        .take(MAX_STEM_LEN)
        .collect();

    // Apply the fallback after trimming so paths like "/" or "/__" are covered.
    let stem = sanitized.trim_matches('_');
    let stem = if stem.is_empty() { FALLBACK_STEM } else { stem };
    format!("{}.html", stem)
}
/// Entry point: parses the command-line arguments, spawns the worker tasks
/// that drain the shared URL queue through `crawl_url`, gathers their results
/// and writes each page's HTML into a directory named after the start URL's
/// host (dots replaced by underscores).
///
/// NOTE(review): this region is a rendered diff. Removed and added lines are
/// interleaved without +/- markers (two `async fn main` headers, two worker
/// loops, ...), so it does not compile as shown.
#[tokio::main]
async fn main() {
println!("[🚀] Starting web crawler...");
let matches = App::new("Web Crawler")
.version("1.0")
.about("Multi-threaded web crawler with pattern matching and website storage")
.arg(
Arg::with_name("url")
.help("Starting URL")
.required(true),
)
.arg(
Arg::with_name("pattern")
.short("p")
.long("pattern")
.help("Regex pattern to match in page text")
.takes_value(true),
)
.arg(
Arg::with_name("concurrency")
.short("c")
.long("concurrency")
.help("Max concurrent requests")
.takes_value(true),
)
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let matches = App::new("Web Crawler Pro")
.arg(Arg::with_name("url").required(true))
// NOTE(review): unlike the `pattern` definition earlier in this region, this
// one has no `takes_value(true)`; in clap 2 such an argument is presumably a
// flag, so `value_of("pattern")` would yield `None` — confirm.
.arg(Arg::with_name("pattern").short("p").long("pattern"))
.arg(Arg::with_name("concurrency").short("c").long("concurrency").default_value("20"))
.arg(Arg::with_name("max_pages").long("max-pages").default_value("1000"))
.arg(Arg::with_name("max_depth").long("max-depth").default_value("3"))
.arg(Arg::with_name("delay").long("delay-ms").default_value("100"))
.get_matches();
let start_url = matches.value_of("url").unwrap().to_string();
let pattern = matches.value_of("pattern");
let concurrency: usize = matches
.value_of("concurrency")
.unwrap_or("20")
.parse()
.unwrap_or(20);
let config = CrawlConfig {
max_pages: matches.value_of("max_pages").unwrap().parse()?,
max_depth: matches.value_of("max_depth").unwrap().parse()?,
delay_ms: matches.value_of("delay").unwrap().parse()?,
pattern_regex: matches.value_of("pattern").map(|p| Regex::new(p).unwrap()),
};
let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
let client = Arc::new(Client::new());
let pb = ProgressBar::new(config.max_pages as u64);
pb.set_style(
ProgressStyle::default_bar()
.template("{spinner:.green} [{elapsed_precise}] [{wide_bar:.cyan/blue}] {pos}/{len} ({eta}) | {msg}")
.unwrap()
.progress_chars("#>-")
);
let parsed_url = Url::parse(&start_url).expect("Invalid URL");
let domain = parsed_url.host_str().unwrap().replace('.', "_");
let client = Arc::new(Client::builder()
.pool_max_idle_per_host(20)
.timeout(Duration::from_secs(5))
.build()?);
let start_url = matches.value_of("url").unwrap();
let domain = Url::parse(start_url)?.host_str().unwrap().replace('.', "_");
let output_dir = format!("./{}", domain);
std::fs::create_dir_all(&output_dir).expect("Failed to create output directory");
println!("[📂] Created output directory: {}", output_dir);
tokio::fs::create_dir_all(&output_dir).await?;
// Shared crawl state handed to every worker; the queue is seeded with the
// start URL (at depth 0 in the tuple form).
let visited = Arc::new(Mutex::new(HashSet::new()));
let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
let to_visit = Arc::new(Mutex::new(VecDeque::from([(start_url.to_string(), 0)])));
let active_count = Arc::new(AtomicUsize::new(0));
let last_request = Arc::new(Mutex::new(HashMap::new()));
let mut tasks = FuturesUnordered::new();
for _ in 0..matches.value_of("concurrency").unwrap().parse()? {
tasks.push(tokio::spawn({
let client = client.clone();
let visited = visited.clone();
let to_visit = to_visit.clone();
let active_count = active_count.clone();
let last_request = last_request.clone();
let config = config.clone();
let pb = pb.clone();
println!("[🔄] Initializing {} worker tasks...", concurrency);
for _ in 0..concurrency {
let client = client.clone();
let visited = visited.clone();
let to_visit = to_visit.clone();
let pattern_regex = pattern_regex.clone();
let active_count = active_count.clone();
async move {
let mut results = Vec::new();
loop {
let next_url = {
let mut queue = to_visit.lock().await;
queue.pop_front()
};
tasks.push(tokio::spawn(async move {
let mut results = vec![];
loop {
let next_url = {
let mut queue = to_visit.lock().unwrap();
queue.pop_front()
};
if let Some(url) = next_url {
println!("[📥] Processing: {}", url);
if let Some(result) = crawl_url(
url,
client.clone(),
visited.clone(),
to_visit.clone(),
pattern_regex.clone(),
active_count.clone(),
)
.await
{
results.push(result);
}
} else {
let active = active_count.load(Ordering::SeqCst);
if active == 0 {
println!("[🛑] Worker detected completion");
// NOTE(review): this check sits after the `pop_front` above and before
// `next_url` is matched; if the popped entry was the last one queued and no
// crawl is active (e.g. the start URL), the loop presumably breaks and drops
// that URL without crawling it — confirm.
if active_count.load(Ordering::SeqCst) == 0 && to_visit.lock().await.is_empty() {
pb.finish_and_clear();
break;
}
println!("[⏳] Worker waiting (active: {})...", active);
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
match next_url {
Some((url, depth)) => {
// NOTE(review): `results` is local to this worker, so this limit applies per
// worker rather than to the crawl as a whole.
if results.len() >= config.max_pages {
break;
}
if let Some(result) = crawl_url(
url,
client.clone(),
visited.clone(),
to_visit.clone(),
active_count.clone(),
last_request.clone(),
config.clone(),
).await {
// Progress is reported as the number of visited URLs.
let visited = visited.lock().await.len();
pb.set_position(visited as u64);
pb.set_message(format!(
"Active: {} | Queued: {} | Depth: {}",
active_count.load(Ordering::SeqCst),
to_visit.lock().await.len(),
depth
));
results.push(result);
}
}
None => {
// Empty queue: stop once no crawl is in flight, otherwise poll again shortly.
if active_count.load(Ordering::SeqCst) == 0 {
break;
}
tokio::time::sleep(Duration::from_millis(100)).await;
}
}
}
results
}
results
}));
}
println!("[📊] Crawling in progress...");
let mut all_results = vec![];
while let Some(result_set) = tasks.next().await {
match result_set {
Ok(results) => {
println!("[✔️] Worker completed with {} results", results.len());
all_results.extend(results);
let mut all_results = Vec::new();
while let Some(result) = tasks.next().await {
match result {
Ok(mut res) => {
all_results.append(&mut res);
if all_results.len() >= config.max_pages {
break;
}
}
Err(e) => println!("[⚠️] Worker error: {}", e),
Err(e) => eprintln!("[⚠️] Task error: {}", e),
}
}
println!("\n[🏁] Crawling completed!");
println!("[📋] Total pages crawled: {}", all_results.len());
println!("[💾] Saving results to {}...", output_dir);
for res in all_results {
let filename = sanitize_filename(&res.url);
let file_path = Path::new(&output_dir).join(filename);
tokio::fs::write(&file_path, &res.html)
.await
.unwrap_or_else(|_| panic!("Failed to write: {}", file_path.display()));
println!("[💾] Saved: {}", file_path.display());
pb.finish_with_message("Saving files...");
for res in all_results.iter() { // Use iter() to borrow instead of move
let path = Path::new(&output_dir).join(sanitize_filename(&res.url));
tokio::fs::write(&path, &res.html).await?; // Add reference here too
}
println!("[✅] Done! Website stored in: {}", output_dir);
}
println!("[✅] Crawled {} pages to {}", all_results.len(), output_dir);
Ok(())
}