dont work very well fix in future
This commit is contained in:
394
Cargo.lock
generated
394
Cargo.lock
generated
@@ -17,19 +17,6 @@ version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
|
||||
|
||||
[[package]]
|
||||
name = "ahash"
|
||||
version = "0.8.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"getrandom 0.2.16",
|
||||
"once_cell",
|
||||
"version_check",
|
||||
"zerocopy 0.7.35",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aho-corasick"
|
||||
version = "1.1.3"
|
||||
@@ -146,16 +133,6 @@ dependencies = [
|
||||
"vec_map",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "colored"
|
||||
version = "2.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c"
|
||||
dependencies = [
|
||||
"lazy_static",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "console"
|
||||
version = "0.15.11"
|
||||
@@ -169,6 +146,12 @@ dependencies = [
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "convert_case"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.9.4"
|
||||
@@ -190,31 +173,55 @@ name = "crawler"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"colored",
|
||||
"csv",
|
||||
"futures",
|
||||
"indicatif",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"structopt",
|
||||
"tokio",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.31.2"
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be"
|
||||
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
||||
dependencies = [
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.27.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a"
|
||||
dependencies = [
|
||||
"cssparser-macros",
|
||||
"dtoa-short",
|
||||
"itoa",
|
||||
"itoa 0.4.8",
|
||||
"matches",
|
||||
"phf",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"smallvec",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -227,35 +234,16 @@ dependencies = [
|
||||
"syn 2.0.101",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv"
|
||||
version = "1.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf"
|
||||
dependencies = [
|
||||
"csv-core",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv-core"
|
||||
version = "0.1.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "0.99.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f"
|
||||
dependencies = [
|
||||
"convert_case",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustc_version",
|
||||
"syn 2.0.101",
|
||||
]
|
||||
|
||||
@@ -291,6 +279,12 @@ version = "0.6.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642"
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
||||
|
||||
[[package]]
|
||||
name = "encode_unicode"
|
||||
version = "1.0.0"
|
||||
@@ -477,13 +471,13 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.16"
|
||||
version = "0.1.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
|
||||
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi 0.11.0+wasi-snapshot-preview1",
|
||||
"wasi 0.9.0+wasi-snapshot-preview1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -529,15 +523,6 @@ version = "0.15.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3"
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
|
||||
dependencies = [
|
||||
"unicode-segmentation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hermit-abi"
|
||||
version = "0.1.19"
|
||||
@@ -549,9 +534,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "html5ever"
|
||||
version = "0.26.0"
|
||||
version = "0.25.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
|
||||
checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148"
|
||||
dependencies = [
|
||||
"log",
|
||||
"mac",
|
||||
@@ -569,7 +554,7 @@ checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
"itoa",
|
||||
"itoa 1.0.15",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -610,7 +595,7 @@ dependencies = [
|
||||
"http-body",
|
||||
"httparse",
|
||||
"httpdate",
|
||||
"itoa",
|
||||
"itoa 1.0.15",
|
||||
"pin-project-lite",
|
||||
"socket2",
|
||||
"tokio",
|
||||
@@ -790,6 +775,7 @@ dependencies = [
|
||||
"console",
|
||||
"number_prefix",
|
||||
"portable-atomic",
|
||||
"rayon",
|
||||
"unicode-width 0.2.0",
|
||||
"web-time",
|
||||
]
|
||||
@@ -800,6 +786,12 @@ version = "2.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "0.4.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.15"
|
||||
@@ -816,12 +808,6 @@ dependencies = [
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lazy_static"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.172"
|
||||
@@ -864,9 +850,9 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever"
|
||||
version = "0.11.0"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
|
||||
checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd"
|
||||
dependencies = [
|
||||
"log",
|
||||
"phf",
|
||||
@@ -876,6 +862,12 @@ dependencies = [
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matches"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
@@ -931,6 +923,12 @@ version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
|
||||
|
||||
[[package]]
|
||||
name = "nodrop"
|
||||
version = "0.1.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
|
||||
|
||||
[[package]]
|
||||
name = "number_prefix"
|
||||
version = "0.4.0"
|
||||
@@ -1027,33 +1025,33 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.10.1"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
|
||||
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
|
||||
dependencies = [
|
||||
"phf_macros",
|
||||
"phf_shared 0.10.0",
|
||||
"phf_shared 0.8.0",
|
||||
"proc-macro-hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.10.0"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
|
||||
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
|
||||
dependencies = [
|
||||
"phf_generator 0.10.0",
|
||||
"phf_shared 0.10.0",
|
||||
"phf_generator 0.8.0",
|
||||
"phf_shared 0.8.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.10.0"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
|
||||
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
|
||||
dependencies = [
|
||||
"phf_shared 0.10.0",
|
||||
"rand",
|
||||
"phf_shared 0.8.0",
|
||||
"rand 0.7.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1063,17 +1061,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d"
|
||||
dependencies = [
|
||||
"phf_shared 0.11.3",
|
||||
"rand",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_macros"
|
||||
version = "0.10.0"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0"
|
||||
checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c"
|
||||
dependencies = [
|
||||
"phf_generator 0.10.0",
|
||||
"phf_shared 0.10.0",
|
||||
"phf_generator 0.8.0",
|
||||
"phf_shared 0.8.0",
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -1082,9 +1080,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.10.0"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
|
||||
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
|
||||
dependencies = [
|
||||
"siphasher 0.3.11",
|
||||
]
|
||||
@@ -1128,7 +1126,7 @@ version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
|
||||
dependencies = [
|
||||
"zerocopy 0.8.25",
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1137,30 +1135,6 @@ version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
||||
dependencies = [
|
||||
"proc-macro-error-attr",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-error-attr"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-hack"
|
||||
version = "0.5.20+deprecated"
|
||||
@@ -1191,25 +1165,46 @@ version = "5.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
|
||||
dependencies = [
|
||||
"getrandom 0.1.16",
|
||||
"libc",
|
||||
"rand_chacha",
|
||||
"rand_core 0.5.1",
|
||||
"rand_hc",
|
||||
"rand_pcg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core",
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
|
||||
dependencies = [
|
||||
"getrandom 0.1.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1217,8 +1212,43 @@ name = "rand_core"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
|
||||
dependencies = [
|
||||
"getrandom 0.2.16",
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_pcg"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
|
||||
dependencies = [
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
|
||||
dependencies = [
|
||||
"either",
|
||||
"rayon-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
|
||||
dependencies = [
|
||||
"crossbeam-deque",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1305,6 +1335,15 @@ version = "0.1.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f"
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
|
||||
dependencies = [
|
||||
"semver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "1.0.7"
|
||||
@@ -1356,16 +1395,15 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||
|
||||
[[package]]
|
||||
name = "scraper"
|
||||
version = "0.17.1"
|
||||
version = "0.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c95a930e03325234c18c7071fd2b60118307e025d6fff3e12745ffbf63a3d29c"
|
||||
checksum = "48e02aa790c80c2e494130dec6a522033b6a23603ffc06360e9fe6c611ea2c12"
|
||||
dependencies = [
|
||||
"ahash",
|
||||
"cssparser",
|
||||
"ego-tree",
|
||||
"getopts",
|
||||
"html5ever",
|
||||
"once_cell",
|
||||
"matches",
|
||||
"selectors",
|
||||
"smallvec",
|
||||
"tendril",
|
||||
@@ -1396,23 +1434,30 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "selectors"
|
||||
version = "0.25.0"
|
||||
version = "0.22.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06"
|
||||
checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe"
|
||||
dependencies = [
|
||||
"bitflags 2.9.0",
|
||||
"bitflags 1.3.2",
|
||||
"cssparser",
|
||||
"derive_more",
|
||||
"fxhash",
|
||||
"log",
|
||||
"new_debug_unreachable",
|
||||
"matches",
|
||||
"phf",
|
||||
"phf_codegen",
|
||||
"precomputed-hash",
|
||||
"servo_arc",
|
||||
"smallvec",
|
||||
"thin-slice",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.219"
|
||||
@@ -1439,7 +1484,7 @@ version = "1.0.140"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373"
|
||||
dependencies = [
|
||||
"itoa",
|
||||
"itoa 1.0.15",
|
||||
"memchr",
|
||||
"ryu",
|
||||
"serde",
|
||||
@@ -1452,17 +1497,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"itoa",
|
||||
"itoa 1.0.15",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "servo_arc"
|
||||
version = "0.3.0"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44"
|
||||
checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432"
|
||||
dependencies = [
|
||||
"nodrop",
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
@@ -1555,30 +1601,6 @@ version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
|
||||
|
||||
[[package]]
|
||||
name = "structopt"
|
||||
version = "0.3.26"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10"
|
||||
dependencies = [
|
||||
"clap",
|
||||
"lazy_static",
|
||||
"structopt-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "structopt-derive"
|
||||
version = "0.4.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro-error",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.109"
|
||||
@@ -1672,6 +1694,12 @@ dependencies = [
|
||||
"unicode-width 0.1.14",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thin-slice"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
|
||||
|
||||
[[package]]
|
||||
name = "tinystr"
|
||||
version = "0.7.6"
|
||||
@@ -1771,12 +1799,6 @@ version = "1.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-segmentation"
|
||||
version = "1.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.1.14"
|
||||
@@ -1830,12 +1852,6 @@ version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||
|
||||
[[package]]
|
||||
name = "want"
|
||||
version = "0.3.1"
|
||||
@@ -1845,6 +1861,12 @@ dependencies = [
|
||||
"try-lock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.9.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.0+wasi-snapshot-preview1"
|
||||
@@ -2176,33 +2198,13 @@ dependencies = [
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.7.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
||||
dependencies = [
|
||||
"zerocopy-derive 0.7.35",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.8.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb"
|
||||
dependencies = [
|
||||
"zerocopy-derive 0.8.25",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy-derive"
|
||||
version = "0.7.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.101",
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
||||
17
Cargo.toml
17
Cargo.toml
@@ -4,16 +4,11 @@ version = "0.1.0"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
tokio = { version = "1.28", features = ["full"] }
|
||||
indicatif = { version = "0.17", features = ["rayon"] }
|
||||
tokio = { version = "1.0", features = ["full"] }
|
||||
reqwest = { version = "0.11", features = ["json"] }
|
||||
scraper = "0.12"
|
||||
regex = "1.5"
|
||||
url = "2.2"
|
||||
clap = "2.33"
|
||||
futures = "0.3"
|
||||
scraper = "0.17"
|
||||
structopt = "0.3"
|
||||
indicatif = "0.17"
|
||||
colored = "2.0"
|
||||
regex = "1.10"
|
||||
url = "2.4"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
csv = "1.2"
|
||||
clap = "2.34.0"
|
||||
320
src/main.rs
320
src/main.rs
@@ -1,14 +1,16 @@
|
||||
use std::collections::{HashSet, VecDeque};
|
||||
use std::collections::{HashMap, HashSet, VecDeque};
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use clap::{App, Arg};
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use indicatif::{ProgressBar, ProgressStyle};
|
||||
use regex::Regex;
|
||||
use reqwest::Client;
|
||||
use scraper::{Html, Selector};
|
||||
use tokio;
|
||||
use tokio::sync::Mutex;
|
||||
use url::Url;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -22,251 +24,249 @@ struct CrawlResult {
|
||||
pattern_matches: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
struct CrawlConfig {
|
||||
max_pages: usize,
|
||||
max_depth: usize,
|
||||
delay_ms: u64,
|
||||
pattern_regex: Option<Regex>,
|
||||
}
|
||||
|
||||
async fn crawl_url(
|
||||
url: String,
|
||||
client: Arc<Client>,
|
||||
visited: Arc<Mutex<HashSet<String>>>,
|
||||
to_visit: Arc<Mutex<VecDeque<String>>>,
|
||||
pattern_regex: Option<Regex>,
|
||||
to_visit: Arc<Mutex<VecDeque<(String, usize)>>>,
|
||||
active_count: Arc<AtomicUsize>,
|
||||
last_request: Arc<Mutex<HashMap<String, Instant>>>,
|
||||
config: CrawlConfig,
|
||||
) -> Option<CrawlResult> {
|
||||
println!("[🌐] Attempting: {}", url);
|
||||
let domain = match Url::parse(&url).ok()?.host_str() {
|
||||
Some(d) => d.to_string(),
|
||||
None => return None,
|
||||
};
|
||||
|
||||
if {
|
||||
let mut visited = visited.lock().unwrap();
|
||||
if visited.contains(&url) {
|
||||
println!("[⏭️] Already visited: {}", url);
|
||||
{
|
||||
let mut last_req = last_request.lock().await;
|
||||
if let Some(last) = last_req.get(&domain) {
|
||||
let elapsed = last.elapsed().as_millis() as u64;
|
||||
if elapsed < config.delay_ms {
|
||||
tokio::time::sleep(Duration::from_millis(config.delay_ms - elapsed)).await;
|
||||
}
|
||||
}
|
||||
last_req.insert(domain, Instant::now());
|
||||
}
|
||||
|
||||
{
|
||||
let mut visited_set = visited.lock().await;
|
||||
if visited_set.contains(&url) {
|
||||
return None;
|
||||
}
|
||||
visited.insert(url.clone());
|
||||
visited_set.insert(url.clone());
|
||||
}
|
||||
|
||||
let current_depth = {
|
||||
let queue = to_visit.lock().await;
|
||||
queue.iter().find(|(u, _)| u == &url).map(|(_, d)| *d).unwrap_or(0)
|
||||
};
|
||||
|
||||
if current_depth > config.max_depth {
|
||||
return None;
|
||||
}
|
||||
|
||||
active_count.fetch_add(1, Ordering::SeqCst);
|
||||
true
|
||||
} == false {
|
||||
|
||||
let res = match client.get(&url).timeout(Duration::from_secs(5)).send().await {
|
||||
Ok(response) => match response.text().await {
|
||||
Ok(text) => text,
|
||||
Err(e) => {
|
||||
println!("[❌] Failed to read text: {} - {}", url, e);
|
||||
return None;
|
||||
}
|
||||
|
||||
let res = match client.get(&url).send().await {
|
||||
Ok(response) => {
|
||||
println!("[✅] Success: {}", url);
|
||||
response.text().await.ok()?
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
println!("[❌] Failed: {} - {}", url, e);
|
||||
println!("[❌] Failed to fetch: {} - {}", url, e);
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let html_content = res.clone();
|
||||
let url_clone = url.clone();
|
||||
let pattern_clone = pattern_regex.clone();
|
||||
let pattern_clone = config.pattern_regex.clone();
|
||||
|
||||
println!("[🔍] Processing: {}", url);
|
||||
let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || {
|
||||
let document = Html::parse_document(&res);
|
||||
let title = document.select(&Selector::parse("title").unwrap()).next().map(|e| e.inner_html());
|
||||
let text = document.select(&Selector::parse("body").unwrap()).next().map(|e| e.text().collect::<String>());
|
||||
|
||||
let title_selector = Selector::parse("title").unwrap();
|
||||
let title = document
|
||||
.select(&title_selector)
|
||||
.next()
|
||||
.map(|e| e.inner_html());
|
||||
|
||||
let body_selector = Selector::parse("body").unwrap();
|
||||
let text = document
|
||||
.select(&body_selector)
|
||||
.next()
|
||||
.map(|e| e.text().collect::<Vec<_>>().join(" "));
|
||||
|
||||
let link_selector = Selector::parse("a[href]").unwrap();
|
||||
let links: Vec<String> = document
|
||||
.select(&link_selector)
|
||||
let links: Vec<_> = document.select(&Selector::parse("a[href]").unwrap())
|
||||
.filter_map(|e| e.value().attr("href"))
|
||||
.map(|s| s.to_string())
|
||||
.filter_map(|s| Url::parse(s)
|
||||
.or_else(|_| Url::parse(&format!("{}/{}", url_clone, s.trim_start_matches('/'))))
|
||||
.ok())
|
||||
.map(|u| u.to_string())
|
||||
.collect();
|
||||
|
||||
let img_selector = Selector::parse("img[src]").unwrap();
|
||||
let images: Vec<String> = document
|
||||
.select(&img_selector)
|
||||
let images = document.select(&Selector::parse("img[src]").unwrap())
|
||||
.filter_map(|e| e.value().attr("src"))
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
|
||||
let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) {
|
||||
regex
|
||||
.captures_iter(text)
|
||||
.filter_map(|cap| cap.get(0).map(|m| m.as_str().to_string()))
|
||||
.collect()
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
let pattern_matches = pattern_clone.as_ref().zip(text.as_ref())
|
||||
.map(|(re, txt)| re.captures_iter(txt).filter_map(|c| c.get(0).map(|m| m.as_str().to_string())).collect())
|
||||
.unwrap_or_default();
|
||||
|
||||
(title, text, links, images, pattern_matches)
|
||||
})
|
||||
.await
|
||||
.ok()?;
|
||||
}).await.ok()?;
|
||||
|
||||
{
|
||||
let mut queue = to_visit.lock().unwrap();
|
||||
let mut new_links = 0;
|
||||
for link in &links {
|
||||
if link.starts_with("http") {
|
||||
queue.push_back(link.clone());
|
||||
new_links += 1;
|
||||
let mut queue = to_visit.lock().await;
|
||||
let visited = visited.lock().await; // Lock once here
|
||||
|
||||
for link in links.iter() {
|
||||
if link.starts_with("http") && !visited.contains(link) {
|
||||
queue.push_back((link.clone(), current_depth + 1));
|
||||
}
|
||||
}
|
||||
println!("[🔄] Discovered {} new links from {}", new_links, url);
|
||||
}
|
||||
} // Locks released here
|
||||
|
||||
|
||||
active_count.fetch_sub(1, Ordering::SeqCst);
|
||||
|
||||
Some(CrawlResult {
|
||||
url: url_clone,
|
||||
html: html_content,
|
||||
title,
|
||||
text,
|
||||
links,
|
||||
images,
|
||||
pattern_matches,
|
||||
})
|
||||
Some(CrawlResult { url, html: html_content, title, text, links, images, pattern_matches })
|
||||
}
|
||||
|
||||
fn sanitize_filename(url: &str) -> String {
|
||||
let parsed = Url::parse(url).unwrap();
|
||||
let mut filename = parsed.path().replace('/', "__");
|
||||
|
||||
if filename.is_empty() || filename == "__" {
|
||||
filename = "index".to_string();
|
||||
}
|
||||
|
||||
let mut filename = parsed.path_segments().map(|s| s.collect::<Vec<_>>().join("_")).unwrap_or_else(|| "index".to_string());
|
||||
filename = filename.replace(|c: char| !c.is_ascii_alphanumeric(), "_");
|
||||
|
||||
if filename.len() > 50 {
|
||||
filename = filename[..50].to_string();
|
||||
}
|
||||
|
||||
if filename.len() > 50 { filename = filename[..50].to_string(); }
|
||||
format!("{}.html", filename.trim_matches('_'))
|
||||
}
|
||||
|
||||
/// Program entry point: parses the CLI arguments, spawns the worker tasks, gathers their
/// results and writes each crawled page's HTML into a per-domain output directory.
// NOTE(review): this span contains two `main` signatures and several duplicated statements
// (e.g. two `client` bindings, two result-collection loops). It looks like the removed and
// added lines of a rendered diff, interleaved with `|` / `||||` separator rows — confirm
// against the real source file before relying on any single line.
#[tokio::main]
|
||||
async fn main() {
|
||||
println!("[🚀] Starting web crawler...");
|
||||
|
||||
let matches = App::new("Web Crawler")
|
||||
.version("1.0")
|
||||
.about("Multi-threaded web crawler with pattern matching and website storage")
|
||||
.arg(
|
||||
Arg::with_name("url")
|
||||
.help("Starting URL")
|
||||
.required(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("pattern")
|
||||
.short("p")
|
||||
.long("pattern")
|
||||
.help("Regex pattern to match in page text")
|
||||
.takes_value(true),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("concurrency")
|
||||
.short("c")
|
||||
.long("concurrency")
|
||||
.help("Max concurrent requests")
|
||||
.takes_value(true),
|
||||
)
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let matches = App::new("Web Crawler Pro")
|
||||
.arg(Arg::with_name("url").required(true))
|
||||
// NOTE(review): unlike the longer builder form above, this argument has neither
// `.takes_value(true)` nor a default value; presumably `value_of("pattern")` then never
// yields a value — verify against the clap 2 documentation.
.arg(Arg::with_name("pattern").short("p").long("pattern"))
|
||||
.arg(Arg::with_name("concurrency").short("c").long("concurrency").default_value("20"))
|
||||
.arg(Arg::with_name("max_pages").long("max-pages").default_value("1000"))
|
||||
.arg(Arg::with_name("max_depth").long("max-depth").default_value("3"))
|
||||
.arg(Arg::with_name("delay").long("delay-ms").default_value("100"))
|
||||
.get_matches();
|
||||
|
||||
let start_url = matches.value_of("url").unwrap().to_string();
|
||||
let pattern = matches.value_of("pattern");
|
||||
// Falls back to 20 both when the flag is absent and when its value does not parse.
let concurrency: usize = matches
|
||||
.value_of("concurrency")
|
||||
.unwrap_or("20")
|
||||
.parse()
|
||||
.unwrap_or(20);
|
||||
let config = CrawlConfig {
|
||||
// `?` returns the parse error from `main` when a value is not a valid number.
max_pages: matches.value_of("max_pages").unwrap().parse()?,
|
||||
max_depth: matches.value_of("max_depth").unwrap().parse()?,
|
||||
delay_ms: matches.value_of("delay").unwrap().parse()?,
|
||||
// Panics (via `unwrap`) when the supplied pattern is not a valid regex.
pattern_regex: matches.value_of("pattern").map(|p| Regex::new(p).unwrap()),
|
||||
};
|
||||
|
||||
let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex"));
|
||||
let client = Arc::new(Client::new());
|
||||
// Progress bar whose length is the configured page limit.
let pb = ProgressBar::new(config.max_pages as u64);
|
||||
pb.set_style(
|
||||
ProgressStyle::default_bar()
|
||||
.template("{spinner:.green} [{elapsed_precise}] [{wide_bar:.cyan/blue}] {pos}/{len} ({eta}) | {msg}")
|
||||
.unwrap()
|
||||
.progress_chars("#>-")
|
||||
);
|
||||
|
||||
let parsed_url = Url::parse(&start_url).expect("Invalid URL");
|
||||
// The output directory name is the host with dots replaced by underscores; the `unwrap`
// panics when `host_str()` returns `None`.
let domain = parsed_url.host_str().unwrap().replace('.', "_");
|
||||
// HTTP client with at most 20 idle pooled connections per host and a 5 second timeout.
let client = Arc::new(Client::builder()
|
||||
.pool_max_idle_per_host(20)
|
||||
.timeout(Duration::from_secs(5))
|
||||
.build()?);
|
||||
|
||||
let start_url = matches.value_of("url").unwrap();
|
||||
let domain = Url::parse(start_url)?.host_str().unwrap().replace('.', "_");
|
||||
let output_dir = format!("./{}", domain);
|
||||
std::fs::create_dir_all(&output_dir).expect("Failed to create output directory");
|
||||
println!("[📂] Created output directory: {}", output_dir);
|
||||
tokio::fs::create_dir_all(&output_dir).await?;
|
||||
|
||||
// Crawl state shared by all workers: visited set, work queue, in-flight counter and a
// `last_request` map (its key/value types are not visible in this span).
let visited = Arc::new(Mutex::new(HashSet::new()));
|
||||
let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url])));
|
||||
// Queue entries are `(url, depth)` pairs; the start URL is queued at depth 0.
let to_visit = Arc::new(Mutex::new(VecDeque::from([(start_url.to_string(), 0)])));
|
||||
let active_count = Arc::new(AtomicUsize::new(0));
|
||||
let last_request = Arc::new(Mutex::new(HashMap::new()));
|
||||
|
||||
let mut tasks = FuturesUnordered::new();
|
||||
|
||||
println!("[🔄] Initializing {} worker tasks...", concurrency);
|
||||
for _ in 0..concurrency {
|
||||
// One worker task is spawned per unit of the `concurrency` argument.
for _ in 0..matches.value_of("concurrency").unwrap().parse()? {
|
||||
tasks.push(tokio::spawn({
|
||||
// Per-worker clones of the shared handles, moved into the task below.
let client = client.clone();
|
||||
let visited = visited.clone();
|
||||
let to_visit = to_visit.clone();
|
||||
let pattern_regex = pattern_regex.clone();
|
||||
let active_count = active_count.clone();
|
||||
let last_request = last_request.clone();
|
||||
let config = config.clone();
|
||||
let pb = pb.clone();
|
||||
|
||||
tasks.push(tokio::spawn(async move {
|
||||
let mut results = vec![];
|
||||
async move {
|
||||
let mut results = Vec::new();
|
||||
loop {
|
||||
// The queue guard lives only inside this block, so it is dropped before the
// crawl below is awaited.
let next_url = {
|
||||
let mut queue = to_visit.lock().unwrap();
|
||||
let mut queue = to_visit.lock().await;
|
||||
queue.pop_front()
|
||||
};
|
||||
|
||||
if let Some(url) = next_url {
|
||||
println!("[📥] Processing: {}", url);
|
||||
// NOTE(review): this check runs after `pop_front`. If the popped entry was the last one
// and `active_count` is still 0, the worker would break without crawling it — verify
// where `active_count` is incremented.
if active_count.load(Ordering::SeqCst) == 0 && to_visit.lock().await.is_empty() {
|
||||
pb.finish_and_clear();
|
||||
break;
|
||||
}
|
||||
|
||||
match next_url {
|
||||
Some((url, depth)) => {
|
||||
// `results` is this worker's own vector, so the page limit is checked per worker here.
if results.len() >= config.max_pages {
|
||||
break;
|
||||
}
|
||||
|
||||
if let Some(result) = crawl_url(
|
||||
url,
|
||||
client.clone(),
|
||||
visited.clone(),
|
||||
to_visit.clone(),
|
||||
pattern_regex.clone(),
|
||||
active_count.clone(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
last_request.clone(),
|
||||
config.clone(),
|
||||
).await {
|
||||
// Shadows the `visited` handle with the current size of the visited set, which is
// then used as the progress-bar position.
let visited = visited.lock().await.len();
|
||||
pb.set_position(visited as u64);
|
||||
pb.set_message(format!(
|
||||
"Active: {} | Queued: {} | Depth: {}",
|
||||
active_count.load(Ordering::SeqCst),
|
||||
to_visit.lock().await.len(),
|
||||
depth
|
||||
));
|
||||
results.push(result);
|
||||
}
|
||||
} else {
|
||||
let active = active_count.load(Ordering::SeqCst);
|
||||
if active == 0 {
|
||||
println!("[🛑] Worker detected completion");
|
||||
}
|
||||
None => {
|
||||
// Queue is empty: exit when no crawl is in flight, otherwise poll again after 100 ms.
if active_count.load(Ordering::SeqCst) == 0 {
|
||||
break;
|
||||
}
|
||||
println!("[⏳] Worker waiting (active: {})...", active);
|
||||
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
|
||||
tokio::time::sleep(Duration::from_millis(100)).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
results
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
println!("[📊] Crawling in progress...");
|
||||
let mut all_results = vec![];
|
||||
while let Some(result_set) = tasks.next().await {
|
||||
match result_set {
|
||||
Ok(results) => {
|
||||
println!("[✔️] Worker completed with {} results", results.len());
|
||||
all_results.extend(results);
|
||||
let mut all_results = Vec::new();
|
||||
// Gather each worker's result vector as its task finishes.
while let Some(result) = tasks.next().await {
|
||||
match result {
|
||||
Ok(mut res) => {
|
||||
all_results.append(&mut res);
|
||||
// Stop gathering once the combined results reach the page limit; `all_results` may
// still exceed `max_pages` because a whole worker batch is appended before this check.
if all_results.len() >= config.max_pages {
|
||||
break;
|
||||
}
|
||||
Err(e) => println!("[⚠️] Worker error: {}", e),
|
||||
}
|
||||
Err(e) => eprintln!("[⚠️] Task error: {}", e),
|
||||
}
|
||||
}
|
||||
|
||||
println!("\n[🏁] Crawling completed!");
|
||||
println!("[📋] Total pages crawled: {}", all_results.len());
|
||||
println!("[💾] Saving results to {}...", output_dir);
|
||||
|
||||
for res in all_results {
|
||||
let filename = sanitize_filename(&res.url);
|
||||
let file_path = Path::new(&output_dir).join(filename);
|
||||
|
||||
tokio::fs::write(&file_path, &res.html)
|
||||
.await
|
||||
.unwrap_or_else(|_| panic!("Failed to write: {}", file_path.display()));
|
||||
|
||||
println!("[💾] Saved: {}", file_path.display());
|
||||
pb.finish_with_message("Saving files...");
|
||||
// Each page is written to `<output_dir>/<sanitized name>`; pages whose URLs sanitise to
// the same name are written to the same path.
for res in all_results.iter() { // Use iter() to borrow instead of move
|
||||
let path = Path::new(&output_dir).join(sanitize_filename(&res.url));
|
||||
tokio::fs::write(&path, &res.html).await?; // Add reference here too
|
||||
}
|
||||
|
||||
println!("[✅] Done! Website stored in: {}", output_dir);
|
||||
println!("[✅] Crawled {} pages to {}", all_results.len(), output_dir);
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user