From ce85f8bdba078787acdd7395eb4dda74b75a282f Mon Sep 17 00:00:00 2001 From: rattatwinko Date: Fri, 2 May 2025 18:39:57 +0200 Subject: [PATCH] dont work very well fix in future --- Cargo.lock | 394 ++++++++++++++++++++++++++-------------------------- Cargo.toml | 17 +-- src/main.rs | 356 +++++++++++++++++++++++------------------------ 3 files changed, 382 insertions(+), 385 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1d3c04c..260f248 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -17,19 +17,6 @@ version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" -[[package]] -name = "ahash" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" -dependencies = [ - "cfg-if", - "getrandom 0.2.16", - "once_cell", - "version_check", - "zerocopy 0.7.35", -] - [[package]] name = "aho-corasick" version = "1.1.3" @@ -146,16 +133,6 @@ dependencies = [ "vec_map", ] -[[package]] -name = "colored" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" -dependencies = [ - "lazy_static", - "windows-sys 0.59.0", -] - [[package]] name = "console" version = "0.15.11" @@ -169,6 +146,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + [[package]] name = "core-foundation" version = "0.9.4" @@ -190,31 +173,55 @@ name = "crawler" version = "0.1.0" dependencies = [ "clap", - "colored", - "csv", "futures", "indicatif", "regex", "reqwest", "scraper", - "serde", - "serde_json", - "structopt", "tokio", "url", ] [[package]] -name = "cssparser" -version = "0.31.2" +name = "crossbeam-deque" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "cssparser" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a" dependencies = [ "cssparser-macros", "dtoa-short", - "itoa", + "itoa 0.4.8", + "matches", "phf", + "proc-macro2", + "quote", "smallvec", + "syn 1.0.109", ] [[package]] @@ -227,35 +234,16 @@ dependencies = [ "syn 2.0.101", ] -[[package]] -name = "csv" -version = "1.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" -dependencies = [ - "csv-core", - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "csv-core" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" -dependencies = [ - "memchr", -] - [[package]] name = "derive_more" version = "0.99.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f" dependencies = [ + "convert_case", "proc-macro2", "quote", + "rustc_version", "syn 2.0.101", ] @@ -291,6 +279,12 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642" +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + [[package]] name = "encode_unicode" version = "1.0.0" @@ -477,13 +471,13 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.16" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi 0.9.0+wasi-snapshot-preview1", ] [[package]] @@ -529,15 +523,6 @@ version = "0.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" -[[package]] -name = "heck" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" -dependencies = [ - "unicode-segmentation", -] - [[package]] name = "hermit-abi" version = "0.1.19" @@ -549,9 +534,9 @@ dependencies = [ [[package]] name = "html5ever" -version = "0.26.0" +version = "0.25.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +checksum = "e5c13fb08e5d4dfc151ee5e88bae63f7773d61852f3bdc73c9f4b9e1bde03148" dependencies = [ "log", "mac", @@ -569,7 +554,7 @@ checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" dependencies = [ "bytes", "fnv", - "itoa", + "itoa 1.0.15", ] [[package]] @@ -610,7 +595,7 @@ dependencies = [ "http-body", "httparse", "httpdate", - "itoa", + "itoa 1.0.15", "pin-project-lite", "socket2", "tokio", @@ -790,6 +775,7 @@ dependencies = [ "console", "number_prefix", "portable-atomic", + "rayon", "unicode-width 0.2.0", "web-time", ] @@ -800,6 +786,12 @@ version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + [[package]] name = "itoa" version = "1.0.15" @@ -816,12 +808,6 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - [[package]] name = "libc" version = "0.2.172" @@ -864,9 +850,9 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "markup5ever" -version = "0.11.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +checksum = "a24f40fb03852d1cdd84330cddcaf98e9ec08a7b7768e952fad3b4cf048ec8fd" dependencies = [ "log", "phf", @@ -876,6 +862,12 @@ dependencies = [ "tendril", ] +[[package]] +name = "matches" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" + [[package]] name = "memchr" version = "2.7.4" @@ -931,6 +923,12 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" +[[package]] +name = "nodrop" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" + [[package]] name = "number_prefix" version = "0.4.0" @@ -1027,33 +1025,33 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "phf" -version = "0.10.1" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" dependencies = [ "phf_macros", - "phf_shared 0.10.0", + "phf_shared 0.8.0", "proc-macro-hack", ] [[package]] name = "phf_codegen" -version = "0.10.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", + "phf_generator 0.8.0", + "phf_shared 0.8.0", ] [[package]] name = "phf_generator" -version = "0.10.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" dependencies = [ - "phf_shared 0.10.0", - "rand", + "phf_shared 0.8.0", + "rand 0.7.3", ] [[package]] @@ -1063,17 +1061,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" dependencies = [ "phf_shared 0.11.3", - "rand", + "rand 0.8.5", ] [[package]] name = "phf_macros" -version = "0.10.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0" +checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c" dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", + "phf_generator 0.8.0", + "phf_shared 0.8.0", "proc-macro-hack", "proc-macro2", "quote", @@ -1082,9 +1080,9 @@ dependencies = [ [[package]] name = "phf_shared" -version = "0.10.0" +version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" dependencies = [ "siphasher 0.3.11", ] @@ -1128,7 +1126,7 @@ version = "0.2.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" dependencies = [ - "zerocopy 0.8.25", + "zerocopy", ] [[package]] @@ -1137,30 +1135,6 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.109", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - [[package]] name = "proc-macro-hack" version = "0.5.20+deprecated" @@ -1191,25 +1165,46 @@ version = "5.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha", + "rand_core 0.5.1", + "rand_hc", + "rand_pcg", +] + [[package]] name = "rand" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" dependencies = [ - "libc", - "rand_chacha", - "rand_core", + "rand_core 0.6.4", ] [[package]] name = "rand_chacha" -version = "0.3.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", ] [[package]] @@ -1217,8 +1212,43 @@ name = "rand_core" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" dependencies = [ - "getrandom 0.2.16", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", ] [[package]] @@ -1305,6 +1335,15 @@ version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "1.0.7" @@ -1356,16 +1395,15 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scraper" -version = "0.17.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c95a930e03325234c18c7071fd2b60118307e025d6fff3e12745ffbf63a3d29c" +checksum = "48e02aa790c80c2e494130dec6a522033b6a23603ffc06360e9fe6c611ea2c12" dependencies = [ - "ahash", "cssparser", "ego-tree", "getopts", "html5ever", - "once_cell", + "matches", "selectors", "smallvec", "tendril", @@ -1396,23 +1434,30 @@ dependencies = [ [[package]] name = "selectors" -version = "0.25.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" +checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe" dependencies = [ - "bitflags 2.9.0", + "bitflags 1.3.2", "cssparser", "derive_more", "fxhash", "log", - "new_debug_unreachable", + "matches", "phf", "phf_codegen", "precomputed-hash", "servo_arc", "smallvec", + "thin-slice", ] +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" + [[package]] name = "serde" version = "1.0.219" @@ -1439,7 +1484,7 @@ version = "1.0.140" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" dependencies = [ - "itoa", + "itoa 1.0.15", "memchr", "ryu", "serde", @@ -1452,17 +1497,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" dependencies = [ "form_urlencoded", - "itoa", + "itoa 1.0.15", "ryu", "serde", ] [[package]] name = "servo_arc" -version = "0.3.0" +version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" +checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432" dependencies = [ + "nodrop", "stable_deref_trait", ] @@ -1555,30 +1601,6 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" -[[package]] -name = "structopt" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" -dependencies = [ - "clap", - "lazy_static", - "structopt-derive", -] - -[[package]] -name = "structopt-derive" -version = "0.4.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" -dependencies = [ - "heck", - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.109", -] - [[package]] name = "syn" version = "1.0.109" @@ -1672,6 +1694,12 @@ dependencies = [ "unicode-width 0.1.14", ] +[[package]] +name = "thin-slice" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" + [[package]] name = "tinystr" version = "0.7.6" @@ -1771,12 +1799,6 @@ version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" -[[package]] -name = "unicode-segmentation" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" - [[package]] name = "unicode-width" version = "0.1.14" @@ -1830,12 +1852,6 @@ version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - [[package]] name = "want" version = "0.3.1" @@ -1845,6 +1861,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -2176,33 +2198,13 @@ dependencies = [ "synstructure", ] -[[package]] -name = "zerocopy" -version = "0.7.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" -dependencies = [ - "zerocopy-derive 0.7.35", -] - [[package]] name = "zerocopy" version = "0.8.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" dependencies = [ - "zerocopy-derive 0.8.25", -] - -[[package]] -name = "zerocopy-derive" -version = "0.7.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.101", + "zerocopy-derive", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 119c161..6a743ac 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,16 +4,11 @@ version = "0.1.0" edition = "2024" [dependencies] -tokio = { version = "1.28", features = ["full"] } +indicatif = { version = "0.17", features = ["rayon"] } +tokio = { version = "1.0", features = ["full"] } reqwest = { version = "0.11", features = ["json"] } +scraper = "0.12" +regex = "1.5" +url = "2.2" +clap = "2.33" futures = "0.3" -scraper = "0.17" -structopt = "0.3" -indicatif = "0.17" -colored = "2.0" -regex = "1.10" -url = "2.4" -serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" -csv = "1.2" -clap = "2.34.0" \ No newline at end of file diff --git a/src/main.rs b/src/main.rs index e4bae70..6bc8e4c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,14 +1,16 @@ -use std::collections::{HashSet, VecDeque}; +use std::collections::{HashMap, HashSet, VecDeque}; use std::path::Path; -use std::sync::{Arc, Mutex}; use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::{Duration, Instant}; use clap::{App, Arg}; use futures::stream::{FuturesUnordered, StreamExt}; +use indicatif::{ProgressBar, ProgressStyle}; use regex::Regex; use reqwest::Client; use scraper::{Html, Selector}; -use tokio; +use tokio::sync::Mutex; use url::Url; #[derive(Debug, Clone)] @@ -22,251 +24,249 @@ struct CrawlResult { pattern_matches: Vec, } +#[derive(Clone)] +struct CrawlConfig { + max_pages: usize, + max_depth: usize, + delay_ms: u64, + pattern_regex: Option, +} + async fn crawl_url( url: String, client: Arc, visited: Arc>>, - to_visit: Arc>>, - pattern_regex: Option, + to_visit: Arc>>, active_count: Arc, + last_request: Arc>>, + config: CrawlConfig, ) -> Option { - println!("[🌐] Attempting: {}", url); + let domain = match Url::parse(&url).ok()?.host_str() { + Some(d) => d.to_string(), + None => return None, + }; - if { - let mut visited = visited.lock().unwrap(); - if visited.contains(&url) { - println!("[⏭️] Already visited: {}", url); + { + let mut last_req = last_request.lock().await; + if let Some(last) = last_req.get(&domain) { + let elapsed = last.elapsed().as_millis() as u64; + if elapsed < config.delay_ms { + tokio::time::sleep(Duration::from_millis(config.delay_ms - elapsed)).await; + } + } + last_req.insert(domain, Instant::now()); + } + + { + let mut visited_set = visited.lock().await; + if visited_set.contains(&url) { return None; } - visited.insert(url.clone()); - active_count.fetch_add(1, Ordering::SeqCst); - true - } == false { + visited_set.insert(url.clone()); + } + + let current_depth = { + let queue = to_visit.lock().await; + queue.iter().find(|(u, _)| u == &url).map(|(_, d)| *d).unwrap_or(0) + }; + + if current_depth > config.max_depth { return None; } - let res = match client.get(&url).send().await { - Ok(response) => { - println!("[✅] Success: {}", url); - response.text().await.ok()? - } + active_count.fetch_add(1, Ordering::SeqCst); + + let res = match client.get(&url).timeout(Duration::from_secs(5)).send().await { + Ok(response) => match response.text().await { + Ok(text) => text, + Err(e) => { + println!("[❌] Failed to read text: {} - {}", url, e); + return None; + } + }, Err(e) => { - println!("[❌] Failed: {} - {}", url, e); + println!("[❌] Failed to fetch: {} - {}", url, e); return None; } }; let html_content = res.clone(); let url_clone = url.clone(); - let pattern_clone = pattern_regex.clone(); + let pattern_clone = config.pattern_regex.clone(); - println!("[🔍] Processing: {}", url); let (title, text, links, images, pattern_matches) = tokio::task::spawn_blocking(move || { let document = Html::parse_document(&res); + let title = document.select(&Selector::parse("title").unwrap()).next().map(|e| e.inner_html()); + let text = document.select(&Selector::parse("body").unwrap()).next().map(|e| e.text().collect::()); - let title_selector = Selector::parse("title").unwrap(); - let title = document - .select(&title_selector) - .next() - .map(|e| e.inner_html()); - - let body_selector = Selector::parse("body").unwrap(); - let text = document - .select(&body_selector) - .next() - .map(|e| e.text().collect::>().join(" ")); - - let link_selector = Selector::parse("a[href]").unwrap(); - let links: Vec = document - .select(&link_selector) + let links: Vec<_> = document.select(&Selector::parse("a[href]").unwrap()) .filter_map(|e| e.value().attr("href")) - .map(|s| s.to_string()) + .filter_map(|s| Url::parse(s) + .or_else(|_| Url::parse(&format!("{}/{}", url_clone, s.trim_start_matches('/')))) + .ok()) + .map(|u| u.to_string()) .collect(); - let img_selector = Selector::parse("img[src]").unwrap(); - let images: Vec = document - .select(&img_selector) + let images = document.select(&Selector::parse("img[src]").unwrap()) .filter_map(|e| e.value().attr("src")) .map(|s| s.to_string()) .collect(); - let pattern_matches = if let (Some(regex), Some(text)) = (&pattern_clone, &text) { - regex - .captures_iter(text) - .filter_map(|cap| cap.get(0).map(|m| m.as_str().to_string())) - .collect() - } else { - vec![] - }; + let pattern_matches = pattern_clone.as_ref().zip(text.as_ref()) + .map(|(re, txt)| re.captures_iter(txt).filter_map(|c| c.get(0).map(|m| m.as_str().to_string())).collect()) + .unwrap_or_default(); (title, text, links, images, pattern_matches) - }) - .await - .ok()?; + }).await.ok()?; { - let mut queue = to_visit.lock().unwrap(); - let mut new_links = 0; - for link in &links { - if link.starts_with("http") { - queue.push_back(link.clone()); - new_links += 1; + let mut queue = to_visit.lock().await; + let visited = visited.lock().await; // Lock once here + + for link in links.iter() { + if link.starts_with("http") && !visited.contains(link) { + queue.push_back((link.clone(), current_depth + 1)); } } - println!("[🔄] Discovered {} new links from {}", new_links, url); - } + } // Locks released here + active_count.fetch_sub(1, Ordering::SeqCst); - - Some(CrawlResult { - url: url_clone, - html: html_content, - title, - text, - links, - images, - pattern_matches, - }) + Some(CrawlResult { url, html: html_content, title, text, links, images, pattern_matches }) } fn sanitize_filename(url: &str) -> String { let parsed = Url::parse(url).unwrap(); - let mut filename = parsed.path().replace('/', "__"); - - if filename.is_empty() || filename == "__" { - filename = "index".to_string(); - } - + let mut filename = parsed.path_segments().map(|s| s.collect::>().join("_")).unwrap_or_else(|| "index".to_string()); filename = filename.replace(|c: char| !c.is_ascii_alphanumeric(), "_"); - - if filename.len() > 50 { - filename = filename[..50].to_string(); - } - + if filename.len() > 50 { filename = filename[..50].to_string(); } format!("{}.html", filename.trim_matches('_')) } #[tokio::main] -async fn main() { - println!("[🚀] Starting web crawler..."); - - let matches = App::new("Web Crawler") - .version("1.0") - .about("Multi-threaded web crawler with pattern matching and website storage") - .arg( - Arg::with_name("url") - .help("Starting URL") - .required(true), - ) - .arg( - Arg::with_name("pattern") - .short("p") - .long("pattern") - .help("Regex pattern to match in page text") - .takes_value(true), - ) - .arg( - Arg::with_name("concurrency") - .short("c") - .long("concurrency") - .help("Max concurrent requests") - .takes_value(true), - ) +async fn main() -> Result<(), Box> { + let matches = App::new("Web Crawler Pro") + .arg(Arg::with_name("url").required(true)) + .arg(Arg::with_name("pattern").short("p").long("pattern")) + .arg(Arg::with_name("concurrency").short("c").long("concurrency").default_value("20")) + .arg(Arg::with_name("max_pages").long("max-pages").default_value("1000")) + .arg(Arg::with_name("max_depth").long("max-depth").default_value("3")) + .arg(Arg::with_name("delay").long("delay-ms").default_value("100")) .get_matches(); - let start_url = matches.value_of("url").unwrap().to_string(); - let pattern = matches.value_of("pattern"); - let concurrency: usize = matches - .value_of("concurrency") - .unwrap_or("20") - .parse() - .unwrap_or(20); + let config = CrawlConfig { + max_pages: matches.value_of("max_pages").unwrap().parse()?, + max_depth: matches.value_of("max_depth").unwrap().parse()?, + delay_ms: matches.value_of("delay").unwrap().parse()?, + pattern_regex: matches.value_of("pattern").map(|p| Regex::new(p).unwrap()), + }; - let pattern_regex = pattern.map(|p| Regex::new(p).expect("Invalid regex")); - let client = Arc::new(Client::new()); + let pb = ProgressBar::new(config.max_pages as u64); + pb.set_style( + ProgressStyle::default_bar() + .template("{spinner:.green} [{elapsed_precise}] [{wide_bar:.cyan/blue}] {pos}/{len} ({eta}) | {msg}") + .unwrap() + .progress_chars("#>-") + ); - let parsed_url = Url::parse(&start_url).expect("Invalid URL"); - let domain = parsed_url.host_str().unwrap().replace('.', "_"); + let client = Arc::new(Client::builder() + .pool_max_idle_per_host(20) + .timeout(Duration::from_secs(5)) + .build()?); + + let start_url = matches.value_of("url").unwrap(); + let domain = Url::parse(start_url)?.host_str().unwrap().replace('.', "_"); let output_dir = format!("./{}", domain); - std::fs::create_dir_all(&output_dir).expect("Failed to create output directory"); - println!("[📂] Created output directory: {}", output_dir); + tokio::fs::create_dir_all(&output_dir).await?; let visited = Arc::new(Mutex::new(HashSet::new())); - let to_visit = Arc::new(Mutex::new(VecDeque::from([start_url]))); + let to_visit = Arc::new(Mutex::new(VecDeque::from([(start_url.to_string(), 0)]))); let active_count = Arc::new(AtomicUsize::new(0)); + let last_request = Arc::new(Mutex::new(HashMap::new())); let mut tasks = FuturesUnordered::new(); + for _ in 0..matches.value_of("concurrency").unwrap().parse()? { + tasks.push(tokio::spawn({ + let client = client.clone(); + let visited = visited.clone(); + let to_visit = to_visit.clone(); + let active_count = active_count.clone(); + let last_request = last_request.clone(); + let config = config.clone(); + let pb = pb.clone(); - println!("[🔄] Initializing {} worker tasks...", concurrency); - for _ in 0..concurrency { - let client = client.clone(); - let visited = visited.clone(); - let to_visit = to_visit.clone(); - let pattern_regex = pattern_regex.clone(); - let active_count = active_count.clone(); + async move { + let mut results = Vec::new(); + loop { + let next_url = { + let mut queue = to_visit.lock().await; + queue.pop_front() + }; - tasks.push(tokio::spawn(async move { - let mut results = vec![]; - loop { - let next_url = { - let mut queue = to_visit.lock().unwrap(); - queue.pop_front() - }; - - if let Some(url) = next_url { - println!("[📥] Processing: {}", url); - if let Some(result) = crawl_url( - url, - client.clone(), - visited.clone(), - to_visit.clone(), - pattern_regex.clone(), - active_count.clone(), - ) - .await - { - results.push(result); - } - } else { - let active = active_count.load(Ordering::SeqCst); - if active == 0 { - println!("[🛑] Worker detected completion"); + if active_count.load(Ordering::SeqCst) == 0 && to_visit.lock().await.is_empty() { + pb.finish_and_clear(); break; } - println!("[⏳] Worker waiting (active: {})...", active); - tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + match next_url { + Some((url, depth)) => { + if results.len() >= config.max_pages { + break; + } + + if let Some(result) = crawl_url( + url, + client.clone(), + visited.clone(), + to_visit.clone(), + active_count.clone(), + last_request.clone(), + config.clone(), + ).await { + let visited = visited.lock().await.len(); + pb.set_position(visited as u64); + pb.set_message(format!( + "Active: {} | Queued: {} | Depth: {}", + active_count.load(Ordering::SeqCst), + to_visit.lock().await.len(), + depth + )); + results.push(result); + } + } + None => { + if active_count.load(Ordering::SeqCst) == 0 { + break; + } + tokio::time::sleep(Duration::from_millis(100)).await; + } + } } + results } - results })); } - println!("[📊] Crawling in progress..."); - let mut all_results = vec![]; - while let Some(result_set) = tasks.next().await { - match result_set { - Ok(results) => { - println!("[✔️] Worker completed with {} results", results.len()); - all_results.extend(results); + let mut all_results = Vec::new(); + while let Some(result) = tasks.next().await { + match result { + Ok(mut res) => { + all_results.append(&mut res); + if all_results.len() >= config.max_pages { + break; + } } - Err(e) => println!("[⚠️] Worker error: {}", e), + Err(e) => eprintln!("[⚠️] Task error: {}", e), } } - println!("\n[🏁] Crawling completed!"); - println!("[📋] Total pages crawled: {}", all_results.len()); - println!("[💾] Saving results to {}...", output_dir); - for res in all_results { - let filename = sanitize_filename(&res.url); - let file_path = Path::new(&output_dir).join(filename); - - tokio::fs::write(&file_path, &res.html) - .await - .unwrap_or_else(|_| panic!("Failed to write: {}", file_path.display())); - - println!("[💾] Saved: {}", file_path.display()); + pb.finish_with_message("Saving files..."); + for res in all_results.iter() { // Use iter() to borrow instead of move + let path = Path::new(&output_dir).join(sanitize_filename(&res.url)); + tokio::fs::write(&path, &res.html).await?; // Add reference here too } - println!("[✅] Done! Website stored in: {}", output_dir); -} + println!("[✅] Crawled {} pages to {}", all_results.len(), output_dir); + Ok(()) +} \ No newline at end of file