diff --git a/.gitignore b/.gitignore index 8f2ee4b1..df964c45 100644 --- a/.gitignore +++ b/.gitignore @@ -72,3 +72,6 @@ target/ # ignore uv files uv.lock + +# ingore Cargo.lock +Cargo.lock diff --git a/Cargo.lock b/Cargo.lock deleted file mode 100644 index e25c2a68..00000000 --- a/Cargo.lock +++ /dev/null @@ -1,3485 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. -version = 4 - -[[package]] -name = "adblock" -version = "0.8.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80e6cf097ea9bb36bd04e1af44da71aaa336d2a3fb0515504f37ac297486ff5b" -dependencies = [ - "addr", - "base64 0.13.1", - "bitflags 1.3.2", - "idna 0.2.3", - "itertools 0.10.5", - "lifeguard", - "memchr", - "once_cell", - "percent-encoding", - "regex", - "rmp-serde", - "seahash", - "serde", - "serde_json", - "thiserror", - "url", -] - -[[package]] -name = "addr" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c54ccac949a2afafdfc889e15c753bbc6ee8783e026bbe3d057b00b13907db70" -dependencies = [ - "psl", - "psl-types", -] - -[[package]] -name = "addr2line" -version = "0.21.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" - -[[package]] -name = "ahash" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" -dependencies = [ - "cfg-if", - "getrandom", - "once_cell", - "version_check", - "zerocopy", -] - -[[package]] -name = "aho-corasick" -version = "1.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" -dependencies = [ - "memchr", -] - -[[package]] -name = "allocator-api2" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" - -[[package]] -name = "ansi_term" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" -dependencies = [ - "winapi", -] - -[[package]] -name = "anstream" -version = "0.6.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d96bd03f33fe50a863e394ee9718a706f988b9079b20c3784fb726e7678b62fb" -dependencies = [ - "anstyle", - "anstyle-parse", - "anstyle-query", - "anstyle-wincon", - "colorchoice", - "utf8parse", -] - -[[package]] -name = "anstyle" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8901269c6307e8d93993578286ac0edf7f195079ffff5ebdeea6a59ffb7e36bc" - -[[package]] -name = "anstyle-parse" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c75ac65da39e5fe5ab759307499ddad880d724eed2f6ce5b5e8a26f4f387928c" -dependencies = [ - "utf8parse", -] - -[[package]] -name = "anstyle-query" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e28923312444cdd728e4738b3f9c9cac739500909bb3d3c94b43551b16517648" -dependencies = [ - "windows-sys 0.52.0", -] - -[[package]] -name = "anstyle-wincon" -version = "3.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd54b81ec8d6180e24654d0b371ad22fc3dd083b6ff8ba325b72e00c87660a7" -dependencies = [ - "anstyle", - "windows-sys 0.52.0", -] - -[[package]] -name = "anyhow" -version = "1.0.81" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0952808a6c2afd1aa8947271f3a60f1a6763c7b912d210184c5149b5cf147247" - -[[package]] -name = "atomic-traits" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b29ec3788e96fb4fdb275ccb9d62811f2fa903d76c5eb4dd6fe7d09a7ed5871f" -dependencies = [ - "cfg-if", - "rustc_version 0.3.3", -] - -[[package]] -name = "atty" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" -dependencies = [ - "hermit-abi 0.1.19", - "libc", - "winapi", -] - -[[package]] -name = "autocfg" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" - -[[package]] -name = "aws-config" -version = "1.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48730d0b4c3d91c43d0d37168831d9fd0e065ad4a889a2ee9faf8d34c3d2804d" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-sdk-sso", - "aws-sdk-ssooidc", - "aws-sdk-sts", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand", - "hex", - "http 0.2.12", - "hyper", - "ring", - "time", - "tokio", - "tracing", - "url", - "zeroize", -] - -[[package]] -name = "aws-credential-types" -version = "1.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa8587ae17c8e967e4b05a62d495be2fb7701bec52a97f7acfe8a29f938384c8" -dependencies = [ - "aws-smithy-async", - "aws-smithy-runtime-api", - "aws-smithy-types", - "zeroize", -] - -[[package]] -name = "aws-runtime" -version = "1.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4ee6903f9d0197510eb6b44c4d86b493011d08b4992938f7b9be0333b6685aa" -dependencies = [ - "aws-credential-types", - "aws-sigv4", - "aws-smithy-async", - 
"aws-smithy-eventstream", - "aws-smithy-http", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "fastrand", - "http 0.2.12", - "http-body 0.4.6", - "percent-encoding", - "pin-project-lite", - "tracing", - "uuid", -] - -[[package]] -name = "aws-sdk-s3" -version = "1.22.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "644c5939c1b78097d37f3341708978d68490070d4b0f8fa91f0878678c06a7ef" -dependencies = [ - "ahash", - "aws-credential-types", - "aws-runtime", - "aws-sigv4", - "aws-smithy-async", - "aws-smithy-checksums", - "aws-smithy-eventstream", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-smithy-xml", - "aws-types", - "bytes", - "fastrand", - "hex", - "hmac", - "http 0.2.12", - "http-body 0.4.6", - "lru", - "once_cell", - "percent-encoding", - "regex-lite", - "sha2", - "tracing", - "url", -] - -[[package]] -name = "aws-sdk-sso" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2be5ba83b077b67a6f7a1927eb6b212bf556e33bd74b5eaa5aa6e421910803a" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "http 0.2.12", - "once_cell", - "regex-lite", - "tracing", -] - -[[package]] -name = "aws-sdk-ssooidc" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "022ca669825f841aef17b12d4354ef2b8651e4664be49f2d9ea13e4062a80c9f" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-types", - "bytes", - "http 0.2.12", - "once_cell", - "regex-lite", - "tracing", -] - -[[package]] -name = "aws-sdk-sts" -version = "1.19.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e4a5f5cb007347c1ab34a6d56456301dfada921fc9e57d687ecb08baddd11ff" -dependencies = [ - "aws-credential-types", - "aws-runtime", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-json", - "aws-smithy-query", - "aws-smithy-runtime", - "aws-smithy-runtime-api", - "aws-smithy-types", - "aws-smithy-xml", - "aws-types", - "http 0.2.12", - "once_cell", - "regex-lite", - "tracing", -] - -[[package]] -name = "aws-sigv4" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d6f29688a4be9895c0ba8bef861ad0c0dac5c15e9618b9b7a6c233990fc263" -dependencies = [ - "aws-credential-types", - "aws-smithy-eventstream", - "aws-smithy-http", - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "crypto-bigint 0.5.5", - "form_urlencoded", - "hex", - "hmac", - "http 0.2.12", - "http 1.1.0", - "once_cell", - "p256", - "percent-encoding", - "ring", - "sha2", - "subtle", - "time", - "tracing", - "zeroize", -] - -[[package]] -name = "aws-smithy-async" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" -dependencies = [ - "futures-util", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "aws-smithy-checksums" -version = "0.60.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fa43bc04a6b2441968faeab56e68da3812f978a670a5db32accbdcafddd12f" -dependencies = [ - "aws-smithy-http", - "aws-smithy-types", - "bytes", - "crc32c", - "crc32fast", - "hex", - "http 0.2.12", - "http-body 0.4.6", - "md-5", - "pin-project-lite", - "sha1", - "sha2", - "tracing", -] - -[[package]] -name = "aws-smithy-eventstream" -version = "0.60.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6363078f927f612b970edf9d1903ef5cef9a64d1e8423525ebb1f0a1633c858" -dependencies = [ - "aws-smithy-types", - "bytes", - 
"crc32fast", -] - -[[package]] -name = "aws-smithy-http" -version = "0.60.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f10fa66956f01540051b0aa7ad54574640f748f9839e843442d99b970d3aff9" -dependencies = [ - "aws-smithy-eventstream", - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "bytes-utils", - "futures-core", - "http 0.2.12", - "http-body 0.4.6", - "once_cell", - "percent-encoding", - "pin-project-lite", - "pin-utils", - "tracing", -] - -[[package]] -name = "aws-smithy-json" -version = "0.60.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" -dependencies = [ - "aws-smithy-types", -] - -[[package]] -name = "aws-smithy-query" -version = "0.60.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" -dependencies = [ - "aws-smithy-types", - "urlencoding", -] - -[[package]] -name = "aws-smithy-runtime" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c53572b4cd934ee5e8461ad53caa36e9d246aaef42166e3ac539e206a925d330" -dependencies = [ - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-runtime-api", - "aws-smithy-types", - "bytes", - "fastrand", - "h2", - "http 0.2.12", - "http-body 0.4.6", - "http-body 1.0.0", - "hyper", - "hyper-rustls", - "once_cell", - "pin-project-lite", - "pin-utils", - "rustls 0.21.12", - "tokio", - "tracing", -] - -[[package]] -name = "aws-smithy-runtime-api" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccb2b3a7030dc9a3c9a08ce0b25decea5130e9db19619d4dffbbff34f75fe850" -dependencies = [ - "aws-smithy-async", - "aws-smithy-types", - "bytes", - "http 0.2.12", - "http 1.1.0", - "pin-project-lite", - "tokio", - "tracing", - "zeroize", -] - -[[package]] -name = "aws-smithy-types" -version = "1.1.8" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "abe14dceea1e70101d38fbf2a99e6a34159477c0fb95e68e05c66bd7ae4c3729" -dependencies = [ - "base64-simd", - "bytes", - "bytes-utils", - "futures-core", - "http 0.2.12", - "http 1.1.0", - "http-body 0.4.6", - "http-body 1.0.0", - "http-body-util", - "itoa", - "num-integer", - "pin-project-lite", - "pin-utils", - "ryu", - "serde", - "time", - "tokio", - "tokio-util", -] - -[[package]] -name = "aws-smithy-xml" -version = "0.60.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "872c68cf019c0e4afc5de7753c4f7288ce4b71663212771bf5e4542eb9346ca9" -dependencies = [ - "xmlparser", -] - -[[package]] -name = "aws-types" -version = "1.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "afb278e322f16f59630a83b6b2dc992a0b48aa74ed47b4130f193fae0053d713" -dependencies = [ - "aws-credential-types", - "aws-smithy-async", - "aws-smithy-runtime-api", - "aws-smithy-types", - "http 0.2.12", - "rustc_version 0.4.0", - "tracing", -] - -[[package]] -name = "backtrace" -version = "0.3.71" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26b05800d2e817c8b3b4b54abd461726265fa9789ae34330622f2db9ee696f9d" -dependencies = [ - "addr2line", - "cc", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", -] - -[[package]] -name = "base16ct" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" - -[[package]] -name = "base64" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" - -[[package]] -name = "base64" -version = "0.21.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" - -[[package]] -name = "base64-simd" 
-version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" -dependencies = [ - "outref", - "vsimd", -] - -[[package]] -name = "base64ct" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" - -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" - -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - -[[package]] -name = "bytes" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" - -[[package]] -name = "bytes-utils" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" -dependencies = [ - "bytes", - "either", -] - -[[package]] -name = "cc" -version = "1.0.92" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2678b2e3449475e95b0aa6f9b506a28e61b3dc8996592b983695e8ebb58a8b41" -dependencies = [ - "jobserver", - "libc", -] - -[[package]] -name = "cfg-if" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" - -[[package]] -name = "chumsky" -version = "0.9.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eebd66744a15ded14960ab4ccdbfb51ad3b81f51f3f04a80adac98c985396c9" -dependencies = [ - "hashbrown", -] - -[[package]] -name = "clap" -version = "2.34.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" -dependencies = [ - "ansi_term", - "atty", - "bitflags 1.3.2", - "strsim 0.8.0", - "textwrap", - "unicode-width", - "vec_map", -] - -[[package]] -name = "clap" -version = "4.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90bc066a67923782aa8515dbaea16946c5bcc5addbd668bb80af688e53e548a0" -dependencies = [ - "clap_builder", - "clap_derive", -] - -[[package]] -name = "clap_builder" -version = "4.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae129e2e766ae0ec03484e609954119f123cc1fe650337e155d03b022f24f7b4" -dependencies = [ - "anstream", - "anstyle", - "clap_lex", - "strsim 0.11.1", -] - -[[package]] -name = "clap_derive" -version = "4.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "528131438037fd55894f62d6e9f068b8f45ac57ffa77517819645d10aed04f64" -dependencies = [ - "heck 0.5.0", - "proc-macro2", - "quote", - "syn 2.0.58", -] - -[[package]] -name = "clap_lex" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98cc8fbded0c607b7ba9dd60cd98df59af97e84d24e49c8557331cfc26d301ce" - -[[package]] -name = "cmake" -version = "0.1.50" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a31c789563b815f77f4250caee12365734369f942439b7defd71e18a48197130" -dependencies = [ - "cc", -] - -[[package]] -name = "colorchoice" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" - -[[package]] -name = "colored" -version = "2.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbf2150cce219b664a8a70df7a1f933836724b503f8a413af9365b4dcc4d90b8" -dependencies = [ - "lazy_static", - "windows-sys 0.48.0", -] - -[[package]] -name = "console" -version = "0.15.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" -dependencies = [ - "encode_unicode", - "lazy_static", - "libc", - "unicode-width", - "windows-sys 0.52.0", -] - -[[package]] -name = "const-oid" -version = "0.9.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" - -[[package]] -name = "core-foundation" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "core-foundation-sys" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" - -[[package]] -name = "cpufeatures" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" -dependencies = [ - "libc", -] - -[[package]] -name = "crc32c" -version = "0.6.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2" -dependencies = [ - "rustc_version 0.4.0", -] - -[[package]] -name = "crc32fast" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crossbeam-deque" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "613f8cc01fe9cf1a3eb3d7f488fd2fa8388403e97039e2f73692932e291a770d" -dependencies = [ - "crossbeam-epoch", - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" -dependencies = [ - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" - -[[package]] -name = "crypto-bigint" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef" -dependencies = [ - "generic-array", - "rand_core", - "subtle", - "zeroize", -] - -[[package]] -name = "crypto-bigint" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" -dependencies = [ - "rand_core", - "subtle", -] - -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "typenum", -] - -[[package]] -name = "darling" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850" -dependencies = [ - "darling_core", - "darling_macro", -] - -[[package]] -name = "darling_core" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim 0.10.0", - "syn 1.0.109", -] - -[[package]] -name = "darling_macro" -version = "0.14.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e" -dependencies = [ - "darling_core", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "der" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" -dependencies = [ - "const-oid", - "zeroize", -] - -[[package]] -name = "deranged" -version = "0.3.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" -dependencies = [ - "powerfmt", -] - -[[package]] -name = "derive_builder" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d67778784b508018359cbc8696edb3db78160bab2c2a28ba7f56ef6932997f8" -dependencies = [ - "derive_builder_macro", -] - -[[package]] -name = "derive_builder_core" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f" -dependencies = [ - "darling", - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "derive_builder_macro" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e" -dependencies = [ - "derive_builder_core", - "syn 1.0.109", -] 
- -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] - -[[package]] -name = "dirs" -version = "5.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44c45a9d03d6676652bcb5e724c7e988de1acad23a711b5217ab9cbecbec2225" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" -dependencies = [ - "libc", - "option-ext", - "redox_users", - "windows-sys 0.48.0", -] - -[[package]] -name = "dolma" -version = "1.1.1" -dependencies = [ - "adblock", - "ahash", - "anyhow", - "atomic-traits", - "aws-config", - "aws-sdk-s3", - "byteorder", - "clap 4.5.4", - "console", - "env_logger", - "flate2", - "glob", - "humantime", - "indicatif", - "jaq-core", - "jaq-interpret", - "jaq-parse", - "jaq-std", - "jsonpath-rust", - "log", - "num-traits", - "num_cpus", - "openssl", - "parse-size", - "pyo3", - "rand", - "rayon", - "regex", - "serde", - "serde_json", - "simple_logger", - "structopt", - "tempfile", - "thousands", - "threadpool", - "time", - "tokenizers", - "tokio", - "tokio-util", - "unicode-segmentation", - "zstd", -] - -[[package]] -name = "dyn-clone" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125" - -[[package]] -name = "ecdsa" -version = "0.14.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" -dependencies = [ - "der", - "elliptic-curve", - "rfc6979", - "signature", -] - -[[package]] -name = "either" -version = "1.10.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" - -[[package]] -name = "elliptic-curve" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" -dependencies = [ - "base16ct", - "crypto-bigint 0.4.9", - "der", - "digest", - "ff", - "generic-array", - "group", - "pkcs8", - "rand_core", - "sec1", - "subtle", - "zeroize", -] - -[[package]] -name = "encode_unicode" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" - -[[package]] -name = "env_logger" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd405aab171cb85d6735e5c8d9db038c17d3ca007a4d2c25f337935c3d90580" -dependencies = [ - "humantime", - "is-terminal", - "log", - "regex", - "termcolor", -] - -[[package]] -name = "equivalent" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" - -[[package]] -name = "errno" -version = "0.3.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "esaxx-rs" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d817e038c30374a4bcb22f94d0a8a0e216958d4c3dcde369b1439fec4bdda6e6" -dependencies = [ - "cc", -] - -[[package]] -name = "fastrand" -version = "2.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "658bd65b1cf4c852a3cc96f18a8ce7b5640f6b703f905c7d74532294c2a63984" - -[[package]] -name = "ff" -version = "0.12.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160" -dependencies = [ - "rand_core", - "subtle", -] - -[[package]] -name = "flate2" -version = "1.0.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46303f565772937ffe1d394a4fac6f411c6013172fadde9dcdb1e147a086940e" -dependencies = [ - "crc32fast", - "libz-ng-sys", - "miniz_oxide", -] - -[[package]] -name = "fnv" -version = "1.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" - -[[package]] -name = "foreign-types" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" -dependencies = [ - "foreign-types-shared", -] - -[[package]] -name = "foreign-types-shared" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" - -[[package]] -name = "form_urlencoded" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" -dependencies = [ - "percent-encoding", -] - -[[package]] -name = "futures-channel" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" -dependencies = [ - "futures-core", -] - -[[package]] -name = "futures-core" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" - -[[package]] -name = "futures-sink" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" - -[[package]] -name = "futures-task" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" - -[[package]] -name = "futures-util" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" -dependencies = [ - "futures-core", - "futures-task", - "pin-project-lite", - "pin-utils", -] - -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94b22e06ecb0110981051723910cbf0b5f5e09a2062dd7663334ee79a9d1286c" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - -[[package]] -name = "gimli" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" - -[[package]] -name = "glob" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" - -[[package]] -name = "group" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" -dependencies = [ - "ff", - "rand_core", - "subtle", -] - -[[package]] -name = "h2" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - 
"futures-util", - "http 0.2.12", - "indexmap", - "slab", - "tokio", - "tokio-util", - "tracing", -] - -[[package]] -name = "hashbrown" -version = "0.14.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604" -dependencies = [ - "ahash", - "allocator-api2", -] - -[[package]] -name = "heck" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" -dependencies = [ - "unicode-segmentation", -] - -[[package]] -name = "heck" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" - -[[package]] -name = "hermit-abi" -version = "0.1.19" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" -dependencies = [ - "libc", -] - -[[package]] -name = "hermit-abi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" - -[[package]] -name = "hex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" - -[[package]] -name = "hf-hub" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b780635574b3d92f036890d8373433d6f9fc7abb320ee42a5c25897fc8ed732" -dependencies = [ - "dirs", - "indicatif", - "log", - "native-tls", - "rand", - "serde", - "serde_json", - "thiserror", - "ureq", -] - -[[package]] -name = "hifijson" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18ae468bcb4dfecf0e4949ee28abbc99076b6a0077f51ddbc94dbfff8e6a870c" - -[[package]] -name = "hmac" -version = "0.12.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" -dependencies = [ - "digest", -] - -[[package]] -name = "http" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - -[[package]] -name = "http" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21b9ddb458710bc376481b842f5da65cdf31522de232c1ca8146abce2a358258" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - -[[package]] -name = "http-body" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" -dependencies = [ - "bytes", - "http 0.2.12", - "pin-project-lite", -] - -[[package]] -name = "http-body" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cac85db508abc24a2e48553ba12a996e87244a0395ce011e62b37158745d643" -dependencies = [ - "bytes", - "http 1.1.0", -] - -[[package]] -name = "http-body-util" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0475f8b2ac86659c21b64320d5d653f9efe42acd2a4e560073ec61a155a34f1d" -dependencies = [ - "bytes", - "futures-core", - "http 1.1.0", - "http-body 1.0.0", - "pin-project-lite", -] - -[[package]] -name = "httparse" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d897f394bad6a705d5f4104762e116a75639e470d80901eed05a860a95cb1904" - -[[package]] -name = "httpdate" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" - -[[package]] -name = "humantime" -version = "2.1.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" - -[[package]] -name = "hyper" -version = "0.14.28" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf96e135eb83a2a8ddf766e426a841d8ddd7449d5f00d34ea02b41d2f19eef80" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2", - "http 0.2.12", - "http-body 0.4.6", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "socket2", - "tokio", - "tower-service", - "tracing", - "want", -] - -[[package]] -name = "hyper-rustls" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" -dependencies = [ - "futures-util", - "http 0.2.12", - "hyper", - "log", - "rustls 0.21.12", - "rustls-native-certs", - "tokio", - "tokio-rustls", -] - -[[package]] -name = "ident_case" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" - -[[package]] -name = "idna" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418a0a6fab821475f634efe3ccc45c013f742efe03d853e8d3355d5cb850ecf8" -dependencies = [ - "matches", - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "idna" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" -dependencies = [ - "unicode-bidi", - "unicode-normalization", -] - -[[package]] -name = "indexmap" -version = "2.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" -dependencies = [ - "equivalent", - "hashbrown", -] - -[[package]] -name = "indicatif" -version = "0.17.8" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "763a5a8f45087d6bcea4222e7b72c291a054edf80e4ef6efd2a4979878c7bea3" -dependencies = [ - "console", - "instant", - "number_prefix", - "portable-atomic", - "unicode-width", -] - -[[package]] -name = "indoc" -version = "1.0.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa799dd5ed20a7e349f3b4639aa80d74549c81716d9ec4f994c9b5815598306" - -[[package]] -name = "instant" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "is-terminal" -version = "0.4.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f23ff5ef2b80d608d61efee834934d862cd92461afc0560dedf493e4c033738b" -dependencies = [ - "hermit-abi 0.3.9", - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "itertools" -version = "0.10.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" -dependencies = [ - "either", -] - -[[package]] -name = "itertools" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" -dependencies = [ - "either", -] - -[[package]] -name = "itertools" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" -dependencies = [ - "either", -] - -[[package]] -name = "itoa" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" - -[[package]] -name = "jaq-core" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"03d6a5713b8f33675abfac79d1db0022a3f28764b2a6b96a185c199ad8dab86d" -dependencies = [ - "aho-corasick", - "base64 0.21.7", - "hifijson", - "jaq-interpret", - "libm", - "log", - "regex", - "time", - "urlencoding", -] - -[[package]] -name = "jaq-interpret" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f569e38e5fc677db8dfda89ee0b4c25b3f53e811b16434fd14bdc5b43fc362ac" -dependencies = [ - "ahash", - "dyn-clone", - "hifijson", - "indexmap", - "jaq-syn", - "once_cell", - "serde_json", -] - -[[package]] -name = "jaq-parse" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef6f8beb9f9922546419e774e24199e8a968f54c63a5a2323c8f3ef3321ace14" -dependencies = [ - "chumsky", - "jaq-syn", -] - -[[package]] -name = "jaq-std" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d7871c59297cbfdd18f6f1bbbafaad24e97fd555ee1e2a1be7a40a5a20f551a" -dependencies = [ - "bincode", - "jaq-parse", - "jaq-syn", -] - -[[package]] -name = "jaq-syn" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4d60101fb791b20c982731d848ed6e7d25363656497647c2093b68bd88398d6" -dependencies = [ - "serde", -] - -[[package]] -name = "jobserver" -version = "0.1.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2b099aaa34a9751c5bf0878add70444e1ed2dd73f347be99003d4577277de6e" -dependencies = [ - "libc", -] - -[[package]] -name = "jsonpath-rust" -version = "0.3.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06cc127b7c3d270be504572364f9569761a180b981919dd0d87693a7f5fb7829" -dependencies = [ - "pest", - "pest_derive", - "regex", - "serde_json", - "thiserror", -] - -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - 
-[[package]] -name = "libc" -version = "0.2.153" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" - -[[package]] -name = "libm" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" - -[[package]] -name = "libredox" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" -dependencies = [ - "bitflags 2.5.0", - "libc", -] - -[[package]] -name = "libz-ng-sys" -version = "1.1.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6409efc61b12687963e602df8ecf70e8ddacf95bc6576bcf16e3ac6328083c5" -dependencies = [ - "cmake", - "libc", -] - -[[package]] -name = "lifeguard" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89be94dbd775db37b46ca4f4bf5cf89adfb13ba197bfbcb69b2122848ee73c26" - -[[package]] -name = "linux-raw-sys" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" - -[[package]] -name = "lock_api" -version = "0.4.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" -dependencies = [ - "autocfg", - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" - -[[package]] -name = "lru" -version = "0.12.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3262e75e648fce39813cb56ac41f3c3e3f65217ebf3844d818d1f9398cfb0dc" -dependencies = [ - "hashbrown", -] - -[[package]] -name = 
"macro_rules_attribute" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a82271f7bc033d84bbca59a3ce3e4159938cb08a9c3aebbe54d215131518a13" -dependencies = [ - "macro_rules_attribute-proc_macro", - "paste", -] - -[[package]] -name = "macro_rules_attribute-proc_macro" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568" - -[[package]] -name = "matches" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" - -[[package]] -name = "md-5" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" -dependencies = [ - "cfg-if", - "digest", -] - -[[package]] -name = "memchr" -version = "2.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" - -[[package]] -name = "memoffset" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] - -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - -[[package]] -name = "miniz_oxide" -version = "0.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" -dependencies = [ - "adler", -] - -[[package]] -name = "mio" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" -dependencies = [ - 
"libc", - "wasi", - "windows-sys 0.48.0", -] - -[[package]] -name = "monostate" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "878c2a1f1c70e5724fa28f101ca787b6a7e8ad5c5e4ae4ca3b0fa4a419fa9075" -dependencies = [ - "monostate-impl", - "serde", -] - -[[package]] -name = "monostate-impl" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f686d68a09079e63b1d2c64aa305095887ce50565f00a922ebfaeeee0d9ba6ce" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.58", -] - -[[package]] -name = "native-tls" -version = "0.2.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" -dependencies = [ - "lazy_static", - "libc", - "log", - "openssl", - "openssl-probe", - "openssl-sys", - "schannel", - "security-framework", - "security-framework-sys", - "tempfile", -] - -[[package]] -name = "nom" -version = "7.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" -dependencies = [ - "memchr", - "minimal-lexical", -] - -[[package]] -name = "num-conv" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" - -[[package]] -name = "num-integer" -version = "0.1.46" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" -dependencies = [ - "num-traits", -] - -[[package]] -name = "num-traits" -version = "0.2.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" -dependencies = [ - "autocfg", -] - -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi 0.3.9", - "libc", -] - -[[package]] -name = "number_prefix" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" - -[[package]] -name = "object" -version = "0.32.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" -dependencies = [ - "memchr", -] - -[[package]] -name = "once_cell" -version = "1.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" - -[[package]] -name = "onig" -version = "6.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f" -dependencies = [ - "bitflags 1.3.2", - "libc", - "once_cell", - "onig_sys", -] - -[[package]] -name = "onig_sys" -version = "69.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b829e3d7e9cc74c7e315ee8edb185bf4190da5acde74afd7fc59c35b1f086e7" -dependencies = [ - "cc", - "pkg-config", -] - -[[package]] -name = "openssl" -version = "0.10.66" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9529f4786b70a3e8c61e11179af17ab6188ad8d0ded78c5529441ed39d4bd9c1" -dependencies = [ - "bitflags 2.5.0", - "cfg-if", - "foreign-types", - "libc", - "once_cell", - "openssl-macros", - "openssl-sys", -] - -[[package]] -name = "openssl-macros" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.58", -] - -[[package]] -name = "openssl-probe" -version = "0.1.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" - -[[package]] -name = "openssl-src" -version = "300.2.3+3.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cff92b6f71555b61bb9315f7c64da3ca43d87531622120fea0195fc761b4843" -dependencies = [ - "cc", -] - -[[package]] -name = "openssl-sys" -version = "0.9.103" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9e8deee91df40a943c71b917e5874b951d32a802526c85721ce3b776c929d6" -dependencies = [ - "cc", - "libc", - "openssl-src", - "pkg-config", - "vcpkg", -] - -[[package]] -name = "option-ext" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" - -[[package]] -name = "outref" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" - -[[package]] -name = "p256" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" -dependencies = [ - "ecdsa", - "elliptic-curve", - "sha2", -] - -[[package]] -name = "parking_lot" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" -dependencies = [ - "lock_api", - "parking_lot_core", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall", - "smallvec", - "windows-targets 0.48.5", -] - -[[package]] -name = "parse-size" -version = "1.0.0" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "944553dd59c802559559161f9816429058b869003836120e262e8caec061b7ae" - -[[package]] -name = "paste" -version = "1.0.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3145af08024dea9fa9914f381a17b8fc6034dfb00f3a84013f7ff43f29ed4c" - -[[package]] -name = "percent-encoding" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" - -[[package]] -name = "pest" -version = "2.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "311fb059dee1a7b802f036316d790138c613a4e8b180c822e3925a662e9f0c95" -dependencies = [ - "memchr", - "thiserror", - "ucd-trie", -] - -[[package]] -name = "pest_derive" -version = "2.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f73541b156d32197eecda1a4014d7f868fd2bcb3c550d5386087cfba442bf69c" -dependencies = [ - "pest", - "pest_generator", -] - -[[package]] -name = "pest_generator" -version = "2.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c35eeed0a3fab112f75165fdc026b3913f4183133f19b49be773ac9ea966e8bd" -dependencies = [ - "pest", - "pest_meta", - "proc-macro2", - "quote", - "syn 2.0.58", -] - -[[package]] -name = "pest_meta" -version = "2.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2adbf29bb9776f28caece835398781ab24435585fe0d4dc1374a61db5accedca" -dependencies = [ - "once_cell", - "pest", - "sha2", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" 
- -[[package]] -name = "pkcs8" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" -dependencies = [ - "der", - "spki", -] - -[[package]] -name = "pkg-config" -version = "0.3.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" - -[[package]] -name = "portable-atomic" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" - -[[package]] -name = "powerfmt" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" - -[[package]] -name = "ppv-lite86" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" - -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.109", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - -[[package]] -name = "proc-macro2" -version = "1.0.79" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e835ff2298f5721608eb1a980ecaee1aef2c132bf95ecc026a11b7bf3c01c02e" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "psl" -version = "2.1.30" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "b3bb1091ea5d9ac71671164e4a9a0317144816c6c4f76ebaa0a6f43f32586463" -dependencies = [ - "psl-types", -] - -[[package]] -name = "psl-types" -version = "2.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33cb294fe86a74cbcf50d4445b37da762029549ebeea341421c7c70370f86cac" - -[[package]] -name = "pyo3" -version = "0.19.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e681a6cfdc4adcc93b4d3cf993749a4552018ee0a9b65fc0ccfad74352c72a38" -dependencies = [ - "cfg-if", - "indoc", - "libc", - "memoffset", - "parking_lot", - "pyo3-build-config", - "pyo3-ffi", - "pyo3-macros", - "unindent", -] - -[[package]] -name = "pyo3-build-config" -version = "0.19.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "076c73d0bc438f7a4ef6fdd0c3bb4732149136abd952b110ac93e4edb13a6ba5" -dependencies = [ - "once_cell", - "target-lexicon", -] - -[[package]] -name = "pyo3-ffi" -version = "0.19.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e53cee42e77ebe256066ba8aa77eff722b3bb91f3419177cf4cd0f304d3284d9" -dependencies = [ - "libc", - "pyo3-build-config", -] - -[[package]] -name = "pyo3-macros" -version = "0.19.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfeb4c99597e136528c6dd7d5e3de5434d1ceaf487436a3f03b2d56b6fc9efd1" -dependencies = [ - "proc-macro2", - "pyo3-macros-backend", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "pyo3-macros-backend" -version = "0.19.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "947dc12175c254889edc0c02e399476c2f652b4b9ebd123aa655c224de259536" -dependencies = [ - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "quote" -version = "1.0.35" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" -dependencies = [ - "proc-macro2", -] - -[[package]] 
-name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - -[[package]] -name = "rayon" -version = "1.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" -dependencies = [ - "either", - "rayon-core", -] - -[[package]] -name = "rayon-cond" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9" -dependencies = [ - "either", - "itertools 0.11.0", - "rayon", -] - -[[package]] -name = "rayon-core" -version = "1.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" -dependencies = [ - "crossbeam-deque", - "crossbeam-utils", -] - -[[package]] -name = "redox_syscall" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" -dependencies = [ - "bitflags 1.3.2", -] - -[[package]] -name = "redox_users" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" -dependencies = [ - "getrandom", - "libredox", - 
"thiserror", -] - -[[package]] -name = "regex" -version = "1.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" -dependencies = [ - "aho-corasick", - "memchr", - "regex-automata", - "regex-syntax", -] - -[[package]] -name = "regex-automata" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" -dependencies = [ - "aho-corasick", - "memchr", - "regex-syntax", -] - -[[package]] -name = "regex-lite" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30b661b2f27137bdbc16f00eda72866a92bb28af1753ffbd56744fb6e2e9cd8e" - -[[package]] -name = "regex-syntax" -version = "0.8.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adad44e29e4c806119491a7f06f03de4d1af22c3a680dd47f1e6e179439d1f56" - -[[package]] -name = "rfc6979" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" -dependencies = [ - "crypto-bigint 0.4.9", - "hmac", - "zeroize", -] - -[[package]] -name = "ring" -version = "0.17.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" -dependencies = [ - "cc", - "cfg-if", - "getrandom", - "libc", - "spin", - "untrusted", - "windows-sys 0.52.0", -] - -[[package]] -name = "rmp" -version = "0.8.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9860a6cc38ed1da53456442089b4dfa35e7cedaa326df63017af88385e6b20" -dependencies = [ - "byteorder", - "num-traits", - "paste", -] - -[[package]] -name = "rmp-serde" -version = "0.15.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"723ecff9ad04f4ad92fe1c8ca6c20d2196d9286e9c60727c4cb5511629260e9d" -dependencies = [ - "byteorder", - "rmp", - "serde", -] - -[[package]] -name = "rustc-demangle" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" - -[[package]] -name = "rustc_version" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dfe2087c51c460008730de8b57e6a320782fbfb312e1f4d520e6c6fae155ee" -dependencies = [ - "semver 0.11.0", -] - -[[package]] -name = "rustc_version" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" -dependencies = [ - "semver 1.0.22", -] - -[[package]] -name = "rustix" -version = "0.38.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e04861e65f21776e67888bfbea442b3642beaa0138fdb1dd7a84a52dffdb89" -dependencies = [ - "bitflags 2.5.0", - "errno", - "libc", - "linux-raw-sys", - "windows-sys 0.52.0", -] - -[[package]] -name = "rustls" -version = "0.21.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" -dependencies = [ - "log", - "ring", - "rustls-webpki 0.101.7", - "sct", -] - -[[package]] -name = "rustls" -version = "0.22.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99008d7ad0bbbea527ec27bddbc0e432c5b87d8175178cee68d2eec9c4a1813c" -dependencies = [ - "log", - "ring", - "rustls-pki-types", - "rustls-webpki 0.102.2", - "subtle", - "zeroize", -] - -[[package]] -name = "rustls-native-certs" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" -dependencies = [ - "openssl-probe", - "rustls-pemfile", - "schannel", - 
"security-framework", -] - -[[package]] -name = "rustls-pemfile" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" -dependencies = [ - "base64 0.21.7", -] - -[[package]] -name = "rustls-pki-types" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd36cc4259e3e4514335c4a138c6b43171a8d61d8f5c9348f9fc7529416f247" - -[[package]] -name = "rustls-webpki" -version = "0.101.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "rustls-webpki" -version = "0.102.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" -dependencies = [ - "ring", - "rustls-pki-types", - "untrusted", -] - -[[package]] -name = "ryu" -version = "1.0.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e86697c916019a8588c99b5fac3cead74ec0b4b819707a682fd4d23fa0ce1ba1" - -[[package]] -name = "schannel" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbc91545643bcf3a0bbb6569265615222618bdf33ce4ffbbd13c4bbd4c093534" -dependencies = [ - "windows-sys 0.52.0", -] - -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - -[[package]] -name = "sct" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "seahash" -version = "3.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "58f57ca1d128a43733fd71d583e837b1f22239a37ebea09cde11d8d9a9080f47" - -[[package]] -name = "sec1" -version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" -dependencies = [ - "base16ct", - "der", - "generic-array", - "pkcs8", - "subtle", - "zeroize", -] - -[[package]] -name = "security-framework" -version = "2.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "770452e37cad93e0a50d5abc3990d2bc351c36d0328f86cefec2f2fb206eaef6" -dependencies = [ - "bitflags 1.3.2", - "core-foundation", - "core-foundation-sys", - "libc", - "security-framework-sys", -] - -[[package]] -name = "security-framework-sys" -version = "2.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41f3cc463c0ef97e11c3461a9d3787412d30e8e7eb907c79180c4a57bf7c04ef" -dependencies = [ - "core-foundation-sys", - "libc", -] - -[[package]] -name = "semver" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f301af10236f6df4160f7c3f04eec6dbc70ace82d23326abad5edee88801c6b6" -dependencies = [ - "semver-parser", -] - -[[package]] -name = "semver" -version = "1.0.22" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" - -[[package]] -name = "semver-parser" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00b0bef5b7f9e0df16536d3961cfb6e84331c065b4066afb39768d0e319411f7" -dependencies = [ - "pest", -] - -[[package]] -name = "serde" -version = "1.0.197" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fb1c873e1b9b056a4dc4c0c198b24c3ffa059243875552b2bd0933b1aee4ce2" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.197" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "7eb0b34b42edc17f6b7cac84a52a1c5f0e1bb2227e997ca9011ea3dd34e8610b" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.58", -] - -[[package]] -name = "serde_json" -version = "1.0.115" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12dc5c46daa8e9fdf4f5e71b6cf9a53f2487da0e86e55808e2d35539666497dd" -dependencies = [ - "itoa", - "ryu", - "serde", -] - -[[package]] -name = "sha1" -version = "0.10.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "sha2" -version = "0.10.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "signal-hook-registry" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" -dependencies = [ - "libc", -] - -[[package]] -name = "signature" -version = "1.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" -dependencies = [ - "digest", - "rand_core", -] - -[[package]] -name = "simple_logger" -version = "3.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc20708d703a44b96b3b700578a85b6fe887fc63ab20315757026bb8a12faaad" -dependencies = [ - "atty", - "colored", - "log", - "winapi", -] - -[[package]] -name = "slab" -version = "0.4.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] - -[[package]] -name = "smallvec" -version = 
"1.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" - -[[package]] -name = "socket2" -version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "spin" -version = "0.9.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" - -[[package]] -name = "spki" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" -dependencies = [ - "base64ct", - "der", -] - -[[package]] -name = "spm_precompiled" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5851699c4033c63636f7ea4cf7b7c1f1bf06d0cc03cfb42e711de5a5c46cf326" -dependencies = [ - "base64 0.13.1", - "nom", - "serde", - "unicode-segmentation", -] - -[[package]] -name = "strsim" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" - -[[package]] -name = "strsim" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - -[[package]] -name = "strsim" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" - -[[package]] -name = "structopt" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" -dependencies = [ - "clap 2.34.0", - "lazy_static", - "structopt-derive", -] - 
-[[package]] -name = "structopt-derive" -version = "0.4.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" -dependencies = [ - "heck 0.3.3", - "proc-macro-error", - "proc-macro2", - "quote", - "syn 1.0.109", -] - -[[package]] -name = "subtle" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.58" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44cfb93f38070beee36b3fef7d4f5a16f27751d94b187b666a5cc5e9b0d30687" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "target-lexicon" -version = "0.12.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" - -[[package]] -name = "tempfile" -version = "3.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85b77fafb263dd9d05cbeac119526425676db3784113aa9295c88498cbf8bff1" -dependencies = [ - "cfg-if", - "fastrand", - "rustix", - "windows-sys 0.52.0", -] - -[[package]] -name = "termcolor" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "textwrap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" -dependencies = [ - "unicode-width", -] - 
-[[package]] -name = "thiserror" -version = "1.0.58" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.58" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.58", -] - -[[package]] -name = "thousands" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" - -[[package]] -name = "threadpool" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" -dependencies = [ - "num_cpus", -] - -[[package]] -name = "time" -version = "0.3.36" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" -dependencies = [ - "deranged", - "itoa", - "num-conv", - "powerfmt", - "serde", - "time-core", - "time-macros", -] - -[[package]] -name = "time-core" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" - -[[package]] -name = "time-macros" -version = "0.2.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" -dependencies = [ - "num-conv", - "time-core", -] - -[[package]] -name = "tinyvec" -version = "1.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" -dependencies = [ - "tinyvec_macros", -] - -[[package]] -name = "tinyvec_macros" -version = 
"0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" - -[[package]] -name = "tokenizers" -version = "0.15.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3dd47962b0ba36e7fd33518fbf1754d136fd1474000162bbf2a8b5fcb2d3654d" -dependencies = [ - "aho-corasick", - "clap 4.5.4", - "derive_builder", - "esaxx-rs", - "getrandom", - "hf-hub", - "indicatif", - "itertools 0.12.1", - "lazy_static", - "log", - "macro_rules_attribute", - "monostate", - "onig", - "paste", - "rand", - "rayon", - "rayon-cond", - "regex", - "regex-syntax", - "serde", - "serde_json", - "spm_precompiled", - "thiserror", - "unicode-normalization-alignments", - "unicode-segmentation", - "unicode_categories", -] - -[[package]] -name = "tokio" -version = "1.37.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1adbebffeca75fcfd058afa480fb6c0b81e165a0323f9c9d39c9697e37c46787" -dependencies = [ - "backtrace", - "bytes", - "libc", - "mio", - "num_cpus", - "parking_lot", - "pin-project-lite", - "signal-hook-registry", - "socket2", - "tokio-macros", - "windows-sys 0.48.0", -] - -[[package]] -name = "tokio-macros" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.58", -] - -[[package]] -name = "tokio-rustls" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" -dependencies = [ - "rustls 0.21.12", - "tokio", -] - -[[package]] -name = "tokio-util" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - 
"pin-project-lite", - "tokio", - "tracing", -] - -[[package]] -name = "tower-service" -version = "0.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" - -[[package]] -name = "tracing" -version = "0.1.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" -dependencies = [ - "pin-project-lite", - "tracing-attributes", - "tracing-core", -] - -[[package]] -name = "tracing-attributes" -version = "0.1.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.58", -] - -[[package]] -name = "tracing-core" -version = "0.1.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" -dependencies = [ - "once_cell", -] - -[[package]] -name = "try-lock" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" - -[[package]] -name = "typenum" -version = "1.17.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" - -[[package]] -name = "ucd-trie" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9" - -[[package]] -name = "unicode-bidi" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" - -[[package]] -name = "unicode-ident" -version = "1.0.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" - -[[package]] -name = "unicode-normalization" -version = "0.1.23" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" -dependencies = [ - "tinyvec", -] - -[[package]] -name = "unicode-normalization-alignments" -version = "0.1.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43f613e4fa046e69818dd287fdc4bc78175ff20331479dab6e1b0f98d57062de" -dependencies = [ - "smallvec", -] - -[[package]] -name = "unicode-segmentation" -version = "1.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" - -[[package]] -name = "unicode-width" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e51733f11c9c4f72aa0c160008246859e340b00807569a0da0e7a1079b27ba85" - -[[package]] -name = "unicode_categories" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e" - -[[package]] -name = "unindent" -version = "0.1.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1766d682d402817b5ac4490b3c3002d91dfa0d22812f341609f97b08757359c" - -[[package]] -name = "untrusted" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" - -[[package]] -name = "ureq" -version = "2.9.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11f214ce18d8b2cbe84ed3aa6486ed3f5b285cf8d8fbdbce9f3f767a724adc35" -dependencies = [ - "base64 0.21.7", - "flate2", - "log", - "native-tls", - "once_cell", - "rustls 0.22.3", - "rustls-pki-types", - "rustls-webpki 0.102.2", - "serde", - "serde_json", - "url", - "webpki-roots", -] 
- -[[package]] -name = "url" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31e6302e3bb753d46e83516cae55ae196fc0c309407cf11ab35cc51a4c2a4633" -dependencies = [ - "form_urlencoded", - "idna 0.5.0", - "percent-encoding", -] - -[[package]] -name = "urlencoding" -version = "2.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" - -[[package]] -name = "utf8parse" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a" - -[[package]] -name = "uuid" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a183cf7feeba97b4dd1c0d46788634f6221d87fa961b305bed08c851829efcc0" - -[[package]] -name = "vcpkg" -version = "0.2.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" - -[[package]] -name = "vec_map" -version = "0.8.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" - -[[package]] -name = "version_check" -version = "0.9.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" - -[[package]] -name = "vsimd" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" - -[[package]] -name = "want" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" -dependencies = [ - "try-lock", -] - -[[package]] -name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" - -[[package]] -name = "webpki-roots" -version = "0.26.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3de34ae270483955a94f4b21bdaaeb83d508bb84a01435f393818edb0012009" -dependencies = [ - "rustls-pki-types", -] - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" -dependencies = [ - "winapi", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets 0.48.5", -] - -[[package]] -name = "windows-sys" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" -dependencies = [ - "windows-targets 0.52.4", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", -] - -[[package]] -name = "windows-targets" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" -dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" - -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - -[[package]] -name = "windows_i686_gnu" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" - -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - -[[package]] -name = "windows_i686_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.52.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" - -[[package]] -name = "xmlparser" -version = "0.13.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" - -[[package]] -name = "zerocopy" -version = 
"0.7.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.7.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.58", -] - -[[package]] -name = "zeroize" -version = "1.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "525b4ec142c6b68a2d10f01f7bbf6755599ca3f81ea53b8431b7dd348f5fdb2d" - -[[package]] -name = "zstd" -version = "0.13.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2d789b1514203a1120ad2429eae43a7bd32b90976a7bb8a05f7ec02fa88cc23a" -dependencies = [ - "zstd-safe", -] - -[[package]] -name = "zstd-safe" -version = "7.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cd99b45c6bc03a018c8b8a86025678c87e55526064e38f9df301989dce7ec0a" -dependencies = [ - "zstd-sys", -] - -[[package]] -name = "zstd-sys" -version = "2.0.10+zstd.1.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c253a4914af5bafc8fa8c86ee400827e83cf6ec01195ec1f1ed8441bf00d65aa" -dependencies = [ - "cc", - "pkg-config", -] diff --git a/pyproject.toml b/pyproject.toml index 7e631900..20778182 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,14 +26,15 @@ dependencies = [ "requests", "rich", "s3fs==2023.6.0", - "smart-open", - "tokenizers>=0.15.0,<=0.19.1", + "smart-open>=7.0.4,<=8.0.0", + "tokenizers==0.21.1", "tqdm", "uniseg", "numpy<2", "necessary>=0.4.3", "charset-normalizer>=3.2.0", "zstandard>=0.23.0", + "backoff>=2.2.0,<=3.0.0", ] classifiers = [ "Development Status :: 5 - Production/Stable", @@ -117,13 +118,14 @@ dev = [ ] # extension to process code code = ["detect-secrets==1.4.0", "beautifulsoup4>=4", "pygments", 
"regex"] + # extension to detect PIIs using presidio pii = ["presidio_analyzer==2.2.32", "regex"] # language detection; by default, we use fastttext, everything else is optional lang = [ "fasttext-wheel==0.9.2", - "LTpycld2==0.42", # fork of pycld2 that works on Apple Silicon + "pycld2==0.42", "lingua-language-detector>=2.0.0", "langdetect>=1.0.9", ] diff --git a/python/dolma/core/__init__.py b/python/dolma/core/__init__.py index 9219cd50..d27ea07d 100644 --- a/python/dolma/core/__init__.py +++ b/python/dolma/core/__init__.py @@ -2,6 +2,9 @@ from .registry import TaggerRegistry from .taggers import BaseTagger +# importing utils to make sure that decompressors for smart_open are registered +from .utils import add_compression # noqa: F401 + __all__ = [ "BaseTagger", "DocResult", diff --git a/python/dolma/core/analyzer.py b/python/dolma/core/analyzer.py index c8d542c0..8368ee73 100644 --- a/python/dolma/core/analyzer.py +++ b/python/dolma/core/analyzer.py @@ -283,7 +283,8 @@ def create_and_run_analyzer( report: Optional[str] = None, debug: bool = False, seed: int = 0, - num_bins: int = 1000, + compute_bins: int = 1_000, + visualize_bins: int = 10, num_processes: int = 1, name_regex: Optional[str] = None, show_total: bool = False, @@ -300,7 +301,8 @@ def create_and_run_analyzer( report (Optional[str], optional): Path to the report directory. Defaults to None. debug (bool, optional): Enable debug mode. Defaults to False. seed (int, optional): Seed value for randomization. Defaults to 0. - num_bins (int, optional): Number of bins for analysis. Defaults to 1000. + compute_bins (int, optional): Number of bins for analysis. Defaults to 1_000. + visualize_bins (int, optional): Number of bins for visualization. Defaults to 10. num_processes (int, optional): Number of processes to use for analysis. Defaults to 1. name_regex (Optional[str], optional): Regular expression for filtering attribute names. Defaults to None. show_total (bool, optional): Show total summary. 
Defaults to False. @@ -325,11 +327,11 @@ def create_and_run_analyzer( debug=debug, seed=seed, ignore_existing=True, - retries_on_error=0, + backoff_max_tries=1, num_processes=num_processes, ) - analyzer(num_bins=num_bins, name_regex=name_regex) + analyzer(num_bins=compute_bins, name_regex=name_regex) - summaries = aggregate_summaries(summaries_path=summaries_path, num_bins=num_bins) - visualize_summaries(summaries=summaries, show_total=show_total) + summaries = aggregate_summaries(summaries_path=summaries_path, num_bins=compute_bins) + visualize_summaries(summaries=summaries, show_total=show_total, num_viz_bins=visualize_bins) write_output(summaries=summaries, report=report) diff --git a/python/dolma/core/binning.py b/python/dolma/core/binning.py index 3afee92f..c33fd0da 100644 --- a/python/dolma/core/binning.py +++ b/python/dolma/core/binning.py @@ -205,7 +205,8 @@ def merge_bins( class BaseBucketApi: def __init__(self): - self._total = self._sum = 0 + self._total = 0 + self._sum = 0.0 @abstractproperty def full(self) -> bool: diff --git a/python/dolma/core/data_types.py b/python/dolma/core/data_types.py index d71bbab3..c1a0a7a7 100644 --- a/python/dolma/core/data_types.py +++ b/python/dolma/core/data_types.py @@ -6,10 +6,13 @@ """ -from typing import Any, Dict, List, Optional, Tuple +import functools +import re +from hashlib import sha1 +from typing import Any, Callable, Dict, List, Optional, Tuple, Type from msgspec import Struct -from typing_extensions import TypeAlias +from typing_extensions import Self, TypeAlias TaggerOutputValueType: TypeAlias = Tuple[int, int, float] TaggerOutputType: TypeAlias = List[TaggerOutputValueType] @@ -17,13 +20,18 @@ class InputSpec(Struct): - id: str text: str + id: str = "" source: str = "" - created: str = "" - added: str = "" + created: Optional[Any] = "" + added: Optional[Any] = "" version: Optional[str] = None + def __post_init__(self): + if not self.id: + (h := sha1()).update(self.text.encode()) + self.id = h.hexdigest() + 
class InputSpecWithMetadata(InputSpec): metadata: Optional[Dict[str, Any]] = None @@ -40,27 +48,38 @@ class OutputSpec(Struct): class Document: - __slots__ = "source", "version", "id", "text" + __slots__ = "source", "version", "id", "text", "added", "created" + spec_cls: Type[InputSpec] = InputSpec - def __init__(self, source: str, id: str, text: str, version: Optional[str] = None) -> None: + def __init__( + self, + source: str, + id: str, + text: str, + version: Optional[str] = None, + added: Optional[str] = None, + created: Optional[str] = None, + ) -> None: self.source = source self.version = version self.id = id self.text = text + self.added = added + self.created = created @classmethod - def from_spec(cls, spec: InputSpec) -> "Document": - return Document(source=spec.source, version=spec.version, id=spec.id, text=spec.text) + def from_spec(cls, spec: InputSpec) -> Self: + return cls(**{k: v for k in cls.__slots__ if (v := getattr(spec, k)) is not None}) def to_spec(self) -> InputSpec: - return InputSpec(source=self.source, version=self.version, id=self.id, text=self.text) + return self.spec_cls(**{k: v for k in self.__slots__ if (v := getattr(self, k)) is not None}) @classmethod - def from_json(cls, d: Dict[str, Any]) -> "Document": - return Document(source=d["source"], version=d["version"], id=d["id"], text=d["text"]) + def from_json(cls, d: Dict[str, Any]) -> Self: + return cls(**{k: v for k in cls.__slots__ if (v := d.get(k)) is not None}) def to_json(self) -> Dict[str, Any]: - return {"source": self.source, "version": self.version, "id": self.id, "text": self.text} + return {k: v for k in self.__slots__ if (v := getattr(self, k)) is not None} def __str__(self) -> str: attributes_string = ",".join([f"{k}:{repr(v)}" for k, v in self.to_json().items()]) @@ -68,110 +87,36 @@ def __str__(self) -> str: class DocumentWithMetadata(Document): - __slots__ = ("metadata",) + __slots__ = Document.__slots__ + ("metadata",) + spec_cls = InputSpecWithMetadata def 
__init__(self, *args, metadata: Optional[Dict[str, Any]] = None, **kwargs) -> None: super().__init__(*args, **kwargs) self.metadata = metadata or {} - @classmethod - def from_spec(cls, spec: InputSpecWithMetadata) -> "DocumentWithMetadata": - return DocumentWithMetadata( - source=spec.source, - version=spec.version, - id=spec.id, - text=spec.text, - metadata=spec.metadata, - ) - - def to_spec(self) -> InputSpecWithMetadata: - return InputSpecWithMetadata( - source=self.source, - version=self.version, - id=self.id, - text=self.text, - metadata=self.metadata, - ) - - @classmethod - def from_json(cls, d: Dict) -> "DocumentWithMetadata": - return DocumentWithMetadata( - source=d["source"], - version=d["version"], - id=d["id"], - text=d["text"], - metadata=d["metadata"], - ) - - def to_json(self) -> Dict: - return { - "source": self.source, - "version": self.version, - "id": self.id, - "text": self.text, - "metadata": self.metadata, - } - def __str__(self) -> str: repr_ = super().__str__() return repr_.rstrip(")") + f",metadata={'...' 
if self.metadata else 'none'})" class DocumentWithMetadataAndAttributes(DocumentWithMetadata): + __slots__ = DocumentWithMetadata.__slots__ + ("attributes",) + spec_cls = InputSpecWithMetadataAndAttributes + def __init__( self, *args, attributes: Optional[Dict[str, List[Tuple[int, int, float]]]] = None, **kwargs ) -> None: super().__init__(*args, **kwargs) self.attributes = attributes or {} - @classmethod - def from_spec(cls, spec: InputSpecWithMetadataAndAttributes) -> "DocumentWithMetadataAndAttributes": - return DocumentWithMetadataAndAttributes( - source=spec.source, - version=spec.version, - id=spec.id, - text=spec.text, - metadata=spec.metadata, - attributes=spec.attributes, - ) - - @classmethod - def from_json(cls, d: Dict) -> "DocumentWithMetadataAndAttributes": - return DocumentWithMetadataAndAttributes( - source=d["source"], - version=d["version"], - id=d["id"], - text=d["text"], - metadata=d["metadata"], - attributes=d["attributes"], - ) - - def to_json(self) -> Dict: - return { - "source": self.source, - "version": self.version, - "id": self.id, - "text": self.text, - "metadata": self.metadata, - "attributes": self.attributes, - } - - def to_spec(self) -> InputSpecWithMetadataAndAttributes: - return InputSpecWithMetadataAndAttributes( - source=self.source, - version=self.version, - id=self.id, - text=self.text, - metadata=self.metadata, - attributes=self.attributes, - ) - def __str__(self) -> str: return super().__str__().rstrip(")") + f",attributes={'...' 
if self.attributes else 'none'})" class Span: - __slots__ = "start", "end", "type", "score", "experiment", "tagger" + __slots__ = "start", "end", "type", "score", "experiment", "tagger", "location" + + __selectors_cache__: Dict[str, Callable[["Document"], str]] = {} def __init__( self, @@ -181,6 +126,7 @@ def __init__( score: float = 1.0, experiment: Optional[str] = None, tagger: Optional[str] = None, + location: str = "text", ): self.start = start self.end = end @@ -188,15 +134,58 @@ def __init__( self.score = float(score) self.experiment = experiment self.tagger = tagger + self.location = location + + def _make_selector(self) -> Callable[["Document"], str]: + if self.location not in self.__selectors_cache__: + + def _nested_selector( + doc: Any, + index: Optional[int] = None, + key: Optional[str] = None, + previous: Optional[Callable] = None, + dict_like: bool = True, + ) -> Any: + prev = previous(doc) if previous is not None else doc + if dict_like or index is not None: + assert (key or index) is not None, "Either key or index must be set" + return prev[key or index] + elif key is not None: + return getattr(prev, key) + else: + raise ValueError("Either key or index must be set") + + matches = list( + re.finditer(r"((^|\.)(?P[a-zA-Z][a-zA-Z0-9]*))|(\[(?P[0-9]+)\])", self.location) + ) + assert len(matches) > 0, f"Invalid location: `{self.location}`" + init_match, *rest_matches = matches + + fn = functools.partial( + _nested_selector, + index=int(init_match.group("index")) if init_match.group("index") is not None else None, + key=init_match.group("key"), + dict_like=False, + ) + for match in rest_matches[::-1]: + fn = functools.partial( + _nested_selector, + index=int(match.group("index")) if match.group("index") is not None else None, + key=match.group("key"), + previous=fn, + ) + self.__selectors_cache__[self.location] = fn + + return self.__selectors_cache__[self.location] def mention(self, text: str, window: int = 0) -> str: return text[max(0, self.start - 
window) : min(len(text), self.end + window)] - def select(self, doc: Document) -> str: - return doc.text[self.start : self.end] + def select(self, doc: Document, left: int = 0, right: int = 0) -> str: + return self._make_selector()(doc)[self.start - left : self.end + right] @classmethod - def from_spec(cls, attribute_name: str, attribute_value: TaggerOutputValueType) -> "Span": + def from_spec(cls, attribute_name: str, attribute_value: TaggerOutputValueType) -> Self: if "__" in attribute_name: # bff tagger has different name exp_name, tgr_name, attr_type = attribute_name.split("__", 2) @@ -204,7 +193,7 @@ def from_spec(cls, attribute_name: str, attribute_value: TaggerOutputValueType) exp_name = tgr_name = attr_type = attribute_name start, end, score = attribute_value - return Span( + return cls( start=int(start), end=int(end), type=attr_type, @@ -214,19 +203,18 @@ def from_spec(cls, attribute_name: str, attribute_value: TaggerOutputValueType) ) def to_spec(self) -> Tuple[str, TaggerOutputValueType]: + from .utils import format_span_key, format_span_output + assert self.experiment is not None, "Experiment name must be set to convert to spec" assert self.tagger is not None, "Tagger name must be set to convert to spec" - return ( - f"{self.experiment}__{self.tagger}__{self.type}", - (self.start, self.end, self.score), - ) + return format_span_key(self.experiment, self.tagger, self), format_span_output(self) def __len__(self) -> int: return self.end - self.start @classmethod - def from_json(cls, di: Dict) -> "Span": - return Span(start=di["start"], end=di["end"], type=di["type"], score=di["score"]) + def from_json(cls, di: Dict) -> Self: + return cls(**{k: v for k, v in di.items() if k in cls.__slots__}) def to_json(self, text: Optional[str] = None, window: int = 0) -> dict: span_repr = {"start": self.start, "end": self.end, "type": self.type, "score": self.score} diff --git a/python/dolma/core/loggers.py b/python/dolma/core/loggers.py index f34ba864..f0ff05f3 100644 
--- a/python/dolma/core/loggers.py +++ b/python/dolma/core/loggers.py @@ -5,15 +5,20 @@ DOLMA_PREFIX = "dolma" -def get_logger(name: str) -> logging.Logger: +def get_logger(name: str, level: Union[int, str] = logging.WARN) -> logging.Logger: if (proc_name := multiprocessing.current_process().name) == "MainProcess": proc_name = "main" proc_name = proc_name.replace(" ", "_") + # set the log level + level = level if isinstance(level, int) else getattr(logging, level.strip().upper(), logging.WARN) + + # set name name = f"{proc_name}.dolma.{name}" logger = logging.getLogger(name) - logger.setLevel(logging.WARN) + logger.setLevel(level) + # add handler if not logger.handlers: handler = logging.StreamHandler() formatter = logging.Formatter( diff --git a/python/dolma/core/mp_tools.py b/python/dolma/core/mp_tools.py new file mode 100644 index 00000000..98c8faf0 --- /dev/null +++ b/python/dolma/core/mp_tools.py @@ -0,0 +1,130 @@ +import multiprocessing +import time +from contextlib import ExitStack +from multiprocessing.managers import SyncManager +from multiprocessing.pool import Pool +from queue import Queue +from typing import Any, Callable, Dict, Generic, Iterable, Optional, TypeVar, Union + +T = TypeVar("T") +R = TypeVar("R") + + +def get_manager(pool: Union[Pool, "PoolWithDebug"]) -> Union[SyncManager, "ManagerWithDebug"]: + if getattr(pool, "debug", False): + return ManagerWithDebug() + else: + return multiprocessing.Manager() + + +class ResultWithDebug(Generic[T]): + def __init__(self, result: T, *args, **kwargs): + self.result = result + + def get(self, timeout: Optional[float] = None) -> T: + return self.result + + def wait(self, timeout: Optional[float] = None) -> None: + time.sleep(timeout or 0) + + def successful(self) -> bool: + return True + + def ready(self) -> bool: + return True + + +class ManagerWithDebug: + def Queue(self): + return Queue() + + def shutdown(self) -> None: + pass + + +class PoolWithDebug: + """A wrapper around multiprocessing.Pool that 
allows for debugging (i.e., running without multiprocessing). + Supports creating a manager for shared memory objects (mock in case of debugging).""" + + def __init__( + self, + processes: Optional[int] = None, + initializer: Optional[Callable[..., Any]] = None, + initargs: Iterable[Any] = (), + maxtasksperchild: Optional[int] = None, + debug: bool = False, + ): + self.processes = processes + self.initializer = initializer + self.initargs = initargs + self.maxtasksperchild = maxtasksperchild + self.debug = debug + + # we are gonna keep track of resources in stack; but also keeping them indexed + # separately for easy access + self.stack = ExitStack() + self._manager: Optional[SyncManager] = None + self._pool: Optional[Pool] = None + + # let's make sure that the start method is spawn for best performance + try: + multiprocessing.set_start_method("spawn") + except RuntimeError: + assert multiprocessing.get_start_method() == "spawn", "Multiprocessing start method must be spawn" + + def __enter__(self): + if self._pool is None and not self.debug: + self._pool = self.stack.enter_context( + Pool( + processes=self.processes, + initializer=self.initializer, + initargs=self.initargs, + maxtasksperchild=self.maxtasksperchild, + ) + ) + return self + + def Manager(self): + if self._manager is None: + self._manager = ( + ManagerWithDebug() # pyright: ignore + if self.debug + else self.stack.enter_context(multiprocessing.Manager()) + ) + return self._manager + + def __exit__(self, *exc): + return self.stack.close() + + def apply_async( + self, + func: Callable[..., R], + args: Iterable[Any] = (), + kwds: Dict[str, Any] = {}, + callback: Optional[Callable[[R], Any]] = None, + error_callback: Optional[Callable[[Any], Any]] = None, + ): + if self._pool is None: + if self.initializer: + # run the initializer once by calling it with the initargs and then setting it to None + self.initializer(*self.initargs) + self.initializer = None + try: + resp = func(*args, **kwds) + if callback 
is not None: + callback(resp) + return ResultWithDebug(resp) + except Exception as e: + if error_callback is not None: + error_callback(e) + raise e + else: + return self._pool.apply_async( + func=func, args=args, kwds=kwds, callback=callback, error_callback=error_callback + ) + + def close(self): + return self._pool and self._pool.close() + + def join(self): + return self._pool and self._pool.join() diff --git a/python/dolma/core/mp_tools.pyi b/python/dolma/core/mp_tools.pyi new file mode 100644 index 00000000..30fc1f2e --- /dev/null +++ b/python/dolma/core/mp_tools.pyi @@ -0,0 +1,19 @@ +from collections.abc import Callable, Iterable +from multiprocessing.managers import SyncManager +from multiprocessing.pool import ApplyResult, Pool +from typing import Any + +class ResultWithDebug(ApplyResult): ... # noqa: E701,E302 +class ManagerWithDebug(SyncManager): ... # noqa: E701 + +class PoolWithDebug(Pool): # noqa: E302 + def __init__( # noqa: E704 + self, + processes: int | None = None, + initializer: Callable[..., Any] | None = None, + initargs: Iterable[Any] = (), + maxtasksperchild: int | None = None, + debug: bool = False, + ): ... + +def get_manager(pool: Pool) -> SyncManager: ... 
# noqa: E701, E704, E302 diff --git a/python/dolma/core/parallel.py b/python/dolma/core/parallel.py index 0bbfc75f..3503f4d3 100644 --- a/python/dolma/core/parallel.py +++ b/python/dolma/core/parallel.py @@ -1,34 +1,45 @@ -import inspect import itertools import logging import multiprocessing import pickle import random import re -import time -from contextlib import ExitStack from datetime import datetime from functools import partial from queue import Queue -from threading import Thread -from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypeVar, Union +from typing import ( + Any, + Dict, + List, + Literal, + NamedTuple, + Optional, + Tuple, + Type, + TypeVar, + Union, +) +import backoff import smart_open -import tqdm +from backoff.types import Details from typing_extensions import TypeAlias from .errors import DolmaError, DolmaRetryableFailure from .loggers import get_logger +from .mp_tools import PoolWithDebug, get_manager from .paths import ( add_suffix, + exists, glob_path, join_path, make_relative, mkdir_p, parent, split_path, - sub_prefix, ) +from .progressbar import BaseProgressBar +from .utils import batch_iterator METADATA_SUFFIX = ".done.txt" @@ -45,9 +56,28 @@ class AllPathsTuple(NamedTuple): kwargs: List[KwargsType] @classmethod - def empty(cls) -> "AllPathsTuple": + def new(cls) -> "AllPathsTuple": return AllPathsTuple([], [], [], []) + def __len__(self) -> int: + return len(self.src) + + @property + def empty(self) -> bool: + return len(self.src) == 0 + + def partition(self, k: int = 1) -> List["AllPathsTuple"]: + """Partition the paths into k / n slices containing k files each.""" + return [ + AllPathsTuple( + src=self.src[i : i + k], + dst=self.dst[i : i + k], + meta=self.meta[i : i + k], + kwargs=self.kwargs[i : i + k], + ) + for i in range(0, len(self.src), k) + ] + class BaseParallelProcessor: """A base parallel processor that supports applying the same process_single method to a list of files. 
@@ -60,6 +90,8 @@ class BaseParallelProcessor: See documentation of both methods for more details on how to implement them correctly. """ + PROGRESS_BAR_CLS: Type[BaseProgressBar] + def __init__( self, source_prefix: Union[str, List[str]], @@ -70,11 +102,18 @@ def __init__( seed: int = 0, pbar_timeout: float = 1e-3, ignore_existing: bool = False, + skip_source_glob: bool = False, + shuffle_src_paths: bool = True, include_paths: Optional[List[str]] = None, exclude_paths: Optional[List[str]] = None, files_regex_pattern: Optional[str] = None, - retries_on_error: int = 0, + batch_size: int = 1, process_single_kwargs: Union[None, KwargsType, List[KwargsType]] = None, + backoff_max_time: Optional[float] = None, + backoff_max_tries: int = 1, + retries_on_error: Optional[int] = None, + backoff_exceptions: Optional[Union[Type[Exception], Tuple[Type[Exception], ...]]] = None, + progress_bar_mode: Literal["tqdm", "logger"] = "tqdm", ): """Initialize the parallel processor. @@ -95,22 +134,31 @@ def __init__( seed (int, optional): The random seed to use when shuffling input files. Defaults to 0. pbar_timeout (float, optional): How often to update progress bars in seconds. Defaults to 0.01 seconds. + skip_source_glob (bool, optional): Do not glob source files. Off by default. ignore_existing (bool, optional): Whether to ignore files that have been already processed and re-run the processor on all files from scratch. Defaults to False. - include_paths (Optional[List[str]], optional): A list of paths to include. If provided, only files + shuffle_src_paths (bool, optional): Whether to shuffle the source paths before processing them. + Defaults to True. + include_paths (List[str], optional): A list of paths to include. If provided, only files that match one of the paths will be processed. Defaults to None. - exclude_paths (Optional[List[str]], optional): A list of paths to exclude. If provided, files that + exclude_paths (List[str], optional): A list of paths to exclude. 
If provided, files that match one of the paths will be skipped. Defaults to None. - files_regex_pattern (Optional[str], optional): A regex pattern to match files. If provided, only + files_regex_pattern (str, optional): A regex pattern to match files. If provided, only files that match the pattern will be processed. Defaults to None. - retries_on_error (int, optional): The number of retries to attempt if an error occurs. - Defaults to 0. - process_single_kwargs (Union[None, KwargsType, List[KwargsType]], optional): Additional kwargs to + batch_size: (int, optional): number of files to group in a single bat + process_single_kwargs (Union[None, KwargsType, List[KwargsType], optional): Additional kwargs to pass to the process_single method. If a single dict is provided, it will be used for all source prefixes. If a list of dicts is provided, each dict will be used for the corresponding source. By default, no additional kwargs are passed. + backoff_max_time (float, optional): The maximum time to backoff. Defaults to None. + backoff_max_tries (int, optional): The maximum number of tries to backoff. Defaults to 1. + backoff_exceptions (Union[Type[Exception], Tuple[Type[Exception], ...]], optional): The + exceptions to backoff on. Defaults to `dolma.core.errors.DolmaRetryableFailure`. + retries_on_error (int, optional): Deprecated. The number of retries to attempt on error. + Defaults to None. + progress_bar_mode ("tqdm" or "logger", optional): The mode to use for the progress bar. + Defaults to "tqdm". 
""" - self.src_prefixes = [source_prefix] if isinstance(source_prefix, str) else source_prefix self.dst_prefixes = [destination_prefix] if isinstance(destination_prefix, str) else destination_prefix self.meta_prefixes = [metadata_prefix] if isinstance(metadata_prefix, str) else metadata_prefix @@ -119,11 +167,36 @@ def __init__( self.seed = seed self.pbar_timeout = pbar_timeout self.ignore_existing = ignore_existing + self.progress_bar_mode = progress_bar_mode + + self.logger = self.get_logger() self.include_paths = set(include_paths) if include_paths is not None else None self.exclude_paths = set(exclude_paths) if exclude_paths is not None else None self.files_regex_pattern = re.compile(files_regex_pattern) if files_regex_pattern else None - self.retries_on_error = retries_on_error + self.shuffle_src_paths = shuffle_src_paths + + # this manages how many files to pass to a single processor + self.batch_size = batch_size + + if retries_on_error is not None: + self.logger.warning( + "The `retries_on_error` parameter is deprecated and will be removed in a future release. " + "Please use `backoff_max_tries` instead." + ) + backoff_max_tries = retries_on_error + 1 + + # this controls backoff + self.backoff_max_time: float = float(backoff_max_time or "inf") + self.backoff_max_tries: int = int(backoff_max_tries) + self.backoff_exceptions: Tuple[Type[Exception], ...] 
= ( + (backoff_exceptions,) + if isinstance(backoff_exceptions, type) + else backoff_exceptions or (DolmaRetryableFailure,) + ) + + if progress_bar_mode not in ("tqdm", "logger"): + raise ValueError("Progress bar mode must be either 'tqdm' or 'logger'") # this are additional kwargs to pass to the process_single method process_single_kwargs = process_single_kwargs or {} @@ -132,23 +205,8 @@ def __init__( else: self.process_single_kwargs = process_single_kwargs - # checking that the increment_progressbar method is subclassed correctly - sig = inspect.signature(self.increment_progressbar) - if "queue" not in sig.parameters or sig.parameters["queue"].kind != inspect.Parameter.POSITIONAL_ONLY: - raise AttributeError( - "increment_progressbar must have a positional-only argument named 'queue'; " - "Check that you have subclassed BaseParallelProcessor correctly!" - ) - if "kwargs" in sig.parameters and sig.parameters["kwargs"].kind == inspect.Parameter.VAR_KEYWORD: - raise AttributeError( - "increment_progressbar must not have a **kwargs argument; " - "Check that you have subclassed BaseParallelProcessor correctly!" - ) - if any(p.name != "queue" and p.default != 0 for p in sig.parameters.values()): - raise AttributeError( - "increment_progressbar must have a default value of 0 for all arguments except 'queue'; " - "Check that you have subclassed BaseParallelProcessor correctly!" 
- ) + if not hasattr(self, "PROGRESS_BAR_CLS"): + self.PROGRESS_BAR_CLS = BaseProgressBar.from_increment_function(self) if len(self.src_prefixes) != len(self.dst_prefixes): raise ValueError( @@ -169,13 +227,75 @@ def __init__( if len(self.src_prefixes) == 0: raise ValueError("At least one source prefix must be provided.") + self.skip_source_glob = skip_source_glob + if any("*" in p for p in itertools.chain(self.dst_prefixes, self.meta_prefixes)): raise ValueError("Destination and metadata prefixes cannot contain wildcards.") + if not hasattr(self, "PROGRESS_BAR_CLS"): + raise AttributeError("BaseParallelProcessor subclasses must define the PROGRESS_BAR_CLS attribute.") + + def __add__(self: BPP, other: BPP) -> BPP: + """Combine two parallel processors into one.""" + if not type(self) is type(other): + raise TypeError(f"Cannot add {type(self)} and {type(other)}") + + # we try combining the two list of include paths; if they are both None, then set the combo back to none + include_paths: Union[List[str], None] = [*(self.include_paths or []), *(other.include_paths or [])] + include_paths = sorted(set(include_paths or [])) if len(include_paths or []) else None + + # do the same for exclude paths + exclude_paths: Union[List[str], None] = [*(self.exclude_paths or []), *(other.exclude_paths or [])] + exclude_paths = sorted(set(exclude_paths or [])) if len(exclude_paths or []) else None + + # for the regex, do a simple or if both are set + regex_pattern: Union[str, None] = None + if self.files_regex_pattern and other.files_regex_pattern: + regex_pattern = "(" + self.files_regex_pattern.pattern + "|" + other.files_regex_pattern.pattern + ")" + elif self.files_regex_pattern: + regex_pattern = self.files_regex_pattern.pattern + elif other.files_regex_pattern: + regex_pattern = other.files_regex_pattern.pattern + + return type(self)( + source_prefix=[*self.src_prefixes, *other.src_prefixes], + destination_prefix=[*self.dst_prefixes, *other.dst_prefixes], + 
metadata_prefix=[*self.meta_prefixes, *other.meta_prefixes], + num_processes=max(self.num_processes, other.num_processes), + debug=self.debug or other.debug, + seed=self.seed, + pbar_timeout=max(self.pbar_timeout, other.pbar_timeout), + ignore_existing=self.ignore_existing or other.ignore_existing, + include_paths=include_paths, + exclude_paths=exclude_paths, + files_regex_pattern=regex_pattern, + batch_size=max(self.batch_size, other.batch_size), + process_single_kwargs=[*self.process_single_kwargs, *other.process_single_kwargs], + backoff_max_time=min(self.backoff_max_time, other.backoff_max_time), + backoff_max_tries=min(self.backoff_max_tries, other.backoff_max_tries), + backoff_exceptions=tuple(set(self.backoff_exceptions + other.backoff_exceptions)), + ) + + def __radd__(self: BPP, other: BPP) -> BPP: + """Combine two parallel processors into one.""" + return other.__add__(self) + @classmethod def get_logger(cls) -> logging.Logger: """Get the logger for the class.""" - return get_logger(cls.__name__) + return get_logger(cls.__name__, "info") + + @classmethod + def process_batch( + cls, + source_paths: List[str], + destination_paths: List[str], + queue: QueueType, + kwargs: List[Dict[str, Any]], + ): + """Process multiple files. 
Naively calls process_single for each file, but can be overridden.""" + for src_path, dst_path, single_kwargs in zip(source_paths, destination_paths, kwargs): + cls.process_single(source_path=src_path, destination_path=dst_path, queue=queue, **single_kwargs) @classmethod def process_single( @@ -199,36 +319,61 @@ def process_single( raise NotImplementedError() @classmethod - def _process_single_and_save_status( + def _log_backoff(cls, details: Details): + """Log backoff details.""" + message = ( + f"Backing off `{details['target'].__name__}` " + f"after {details['tries']:,} " + f"tries (wait: {details.get('wait', 0.0):.2f}s)" + ) + if ex := details.get("exception"): + # add details about the exception to the message + import traceback # pylint: disable=import-outside-toplevel + + message += " due to " + "".join(traceback.format_exception_only(type(ex), ex)).strip() + + cls.get_logger().warning(message) + + @classmethod + def _process_batch_and_save_status( cls, - source_path: str, - destination_path: str, - metadata_path: str, + source_paths: List[str], + destination_paths: List[str], + metadata_paths: List[str], queue: QueueType, - serialized_kwargs: bytes, + serialized_kwargs: List[bytes], + backoff_max_time: float, + backoff_max_tries: int, + backoff_exceptions: Tuple[Type[Exception], ...], ): """A wrapper around process single that saves a metadata file if processing is successful.""" # make destination directory if it doesn't exist for the destination and metadata paths - mkdir_p(parent(destination_path)) - mkdir_p(parent(metadata_path)) - - kwargs = pickle.loads(serialized_kwargs) - retries_on_error = kwargs.get("retries_on_error", 0) + 1 - while True: - try: - cls.process_single( - source_path=source_path, destination_path=destination_path, queue=queue, **kwargs - ) - break - except DolmaRetryableFailure as exception: - retries_on_error -= 1 - if retries_on_error == 0: - raise DolmaError from exception + for path in itertools.chain(destination_paths, 
metadata_paths): + mkdir_p(parent(path)) + + # we unpickle the serialized kwargs + deserialized_kwargs = [pickle.loads(kw) for kw in serialized_kwargs] + + # use backoff library to retry on failure; function _log_backoff is called on backoff + # to inform the user of the backoff details. + fn_with_backoff = backoff.on_exception( + backoff.expo, + exception=backoff_exceptions, + max_tries=backoff_max_tries, + max_time=backoff_max_time, + on_backoff=cls._log_backoff, + )(cls.process_batch) + + # start processing the file here + fn_with_backoff( + source_paths=source_paths, destination_paths=destination_paths, queue=queue, kwargs=deserialized_kwargs + ) - # write the metadata file - with smart_open.open(metadata_path, "wt") as f: - f.write(datetime.now().isoformat()) + # write the metadata files + for path in metadata_paths: + with smart_open.open(path, "wt") as f: + f.write(datetime.now().isoformat()) @classmethod def increment_progressbar(cls, queue: QueueType, /, **kwargs: int) -> Dict[str, int]: @@ -247,126 +392,7 @@ def increment_progressbar(self, queue, /, files = 0, documents = 0): # we use queue.put(tuple(kwargs.get(k, 0) for k in kwargs)) return kwargs - @classmethod - def _run_threaded_progressbar( - cls, - queue: QueueType, - timeout: float, - ): - """Run a progress bar in a separate thread. - - Args: - queue (QueueType): The queue to increment the progress bars. - timeout (float): How often to update the progress bars in seconds. 
- """ - - sample_queue_output = cls.increment_progressbar(queue) - - with ExitStack() as stack: - pbars = [ - stack.enter_context( - tqdm.tqdm(desc=str(k), unit=str(k)[:1], position=i, unit_scale=True) # pyright: ignore - ) - for i, k in enumerate(sample_queue_output) - ] - - while True: - item = queue.get() - if item is None: - break - - for pbar, value in zip(pbars, item): - pbar.update(value) - - time.sleep(timeout) - - def _debug_run_all( - self, - all_source_paths: List[str], - all_destination_paths: List[str], - all_metadata_paths: List[str], - all_process_kwargs: Union[List[KwargsType], None] = None, - **process_single_kwargs: Any, - ): - """Run files one by one on the main process - - Args: - all_source_paths (List[MultiPath]): The list of source paths to process. - all_destination_paths (List[MultiPath]): The list of destination paths to save. - all_metadata_paths (List[MultiPath]): The locations where to save metadata. - all_process_kwargs (Union[List[KwargsType], None]): Additional kwargs to pass to the process_single - """ - - arguments_iterator = zip( - # source paths - all_source_paths, - # destination paths - all_destination_paths, - # this is where we save the metadata to keep track of which files have been processed - all_metadata_paths, - # additional kwargs to pass to the process_single; if not provided, we use an empty dict - # will be merged with the process_single_kwargs - all_process_kwargs or [{} for _ in all_source_paths], - ) - pbar_queue: QueueType = Queue() - thread = Thread(target=self._run_threaded_progressbar, args=(pbar_queue, self.pbar_timeout), daemon=True) - thread.start() - - for source_path, destination_path, metadata_path, process_kwargs in arguments_iterator: - self._process_single_and_save_status( - source_path=source_path, - destination_path=destination_path, - metadata_path=metadata_path, - queue=pbar_queue, - serialized_kwargs=pickle.dumps({**process_kwargs, **process_single_kwargs}), - ) - - pbar_queue.put(None) - 
thread.join() - - def __add__(self: BPP, other: BPP) -> BPP: - """Combine two parallel processors into one.""" - if not type(self) is type(other): - raise TypeError(f"Cannot add {type(self)} and {type(other)}") - - # we try combining the two list of include paths; if they are both None, then set the combo back to none - include_paths: Union[List[str], None] = [*(self.include_paths or []), *(other.include_paths or [])] - include_paths = sorted(set(include_paths or [])) if len(include_paths or []) else None - - # do the same for exclude paths - exclude_paths: Union[List[str], None] = [*(self.exclude_paths or []), *(other.exclude_paths or [])] - exclude_paths = sorted(set(exclude_paths or [])) if len(exclude_paths or []) else None - - # for the regex, do a simple or if both are set - regex_pattern: Union[str, None] = None - if self.files_regex_pattern and other.files_regex_pattern: - regex_pattern = "(" + self.files_regex_pattern.pattern + "|" + other.files_regex_pattern.pattern + ")" - elif self.files_regex_pattern: - regex_pattern = self.files_regex_pattern.pattern - elif other.files_regex_pattern: - regex_pattern = other.files_regex_pattern.pattern - - return type(self)( - source_prefix=[*self.src_prefixes, *other.src_prefixes], - destination_prefix=[*self.dst_prefixes, *other.dst_prefixes], - metadata_prefix=[*self.meta_prefixes, *other.meta_prefixes], - num_processes=max(self.num_processes, other.num_processes), - debug=self.debug or other.debug, - seed=self.seed, - pbar_timeout=max(self.pbar_timeout, other.pbar_timeout), - ignore_existing=self.ignore_existing or other.ignore_existing, - include_paths=include_paths, - exclude_paths=exclude_paths, - files_regex_pattern=regex_pattern, - retries_on_error=max(self.retries_on_error, other.retries_on_error), - process_single_kwargs=[*self.process_single_kwargs, *other.process_single_kwargs], - ) - - def __radd__(self: BPP, other: BPP) -> BPP: - """Combine two parallel processors into one.""" - return 
other.__add__(self) - - def _multiprocessing_run_all( + def _run_all( self, all_source_paths: List[str], all_destination_paths: List[str], @@ -389,47 +415,54 @@ def _multiprocessing_run_all( all_process_kwargs = all_process_kwargs or [{} for _ in all_source_paths] - arguments_iterator = zip( - # source paths - all_source_paths, - # destination paths - all_destination_paths, - # this is where we save the metadata to keep track of which files have been processed - all_metadata_paths, - # additional kwargs to pass to the process_single; if not provided, we use an empty dict - # will be merged with the process_single_kwargs - all_process_kwargs, + batches = list( + batch_iterator( + # source paths + all_source_paths, + # destination paths + all_destination_paths, + # this is where we save the metadata to keep track of which files have been processed + all_metadata_paths, + # additional kwargs to pass to the process_single; if not provided, we use an empty dict + # will be merged with the process_single_kwargs + all_process_kwargs, + # batch size is equal to 1 by default + batch_size=self.batch_size, + ) ) + self.logger.info("Processing in %s batches", len(batches)) - # no need to be wasteful with processes: we only need as many cores a the minimum of the number of - # source paths, destination paths, metadata paths, and process kwargs. 
- num_processes = min( - self.num_processes, - len(all_source_paths), - len(all_destination_paths), - len(all_metadata_paths), - len(all_process_kwargs), - ) + # no need to be wasteful with processes: we only need as many cores as the number of batches + num_processes = min(self.num_processes, len(batches)) + self.logger.info("Using %s processes", num_processes) - with multiprocessing.Pool(processes=num_processes) as pool: - pbar_queue: QueueType = (manager := multiprocessing.Manager()).Queue() - thread = Thread( - target=self._run_threaded_progressbar, args=(pbar_queue, self.pbar_timeout), daemon=True + with PoolWithDebug(processes=num_processes, debug=self.debug) as pool: + pbar_queue: QueueType = (manager := get_manager(pool)).Queue() + pbar = self.PROGRESS_BAR_CLS( + queue=pbar_queue, min_time=self.pbar_timeout, server=self.progress_bar_mode ) - thread.start() + pbar.start() process_single_fn = partial(self.process_single, queue=pbar_queue) results = [] - for source_path, destination_path, metadata_path, process_kwargs in arguments_iterator: + for source_paths, destination_paths, metadata_paths, process_kwargs in batches: + # we need to merge the process_single_kwargs with the additional kwargs + # mypy is confused by the type of process_kwargs; we need to ignore the error + serialized_kwargs = [ + pickle.dumps({**kw, **process_single_kwargs}) for kw in process_kwargs # type: ignore + ] + process_single_fn = partial( - self._process_single_and_save_status, + self._process_batch_and_save_status, queue=pbar_queue, - source_path=source_path, - destination_path=destination_path, - metadata_path=metadata_path, - # we need to merge the process_single_kwargs with the additional kwargs - serialized_kwargs=pickle.dumps({**process_kwargs, **process_single_kwargs}), + source_paths=source_paths, # pyright: ignore + destination_paths=destination_paths, # pyright: ignore + metadata_paths=metadata_paths, # pyright: ignore + serialized_kwargs=serialized_kwargs, + 
backoff_max_time=self.backoff_max_time, + backoff_max_tries=self.backoff_max_tries, + backoff_exceptions=self.backoff_exceptions, ) result = pool.apply_async(process_single_fn) results.append(result) @@ -439,9 +472,7 @@ def _multiprocessing_run_all( pool.close() pool.join() - - pbar_queue.put(None) - thread.join() + pbar.stop() manager.shutdown() def _valid_path(self, path: str) -> bool: @@ -453,14 +484,17 @@ def _valid_path(self, path: str) -> bool: return False return True - def _get_all_paths(self) -> AllPathsTuple: + def _get_all_paths(self) -> Tuple[AllPathsTuple, bool]: """Get all paths to process using prefixes provided""" - all_paths = AllPathsTuple.empty() + all_paths = AllPathsTuple.new() + + # get a list of which metadata files already exist + some_already_processed = False for src_prefix, dst_prefix, meta_prefix, kwargs_prefix in zip( self.src_prefixes, self.dst_prefixes, self.meta_prefixes, self.process_single_kwargs ): - current_source_prefixes = sorted(glob_path(src_prefix)) + current_source_prefixes = sorted([src_prefix] if self.skip_source_glob else glob_path(src_prefix)) if len(current_source_prefixes) > 1: # make relative only makes sense if there is more than one path; otherwise, it's unclear @@ -474,45 +508,46 @@ def _get_all_paths(self) -> AllPathsTuple: else: raise ValueError(f"Could not find any files matching {src_prefix}") - # shuffle the order of the files so time estimation in progress bars is more accurate - random.shuffle(rel_paths) - - # get a list of which metadata files already exist - existing_metadata_names = set( - re.sub(rf"{METADATA_SUFFIX}$", "", sub_prefix(path, meta_prefix)) - for path in glob_path(meta_prefix) - ) + if self.shuffle_src_paths: + # shuffle the order of the files so time estimation in progress bars is more accurate + random.shuffle(rel_paths) for path in rel_paths: - if not self.ignore_existing and path in existing_metadata_names: - continue + metadata_path = add_suffix(meta_prefix, path) + METADATA_SUFFIX if 
not self._valid_path(path): + # invalid path; skip + continue + + if not self.ignore_existing and exists(metadata_path): + # metadata file exists, which indicates that the file has already been processed + some_already_processed = True continue # create new paths to pass to taggers all_paths.src.append(add_suffix(prefix, path)) all_paths.dst.append(add_suffix(dst_prefix, path)) - all_paths.meta.append(add_suffix(meta_prefix, path) + METADATA_SUFFIX) + all_paths.meta.append(metadata_path) all_paths.kwargs.append(kwargs_prefix or {}) - return all_paths + return all_paths, some_already_processed def __call__(self, **process_single_kwargs: Any): """Run the processor.""" random.seed(self.seed) - # in case the user wants to override the default kwargs for retries - process_single_kwargs.setdefault("retries_on_error", self.retries_on_error) - - all_paths = self._get_all_paths() + all_paths, some_already_processed = self._get_all_paths() + self.logger.info("Found %s files to process", len(all_paths.src)) - print(f"Found {len(all_paths.src):,} files to process") - - fn = self._debug_run_all if self.debug else self._multiprocessing_run_all + if all_paths.empty: + if some_already_processed: + self.logger.info("All files already processed; skipping.") + return + else: + raise DolmaError("No files found to process.") - fn( + self._run_all( all_source_paths=all_paths.src, all_destination_paths=all_paths.dst, all_metadata_paths=all_paths.meta, diff --git a/python/dolma/core/paths.py b/python/dolma/core/paths.py index ba597e13..8274071c 100644 --- a/python/dolma/core/paths.py +++ b/python/dolma/core/paths.py @@ -1,7 +1,7 @@ import glob import os import re -from functools import partial +from functools import partial, reduce from hashlib import sha256 from itertools import chain from pathlib import Path @@ -223,9 +223,14 @@ def glob_path( protocol, parsed_path = _pathify(path) fs = _get_fs(path) - if fs.isdir(path) and autoglob_dirs: + if autoglob_dirs and fs.isdir(path): path = 
join_path(protocol, _unescape_glob(parsed_path), "*") + if "*" not in str(path): + # nothing to glob + yield str(path) + return + for gl in fs.glob(path): gl = str(gl) @@ -493,6 +498,10 @@ def decompress_path(path: str, dest: Optional[str] = None) -> str: the original path will be returned. """ for supported_ext in get_supported_extensions(): + + # make type checking happy + assert isinstance(supported_ext, str), "Internal error: supported_ext is not a string" + # not the supported extension if not path.endswith(supported_ext): continue @@ -537,3 +546,25 @@ def split_ext(path: str) -> Tuple[str, Tuple[str, ...], str]: extensions.append(ext) return prot, (*parts[:-1], filename), "".join(reversed(extensions)) + + +def get_unified_path(paths: List[str]) -> str: + """Get a unified path for a list of paths.""" + + if len(paths) == 1: + # if there is only one path, we don't need to unify anything + return paths[0] + + # get shared root for all paths; we will put the unified path here + root, relative = make_relative(paths) + + # get the extension from the first path; assume all paths have the same extension + _, _, ext = split_ext(relative[0]) + + # hash all the sorted relative paths in order to get a unique name + # the type: ignore is needed because mypy fails to infer the type of the lambda + # (the "or" ensures that the lambda returns the same type as the first argument, which is a hash) + h = reduce(lambda h, p: h.update(p.encode()) or h, sorted(relative), sha256()) # type: ignore + + # return the unified path + return join_path(root, h.hexdigest() + ext) diff --git a/python/dolma/core/profile.py b/python/dolma/core/profile.py new file mode 100644 index 00000000..0af8aa9f --- /dev/null +++ b/python/dolma/core/profile.py @@ -0,0 +1,36 @@ +import cProfile +import io +import pstats +from contextlib import ExitStack, contextmanager +from typing import Generator, Optional + +import smart_open + +from .loggers import get_logger + + +@contextmanager +def profiler( + 
output: Optional[str] = None, sort_key: str = "tottime", lines: int = 100, human_readable: bool = True +) -> Generator[None, None, None]: + logger = get_logger("profiler", "info") + + profile = cProfile.Profile() + logger.info("Starting profiler...") + profile.enable() + yield + profile.disable() + logger.info("Profiler stopped.") + + if not human_readable and output is not None: + logger.info("Dumping profiler stats in binary format to %s...", output) + profile.dump_stats(output) + return + + with ExitStack() as stack: + logger.info("Printing profiler stats %s...", f"to {output}" if output is not None else "to stdout") + output_stream = io.StringIO() if output is None else stack.enter_context(smart_open.open(output, "w")) + ps = pstats.Stats(profile, stream=output_stream).sort_stats(sort_key) + ps.print_stats(lines) + + logger.info("Done printing profiler stats.") diff --git a/python/dolma/core/progressbar.py b/python/dolma/core/progressbar.py new file mode 100644 index 00000000..283d4b01 --- /dev/null +++ b/python/dolma/core/progressbar.py @@ -0,0 +1,330 @@ +import multiprocessing +import time +import warnings +from contextlib import ExitStack +from enum import Enum +from functools import reduce +from hashlib import sha1 +from inspect import Parameter +from inspect import signature as get_signature +from queue import Queue +from threading import Thread +from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Type + +import tqdm +from rich.progress import filesize +from typing_extensions import TypeAlias, Union + +from .loggers import get_logger + +if TYPE_CHECKING: + from .parallel import BaseParallelProcessor + + +QueueType: TypeAlias = "Queue[Union[None, Tuple[int, ...]]]" + + +class ServerType(Enum): + tqdm = "tqdm" + logger = "logger" + null = "null" + + +class BaseProgressBar: + """One or more progress bars that track progress of a process. + + This class is meant to be subclassed. The subclass must provide one or more attributes of type int, e.g. 
+ + ```python + class MyProgressBar(BaseProgressBar): + files: int = 0 + documents: int = 0 + ``` + + This class can be used for both adding and running through the progress bars. To start: + + ```python + queue = Queue() + pb = MyProgressBar(queue) + pb.start() + + ... # do some work + + pb.stop() + ``` + + It can also be used in a multiprocessing context: + + ```python + with Pool(processes=4) as pool: + queue = multiprocessing.Manager().Queue() + pb = MyProgressBar(queue) + pb.start() + + ... # do some work + + pool.close() + pool.join() + pb.stop() + ``` + + If you want to use this class to update a queue: + + ```python + pb = MyProgressBar(queue) + pb.files += 1 + pb.documents += 100 + ``` + """ + + def __init__( + self, + queue: QueueType, + min_step: int = 1, + min_time: float = 1e-3, + server: Union[ServerType, str] = "null", + ): + """ + Initialize the ProgressBar object. + + Args: + queue (QueueType): The queue object to track progress. + min_step (int, optional): The minimum step size for progress updates. Defaults to 1. + min_time (float, optional): The minimum time interval between progress updates. Defaults to 1e-3. + server (Union[ServerType, str], optional): Server thread type, "tqdm" or "logger"; "null" (default) runs none. 
+ """ + self._logger = get_logger(self.__class__.__name__, "warn") + self._queue = queue + self._last_update_time = time.time() + self._last_update_step = 0 + + self._update_every_seconds = min_time + self._update_every_steps = min_step + + for field in self.fields(): + setattr(self, field, 0) + + server_mode = ServerType[server] if isinstance(server, str) else server + if server_mode == ServerType.tqdm: + self._thread: Optional[Thread] = Thread( + target=self._run_tqdm, + kwargs={"queue": queue, "update_every_seconds": min_time, "fields": self.fields()}, + daemon=True, + ) + elif server_mode == ServerType.logger: + self._thread = Thread( + target=self._run_logger, + kwargs={"queue": queue, "update_every_seconds": min_time, "fields": self.fields()}, + daemon=True, + ) + else: + self._thread = None + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}(" + f"{', '.join(f'{k}={getattr(self, k)}' for k in self.fields())};" + f" min_step={self._update_every_steps}, min_time={self._update_every_seconds})" + ")" + ) + + def __str__(self) -> str: + return self.__repr__() + + def __setattr__(self, name: str, value: Any) -> None: + super().__setattr__(name, value) + if name in self.fields() and value > 0: + self.update() + + @classmethod + def from_increment_function(cls, processor: "BaseParallelProcessor") -> "Type[BaseProgressBar]": + # print deprecation warning + msg = ( + "Deriving progress bar from `increment_progressbar` is deprecated; add a `PROGRESS_BAR_CLS` " + f"attribute to {type(processor).__name__} instead." 
+ ) + warnings.warn(msg, category=DeprecationWarning, stacklevel=2) + + # checking that the increment_progressbar method is subclassed correctly + sig = get_signature(processor.increment_progressbar) + if "queue" not in sig.parameters or sig.parameters["queue"].kind != Parameter.POSITIONAL_ONLY: + raise AttributeError( + "increment_progressbar must have a positional-only argument named 'queue'; " + "Check that you have subclassed BaseParallelProcessor correctly!" + ) + if "kwargs" in sig.parameters and sig.parameters["kwargs"].kind == Parameter.VAR_KEYWORD: + raise AttributeError( + "increment_progressbar must not have a **kwargs argument; " + "Check that you have subclassed BaseParallelProcessor correctly!" + ) + if any(p.name != "queue" and p.default != 0 for p in sig.parameters.values()): + raise AttributeError( + "increment_progressbar must have a default value of 0 for all arguments except 'queue'; " + "Check that you have subclassed BaseParallelProcessor correctly!" + ) + params = [k for k, p in sig.parameters.items() if k != "queue" and p.kind != Parameter.empty] + h = reduce(lambda h, e: h.update(e.encode()) or h, params, sha1()).hexdigest() # type: ignore + + # create a new class + cls_dict = {"__annotations__": {k: int for k in params}, **{p: 0 for p in params}} + new_cls = type(f"{cls.__name__}{h[-6:]}", (cls,), cls_dict) + return new_cls + + @classmethod + def fields(cls) -> Tuple[str, ...]: + """ + Returns a tuple of field names in the class that are of type int. + + Raises: + ValueError: If the class does not have at least one field of type int. + + Returns: + Tuple[str, ...]: A tuple of field names. 
+ """ + fields: Optional[Tuple[str, ...]] = cls.__dict__.get("__fields__") + + if fields is None: + fields = tuple(n for n, t in getattr(cls, "__annotations__", {}).items() if issubclass(t, int)) + setattr(cls, "__fields__", fields) + + if len(fields) == 0: + raise ValueError(f"Class {cls.__name__} must have at least one field of type int.") + + return fields + + @classmethod + def parse(cls, values: Optional[Tuple[int, ...]]) -> Dict[str, int]: + """ + Parses the value from the queue and returns a dictionary mapping field names to their corresponding values. + + Args: + values (Optional[Tuple[int, ...]]): The values to be parsed for the queue. + + Returns: + Dict[str, int]: A dictionary mapping field names to their corresponding values. + """ + if not values: + return {k: 0 for k in cls.fields()} + return {k: v for k, v in zip(cls.fields(), values)} + + def _update(self): + # get the current values + update = tuple(getattr(self, k, 0) for k in self.fields()) + + # time to do an update + self._queue.put_nowait(update) + + # reset the steps + self._last_update_step = 0 + self._last_update_time = time.time() + + # reset the steps + for k in self.fields(): + setattr(self, k, 0) + + def update(self): + # update the number of steps since the last update + self._last_update_step += 1 + + if self._update_every_steps > self._last_update_step: + return + + time_before_update = self._last_update_time + self._update() + + # check if we wanna update frequency based on steps + if self._queue.qsize() >= multiprocessing.cpu_count(): + self._update_every_steps *= 2 + return + + # check if we wanna update frequency based on time + if (self._last_update_time - time_before_update) < self._update_every_seconds: + self._update_every_steps *= 2 + return + + @staticmethod + def _run_tqdm(queue: QueueType, update_every_seconds: float, fields: Tuple[str, ...]): + """ + Runs the progress bar. + + This method initializes and updates the progress bars based on the items in the queue. 
+ It continuously retrieves items from the queue and updates the progress bars accordingly. + The method exits when a `None` item is retrieved from the queue. + + Returns: + None + """ + with ExitStack() as stack: + pbars = [ + stack.enter_context(tqdm.tqdm(desc=k, unit=k[:1], position=i, unit_scale=True)) # pyright: ignore + for i, k in enumerate(fields) + ] + + while True: + # loop until we get a None + item = queue.get() + if item is None: + break + + for pbar, value in zip(pbars, item): + pbar.update(value) + + time.sleep(update_every_seconds) + + @staticmethod + def _run_logger(queue: QueueType, update_every_seconds: float, fields: Tuple[str, ...]): + """ + Run the progress bar update loop. + + Args: + queue (QueueType): The queue to retrieve items from. + update_every_seconds (float): The interval between each update in seconds. + fields (Tuple[str, ...]): The fields to track and display in the progress bar. + + Returns: + None + """ + total_counters = {k: 0 for k in fields} + logger = get_logger("progress", "info") + + while True: + # loop until we get a None + item = queue.get() + if item is None: + break + + messages = [] + for k, v in zip(fields, item): + total_counters[k] += v + unit, suffix = filesize.pick_unit_and_suffix( + total_counters[k], ["", "K", "M", "G", "T", "P", "E", "Z", "Y"], 1000 + ) + precision = 1 if suffix else 0 + messages.append(f"{k}: {total_counters[k] / unit:,.{precision}f}{suffix} (+{v:,})") + + logger.info(", ".join(messages)) + time.sleep(update_every_seconds) + + def start(self): + """Run the progress bar in a separate thread.""" + if self._thread: + self._thread.start() + + def stop(self): + """Stop the progress bar. + + This method stops the progress bar by adding a `None` item to the queue and joining the thread. 
+ """ + self._update() + + if self._thread is not None: + self._queue.put(None) + time.sleep(self._update_every_seconds * 2) + self._thread.join() + + def __enter__(self): + self.start() + return self + + def __exit__(self, *args): + self.stop() diff --git a/python/dolma/core/runtime.py b/python/dolma/core/runtime.py index ac5e2a23..83c19c00 100644 --- a/python/dolma/core/runtime.py +++ b/python/dolma/core/runtime.py @@ -1,5 +1,4 @@ -import io -import multiprocessing +# import multiprocessing import tempfile from contextlib import ExitStack, contextmanager from typing import ( @@ -17,8 +16,7 @@ import msgspec import smart_open - -from dolma.core.taggers import BaseTaggerWithMetadata +from smart_open.compression import get_supported_compression_types from .data_types import ( InputSpec, @@ -29,7 +27,10 @@ from .errors import DolmaFatalError, DolmaRetryableFailure, DolmaShardError from .parallel import BaseParallelProcessor, QueueType from .paths import delete_dir, join_path, make_relative, mkdir_p, split_glob, split_path +from .profile import profiler +from .progressbar import BaseProgressBar from .registry import TaggerRegistry +from .taggers import BaseTaggerWithMetadata from .utils import import_modules, make_variable_name # this placeholder gets used when a user has provided no experiment name, and we want to use taggers' @@ -221,20 +222,13 @@ def _write_sample_to_streams( output_streams[stream_path].write(output) -class TaggerProcessor(BaseParallelProcessor): - @classmethod - def increment_progressbar( # type: ignore - cls, - queue: QueueType, # queue must be the first argument, and it should be a positional-only argument - /, - files: int = 0, - documents: int = 0, - ) -> Dict[str, int]: - """We override this method to specify which units we want to keep track of in a progress bar. - Specifically, we keep track of files and documents in this example. 
Their default value must be zero.""" +class TaggerProcessorProgessBar(BaseProgressBar): + files: int = 0 + documents: int = 0 + - # we call the super method to increment the progress bar - return super().increment_progressbar(queue, files=files, documents=documents) +class TaggerProcessor(BaseParallelProcessor): + PROGRESS_BAR_CLS = TaggerProcessorProgessBar @classmethod def process_single( @@ -273,12 +267,9 @@ def process_single( # maximum numbers of lines to process steps: Union[int, None] = kwargs.get("steps", None) - # interval at which to update the progress bar; will double if it gets - # too full - update_interval = 1 - - # running document count; gets reset every time we update the progress bar - docs_cnt = 0 + # compression configuration + compression_input = kwargs.get("compression_input", None) + compression_output = kwargs.get("compression_output", None) # total number of documents processed total_docs_cnt = 0 @@ -292,10 +283,15 @@ def process_single( decoder = msgspec.json.Decoder(InputSpec) with ExitStack() as stack: - in_stream = stack.enter_context(smart_open.open(source_path, "rt", encoding="utf-8")) + in_stream = stack.enter_context( + smart_open.open(source_path, "rt", encoding="utf-8", compression=compression_input) + ) output_streams = stack.enter_context( - _make_output_streams(taggers_paths=taggers_paths, mode="wt", encoding="utf-8") + _make_output_streams( + taggers_paths=taggers_paths, mode="wt", encoding="utf-8", compression=compression_output + ) ) + pbar = stack.enter_context(TaggerProcessorProgessBar(queue)) try: for raw in in_stream: row = decoder.decode(raw) @@ -310,23 +306,13 @@ def process_single( samples_collectors[tagger_name] = tagger.tag(row) # increment the number of documents processed so far - docs_cnt += 1 + pbar.documents += 1 total_docs_cnt += 1 if steps is not None and total_docs_cnt >= steps: # if we have reached the maximum number of steps, we break break - if docs_cnt % update_interval == 0: - # update the progress bar 
every 1000 documents to prevent - # buffering - cls.increment_progressbar(queue, documents=docs_cnt) - docs_cnt = 0 - - if queue.qsize() >= multiprocessing.cpu_count(): - # double the update interval if the queue is full - update_interval *= 2 - except Exception as exp: # handle any exception that might have occurred msg = f"Failed to process {source_path} due to {exp.__class__.__name__}: {' '.join(exp.args)}" @@ -339,28 +325,8 @@ def process_single( else: raise DolmaFatalError(msg) from exp - # increment the files progress bar - cls.increment_progressbar(queue, files=1, documents=docs_cnt) - - -@contextmanager -def profiler( - output: Optional[str] = None, - sort_key: str = "tottime", - lines: int = 100, -) -> Generator[None, None, None]: - import cProfile - import pstats - - profile = cProfile.Profile() - profile.enable() - yield - profile.disable() - - with ExitStack() as stack: - output_stream = io.StringIO() if output is None else stack.enter_context(smart_open.open(output, "w")) - ps = pstats.Stats(profile, stream=output_stream).sort_stats(sort_key) - ps.print_stats(lines) + # increment the files progress bar + pbar.files += 1 @contextmanager @@ -392,6 +358,8 @@ def create_and_run_tagger( profile_steps: Optional[int] = None, profile_sort_key: str = "tottime", profile_lines: int = 100, + compression_input: Optional[str] = None, + compression_output: Optional[str] = None, ): """This function creates a tagger and runs it on a list of documents. @@ -423,8 +391,26 @@ def create_and_run_tagger( profile_steps (Optional[int], optional): Number of steps to profile; if not provided, all steps will be profiled. Defaults to None. profile_sort_key (str, optional): Sort key for the profiling output. Defaults to 'tottime'. + compression_input (Optional[str], optional): Compression algorithm to use for input files. If None, + compression will be inferred from the input file extension. Defaults to None. 
+ compression_output (Optional[str], optional): Compression algorithm to use for output files. If None, + compression will be inferred from the output file extension. Defaults to None. """ + # get a list of supported compression types + compression_type = get_supported_compression_types() + + # if compression is not provided, set it to "infer_from_extension"; this is to maintain consistency with + # how compression is specified in the mixer/deduper code + compression_input = compression_input or "infer_from_extension" + compression_output = compression_output or "infer_from_extension" + + # check if compression is supported + if compression_input not in compression_type: + raise ValueError(f"Compression {compression_input} is not supported") + if compression_output not in compression_type: + raise ValueError(f"Compression {compression_output} is not supported") + # before pre-caching taggers, import any taggers modules if taggers_modules is not None: import_modules(taggers_modules) @@ -467,7 +453,7 @@ def create_and_run_tagger( debug=debug or profile_enable, # if profile is true, debug must be true seed=seed, ignore_existing=ignore_existing, - retries_on_error=retries_on_error, + backoff_max_tries=retries_on_error, num_processes=num_processes, ) @@ -486,4 +472,6 @@ def create_and_run_tagger( taggers_modules=taggers_modules, skip_on_failure=skip_on_failure, steps=profile_steps, + compression_input=compression_input, + compression_output=compression_output, ) diff --git a/python/dolma/core/taggers.py b/python/dolma/core/taggers.py index 59a414f7..1ed99726 100644 --- a/python/dolma/core/taggers.py +++ b/python/dolma/core/taggers.py @@ -17,9 +17,7 @@ InputSpecWithMetadata, TaggerOutputDictType, ) - -# digits after the decimal point -TAGGER_SCORE_PRECISION = 5 +from .utils import format_span_output class BaseTagger: @@ -46,8 +44,7 @@ def predict(self, doc: Document) -> DocResult: def group_output(self, doc_result: DocResult) -> TaggerOutputDictType: tagger_output: 
TaggerOutputDictType = {field: [] for field in self.defaults} for span in doc_result.spans: - output = (span.start, span.end, round(float(span.score), TAGGER_SCORE_PRECISION)) - tagger_output.setdefault(span.type, []).append(output) + tagger_output.setdefault(span.type, []).append(format_span_output(span)) return tagger_output def tag(self, row: InputSpec) -> TaggerOutputDictType: diff --git a/python/dolma/core/utils.py b/python/dolma/core/utils.py index c8149e74..6c4413d6 100644 --- a/python/dolma/core/utils.py +++ b/python/dolma/core/utils.py @@ -4,14 +4,8 @@ import re import string import sys -from typing import List, Union, cast - -try: - import blingfire - - BLINGFIRE_AVAILABLE = True -except Exception: - BLINGFIRE_AVAILABLE = False +from itertools import islice +from typing import Generator, Iterable, List, Tuple, TypeVar, Union, cast import nltk import uniseg.wordbreak @@ -26,13 +20,26 @@ except LookupError: nltk.download("punkt") - -from .data_types import TextSlice +from .data_types import Span, TextSlice from .loggers import get_logger +try: + import blingfire + + BLINGFIRE_AVAILABLE = True +except (ImportError, OSError): + BLINGFIRE_AVAILABLE = False + + sent_tokenizer = PunktSentenceTokenizer() logger = get_logger(__name__) +T = TypeVar("T") + + +# digits after the decimal point +TAGGER_SCORE_PRECISION = 5 + def make_variable_name(name: str, remove_multiple_underscores: bool = False) -> str: # use underscores for any non-valid characters in variable name @@ -48,6 +55,16 @@ def make_variable_name(name: str, remove_multiple_underscores: bool = False) -> return name +def format_span_output(span: Span) -> Tuple[int, int, float]: + """Formats a span for output.""" + return (span.start, span.end, round(float(span.score), TAGGER_SCORE_PRECISION)) + + +def format_span_key(experiment: str, tagger: str, span: Span) -> str: + """Formats a span key for output.""" + return f"{experiment}__{tagger}__{make_variable_name(span.type)}" + + def split_words(text: str, 
remove_empty: bool = True) -> List[TextSlice]: """ Split a string into words, as defined by the unicode standard. @@ -138,7 +155,7 @@ def import_modules(modules_path: Union[List[str], None]): sys.path.insert(0, module_parent) importlib.import_module(module_name) elif module_path in sys.modules[module_name].__path__: - logger.info(f"{module_path} has already been imported.") + logger.info("%s has already been imported.", module_path) else: raise ImportError( f"Failed to import {module_path} because the corresponding module name " @@ -154,6 +171,27 @@ def dataclass_to_dict(dataclass_instance) -> dict: return cast(dict, om.to_object(om.structured(dataclass_instance))) +def batch_iterator( + *iterables: Iterable[T], batch_size: int = 1, drop_last: bool = False +) -> Generator[List[Tuple[T, ...]], None, None]: + """ + Group one or more iterables into batches of size `batch_size`. + + Args: + iterables (Iterable[T]): One or more iterables to group into batches. + batch_size (int): The size of each batch. Defaults to 1. + drop_last (bool): Whether to drop the last batch if it is smaller than `batch_size`. Defaults to False. + """ + grouped_iterator = iter(zip(*iterables)) + while True: + batch = list(islice(grouped_iterator, batch_size)) + if not batch: + break + if len(batch) < batch_size and drop_last: + break + yield list(zip(*batch)) + + def add_compression(): """ Adds support for zstandard (.zst) compression format to the smart_open library. 
diff --git a/python/dolma/tokenizer/tokenizer.py b/python/dolma/tokenizer/tokenizer.py index 04017cdc..64d30500 100644 --- a/python/dolma/tokenizer/tokenizer.py +++ b/python/dolma/tokenizer/tokenizer.py @@ -26,7 +26,7 @@ with necessary("transformers", soft=True) as TRANSFORMERS_AVAILABLE: if TYPE_CHECKING or TRANSFORMERS_AVAILABLE: - from transformers import AutoTokenizer # pylint: disable=import-error + from transformers import AutoTokenizer # pyright: ignore pylint: disable=import-error PathOrStr = Union[str, PathLike] @@ -379,7 +379,7 @@ def tokenize_file( continue # the actual tokenization happens here - tokens = tokenizer.encode(text, add_special_tokens=True) + tokens = tokenizer.encode(text, add_special_tokens=True) # pyright: ignore except Exception: # in case of failure, we log the error and continue # We refresh the tokenizer to prevent memory leaks from affecting the rest of the processing @@ -394,7 +394,7 @@ def tokenize_file( if (refresh_tokenizer_every > 0 and i % refresh_tokenizer_every == 0) or force_refresh: # to prevent memory leaks, we refresh the tokenizer every so often - del tokenizer + del tokenizer # pyright: ignore gc.collect() tokenizer = make_tokenizer(tokenizer_name_or_path, **tokenizer_kwargs)