diff --git a/Cargo.lock b/Cargo.lock index f9733b38bfa..58d3458613d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -74,7 +74,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -85,7 +85,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -94,6 +94,15 @@ version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" +[[package]] +name = "arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +dependencies = [ + "derive_arbitrary", +] + [[package]] name = "arc-swap" version = "1.9.0" @@ -606,6 +615,12 @@ version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" +[[package]] +name = "cov-mark" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90863d8442510cddf7f46618c4f92413774635771a3e80830c8b30d183420b14" + [[package]] name = "cpufeatures" version = "0.2.17" @@ -807,7 +822,7 @@ dependencies = [ "openssl-probe 0.1.6", "openssl-sys", "schannel", - "socket2", + "socket2 0.6.3", "windows-sys 0.59.0", ] @@ -896,6 +911,17 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "derive_more" version = "2.1.1" @@ 
-945,6 +971,12 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "dissimilar" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aeda16ab4059c5fd2a83f2b9c9e9c981327b18aa8e3b313f7e6563799d4f093e" + [[package]] name = "document-features" version = "0.2.12" @@ -1028,7 +1060,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -1058,6 +1090,16 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "expect-test" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63af43ff4431e848fb47472a920f14fa71c24de13255a5692e93d4e90302acb0" +dependencies = [ + "dissimilar", + "once_cell", +] + [[package]] name = "expectrl" version = "0.7.1" @@ -1665,6 +1707,8 @@ dependencies = [ "gix-filter", "gix-fs", "gix-hash", + "gix-imara-diff", + "gix-imara-diff-01", "gix-index", "gix-object", "gix-path", @@ -1673,8 +1717,6 @@ dependencies = [ "gix-trace", "gix-traverse", "gix-worktree", - "imara-diff 0.1.8", - "imara-diff 0.2.0", "serde", "thiserror 2.0.18", ] @@ -1874,6 +1916,24 @@ dependencies = [ "unicode-bom", ] +[[package]] +name = "gix-imara-diff" +version = "0.2.0" +dependencies = [ + "cov-mark", + "expect-test", + "hashbrown 0.16.1", + "memchr", +] + +[[package]] +name = "gix-imara-diff-01" +version = "0.1.8" +dependencies = [ + "expect-test", + "hashbrown 0.15.5", +] + [[package]] name = "gix-index" version = "0.49.0" @@ -1958,6 +2018,7 @@ dependencies = [ name = "gix-merge" version = "0.14.0" dependencies = [ + "arbitrary", "bstr", "document-features", "gix-command", @@ -1965,6 +2026,7 @@ dependencies = [ "gix-filter", "gix-fs", "gix-hash", + "gix-imara-diff-01", "gix-index", "gix-object", "gix-odb", @@ -1977,7 +2039,6 @@ dependencies = [ "gix-trace", "gix-utils", "gix-worktree", - "imara-diff 
0.1.8", "nonempty", "pretty_assertions", "serde", @@ -2139,7 +2200,7 @@ dependencies = [ "serial_test", "thiserror 2.0.18", "windows 0.62.2", - "winreg", + "winreg 0.56.0", ] [[package]] @@ -2915,7 +2976,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2", + "socket2 0.6.3", "tokio", "tower-service", "tracing", @@ -3035,25 +3096,6 @@ dependencies = [ "icu_properties", ] -[[package]] -name = "imara-diff" -version = "0.1.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17d34b7d42178945f775e84bc4c36dde7c1c6cdfea656d3354d009056f2bb3d2" -dependencies = [ - "hashbrown 0.15.5", -] - -[[package]] -name = "imara-diff" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f01d462f766df78ab820dd06f5eb700233c51f0f4c2e846520eaf4ba6aa5c5c" -dependencies = [ - "hashbrown 0.15.5", - "memchr", -] - [[package]] name = "indexmap" version = "2.13.0" @@ -3122,15 +3164,14 @@ dependencies = [ [[package]] name = "ipconfig" -version = "0.3.4" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d40460c0ce33d6ce4b0630ad68ff63d6661961c48b6dba35e5a4d81cfb48222" +checksum = "b58db92f96b720de98181bbbe63c831e87005ab460c1bf306eb2622b4707997f" dependencies = [ - "socket2", + "socket2 0.5.10", "widestring", - "windows-registry", - "windows-result", - "windows-sys 0.61.2", + "windows-sys 0.48.0", + "winreg 0.50.0", ] [[package]] @@ -3166,7 +3207,7 @@ checksum = "3640c1c38b8e4e43584d8df18be5fc6b0aa314ce6ebf51b53313d4306cca8e46" dependencies = [ "hermit-abi", "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -3227,7 +3268,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -3265,7 +3306,7 @@ dependencies = [ "cesu8", "cfg-if", "combine", - "jni-sys 0.3.1", + "jni-sys", "log", "thiserror 1.0.69", "walkdir", @@ -3274,31 +3315,9 @@ 
dependencies = [ [[package]] name = "jni-sys" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41a652e1f9b6e0275df1f15b32661cf0d4b78d4d87ddec5e0c3c20f097433258" -dependencies = [ - "jni-sys 0.4.1", -] - -[[package]] -name = "jni-sys" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6377a88cb3910bee9b0fa88d4f42e1d2da8e79915598f65fb0c7ee14c878af2" -dependencies = [ - "jni-sys-macros", -] - -[[package]] -name = "jni-sys-macros" -version = "0.4.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38c0b942f458fe50cdac086d2f946512305e5631e720728f2a61aabcd47a6264" -dependencies = [ - "quote", - "syn 2.0.117", -] +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" [[package]] name = "jobserver" @@ -3386,9 +3405,9 @@ checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" [[package]] name = "libredox" -version = "0.1.15" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ddbf48fd451246b1f8c2610bd3b4ac0cc6e149d89832867093ab69a17194f08" +checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" dependencies = [ "bitflags 2.11.0", "libc", @@ -3490,22 +3509,6 @@ dependencies = [ "pkg-config", ] -[[package]] -name = "macro_rules_attribute" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65049d7923698040cd0b1ddcced9b0eb14dd22c5f86ae59c3740eab64a676520" -dependencies = [ - "macro_rules_attribute-proc_macro", - "paste", -] - -[[package]] -name = "macro_rules_attribute-proc_macro" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "670fdfda89751bc4a84ac13eaa63e205cf0fd22b4c9a5fbfa085b63c1f1d3a30" - [[package]] name = "maplit" version = "1.0.2" @@ -3646,7 +3649,7 @@ version = "0.50.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -3727,9 +3730,9 @@ dependencies = [ [[package]] name = "openssl" -version = "0.10.76" +version = "0.10.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "951c002c75e16ea2c65b8c7e4d3d51d5530d8dfa7d060b4776828c88cfb18ecf" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" dependencies = [ "bitflags 2.11.0", "cfg-if", @@ -3814,12 +3817,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "paste" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" - [[package]] name = "pathdiff" version = "0.2.3" @@ -4028,7 +4025,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2", + "socket2 0.6.3", "thiserror 2.0.18", "tokio", "tracing", @@ -4066,7 +4063,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2", + "socket2 0.6.3", "tracing", "windows-sys 0.60.2", ] @@ -4288,7 +4285,7 @@ dependencies = [ "quinn", "rustls", "rustls-pki-types", - "rustls-platform-verifier", + "rustls-platform-verifier 0.6.2", "sync_wrapper", "tokio", "tokio-native-tls", @@ -4372,7 +4369,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -4391,15 +4388,14 @@ dependencies = [ [[package]] name = "rustls-ffi" -version = "0.15.1" +version = "0.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e34a2a1daa2947d4ea0106492399c59a1149ddf1b100058a8c39a62323aa9e97" +checksum = "4128514cb6472050cba340cdac098a235c53e6aad276737ce1d7b24a19260392" dependencies = [ "libc", "log", - "macro_rules_attribute", "rustls", - "rustls-platform-verifier", + "rustls-platform-verifier 0.5.3", "rustls-webpki", ] 
@@ -4425,6 +4421,27 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-platform-verifier" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19787cda76408ec5404443dc8b31795c87cd8fec49762dc75fa727740d34acc1" +dependencies = [ + "core-foundation", + "core-foundation-sys", + "jni", + "log", + "once_cell", + "rustls", + "rustls-native-certs", + "rustls-platform-verifier-android", + "rustls-webpki", + "security-framework", + "security-framework-sys", + "webpki-root-certs 0.26.11", + "windows-sys 0.59.0", +] + [[package]] name = "rustls-platform-verifier" version = "0.6.2" @@ -4442,8 +4459,8 @@ dependencies = [ "rustls-webpki", "security-framework", "security-framework-sys", - "webpki-root-certs", - "windows-sys 0.60.2", + "webpki-root-certs 1.0.6", + "windows-sys 0.61.2", ] [[package]] @@ -4743,6 +4760,16 @@ dependencies = [ "serde", ] +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "socket2" version = "0.6.3" @@ -4750,7 +4777,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -4905,7 +4932,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -4924,7 +4951,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "230a1b821ccbd75b185820a1f1ff7b14d21da1e442e22c0863ea5f08771a8874" dependencies = [ "rustix", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -5048,7 +5075,7 @@ dependencies = [ "libc", "mio", "pin-project-lite", - "socket2", + "socket2 0.6.3", "tokio-macros", "windows-sys 0.61.2", ] @@ 
-5583,6 +5610,15 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "webpki-root-certs" +version = "0.26.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75c7f0ef91146ebfb530314f5f1d24528d7f0767efbfd31dce919275413e393e" +dependencies = [ + "webpki-root-certs 1.0.6", +] + [[package]] name = "webpki-root-certs" version = "1.0.6" @@ -5620,7 +5656,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -5721,17 +5757,6 @@ dependencies = [ "windows-link", ] -[[package]] -name = "windows-registry" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" -dependencies = [ - "windows-link", - "windows-result", - "windows-strings", -] - [[package]] name = "windows-result" version = "0.4.1" @@ -5759,6 +5784,15 @@ dependencies = [ "windows-targets 0.42.2", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -5810,6 +5844,21 @@ dependencies = [ "windows_x86_64_msvc 0.42.2", ] +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + [[package]] name = "windows-targets" version = "0.52.6" @@ -5858,6 
+5907,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" @@ -5876,6 +5931,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" @@ -5894,6 +5955,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -5924,6 +5991,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.6" @@ -5942,6 +6015,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" 
+[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" @@ -5960,6 +6039,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" @@ -5978,6 +6063,12 @@ version = "0.42.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -5999,6 +6090,16 @@ dependencies = [ "memchr", ] +[[package]] +name = "winreg" +version = "0.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "524e57b2c537c0f9b1e69f1965311ec12182b4122e45035b1508cd24d2adadb1" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + [[package]] name = "winreg" version = "0.56.0" @@ -6006,7 +6107,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d6f32a0ff4a9f6f01231eb2059cc85479330739333e0e58cadf03b6af2cca10" dependencies = [ "cfg-if", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 9e35c0ef80d..abc4d34d712 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -242,6 +242,8 @@ members = [ "gix-object", "gix-glob", "gix-diff", + "gix-imara-diff-01", + 
"gix-imara-diff", "gix-merge", "gix-date", "gix-traverse", diff --git a/gix-diff/Cargo.toml b/gix-diff/Cargo.toml index cd2e2d02fa8..d50d4e66ef7 100644 --- a/gix-diff/Cargo.toml +++ b/gix-diff/Cargo.toml @@ -44,6 +44,7 @@ doctest = false [[bench]] name = "line-count" harness = false +required-features = ["blob-experimental"] path = "./benches/line_count.rs" [dependencies] @@ -60,10 +61,10 @@ gix-fs = { version = "^0.19.2", path = "../gix-fs", optional = true } gix-tempfile = { version = "^21.0.0", path = "../gix-tempfile", optional = true } gix-trace = { version = "^0.1.18", path = "../gix-trace", optional = true } gix-traverse = { version = "^0.55.0", path = "../gix-traverse", optional = true } +imara-diff = { package = "gix-imara-diff-01", version = "0.1.8", optional = true, path = "../gix-imara-diff-01" } +imara-diff-v2 = { package = "gix-imara-diff", version = "0.2.0", optional = true, path = "../gix-imara-diff" } thiserror = "2.0.18" -imara-diff = { version = "0.1.8", optional = true } -imara-diff-v2 = { version = "0.2.0", optional = true, package = "imara-diff" } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } getrandom = { version = "0.2.17", optional = true, default-features = false, features = ["js"] } bstr = { version = "1.12.0", default-features = false } @@ -72,7 +73,6 @@ document-features = { version = "0.2.0", optional = true } [dev-dependencies] criterion = "0.8.2" -imara-diff-v2 = { version = "0.2.0", package = "imara-diff" } gix-hash = { path = "../gix-hash", features = ["sha1"] } [package.metadata.docs.rs] diff --git a/gix-imara-diff-01/.gitattributes b/gix-imara-diff-01/.gitattributes new file mode 100644 index 00000000000..92b18997c33 --- /dev/null +++ b/gix-imara-diff-01/.gitattributes @@ -0,0 +1,3 @@ +*.before text eol=lf +*.after text eol=lf +*.diff text eol=lf diff --git a/gix-imara-diff-01/.gitignore b/gix-imara-diff-01/.gitignore new file mode 100644 index 00000000000..6dd7cb0b55e --- 
/dev/null +++ b/gix-imara-diff-01/.gitignore @@ -0,0 +1,3 @@ +/target +/Cargo.lock +/bench_data diff --git a/gix-imara-diff-01/Cargo.toml b/gix-imara-diff-01/Cargo.toml new file mode 100644 index 00000000000..966417852ac --- /dev/null +++ b/gix-imara-diff-01/Cargo.toml @@ -0,0 +1,34 @@ +[package] +name = "gix-imara-diff-01" +version = "0.1.8" +edition = "2021" +authors = ["pascalkuthe "] +rust-version = "1.71" +license = "Apache-2.0" + +description = "A high performance library for computing diffs." +repository = "https://github.com/pascalkuthe/imara-diff" +keywords = ["diff", "difference", "myers", "compare", "changes"] +readme = "README.md" +exclude = [ + "tests", + "bench_data", + "plt.py", +] + +[dependencies] +hashbrown = { version = "0.15", default-features = false, features = ["default-hasher", "inline-more"] } + +[features] +default = ["unified_diff"] +unified_diff = [] + +[dev-dependencies] +expect-test = "1.4.0" + +[profile.release] +debug = true + +# [[bench]] +# name = "git_repo" +# harness = false diff --git a/gix-imara-diff-01/LICENSE b/gix-imara-diff-01/LICENSE new file mode 100644 index 00000000000..16fe87b06e8 --- /dev/null +++ b/gix-imara-diff-01/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/gix-imara-diff-01/README.md b/gix-imara-diff-01/README.md new file mode 100644 index 00000000000..2b8abcaed94 --- /dev/null +++ b/gix-imara-diff-01/README.md @@ -0,0 +1,112 @@ +# imara-diff + +[![crates.io](https://img.shields.io/crates/v/imara-diff?style=flat-square)](https://crates.io/crates/imara-diff) +[![crates.io](https://img.shields.io/docsrs/imara-diff?style=flat-square)](https://docs.rs/imara-diff/latest/imara_diff/) +![crates.io](https://img.shields.io/crates/l/imara-diff?style=flat-square) + +`imara-diff` is a solid (imara in swahili) diff library for rust. +Solid refers to the fact that imara-diff provides very good runtime performance even +in pathologic cases so that your application never appears to freeze while waiting on a diff. +The performance improvements are achieved using battle tested heuristics used in gnu-diff and git +that are known to perform well while still providing good results. + +`imara-diff` is also designed to be flexible so that it can be used with arbitrary collections and +not just lists and strings and even allows reusing large parts of the computation when +comparing the same file to multiple different files. + +`imara-diff` provides two diff algorithms: + +* The linear-space variant of the well known [Myers algorithm](http://www.xmailserver.org/diff2.pdf) +* The **Histogram** algorithm which is a variant of the patience diff algorithm. 
+ +Myers algorithm has been enhanced with preprocessing and multiple heuristics to ensure fast runtime in pathological +cases to avoid quadratic time complexity and closely matches the behavior of gnu-diff and git. +The histogram algorithm was originally ported from git but has been heavily optimized. +The **Histogram algorithm outperforms Myers algorithm** by 10% - 100% across a **wide variety of workloads**. + +## Limitations + +Even with the optimizations in this crate, performing a large diff without any tokenization (like character diff for a string) does not perform well. +To work around this problem a diff of the entire file with large tokens (like lines for a string) can be performed first. +The `Sink` implementation can then perform fine-grained diff on changed regions. +Note that this fine-grained diff should not be performed for pure insertions, pure deletions and very large changes. + +In an effort to improve performance, `imara-diff` makes heavy use of pointer compression. +That means that it can only support files with at most `2^31 - 2` tokens. +This should be rarely an issue in practice for textual diffs, because most (large) real-world files +have an average line-length of at least 8. +That means that this limitation only becomes a problem for files above 16GB while performing line-diffs. + +## Benchmarks + +The most used diffing libraries in the rust ecosystem are [similar](https://crates.io/crates/similar) and [dissimilar](https://crates.io/crates/dissimilar). +The fastest diff implementation both of these offer is a simple implementation of Myers algorithm +without preprocessing or additional heuristics. +As these implementations are very similar only `similar` was included in the benchmark. + +To provide a benchmark to reflects real-world workloads, the git history of different open source projects were used. +For each repo two (fairly different) tags were chosen. 
+A tree diff is performed with [gitoxide](https://github.com/Byron/gitoxide) and the pairs of files that should be saved are stored in memory. +The diffs collected using this method are often fairly large, because the repositories are compared over a large span of time. +Therefore, the tree diff of the last 30 commit before the tag (equivalent of `git diff TAG^ TAG`, `git diff TAG^^ TAG^^`) were also used to also include smaller diffs. + +The benchmark measures the runtime of performing a **line diff** between the collected files. +As a measure of complexity for each change `(M + N) D` was used where `M` and `N` are the lengths of the two compared files +and `D` is the length of the edit script required to transform these files into each other (determined with Myers algorithm). +This complexity measure is used to divide the changes into 10 badges. +The time to compute the line diffs in each badge was benchmarked. + +The plots below show the runtime for each **average** complexity (runtime is normalized by the number of diffs). +Note that these plots are shown in logarithmic scale due to the large runtime of `similar` for complex diffs. +Furthermore, to better highlight the performance of the Histogram algorithm, the speedup of the Histogram algorithm +compared to the Myers algorithm is shown separately. + +* [Linux](###Linux) +* [Rust](###Rust) +* [VSCode](###VSCode) +* [Helix](###Helix) + +### Linux + +The sourcecode of the linux kernel. + +- **Repo** - https://kernel.org +- **Tags** - `v5.7` and `v6.0` + +### Rust + +The sourcecode of the rust compiler, standard library and various related tooling. + +- **Repo** - https://github.com/rust-lang/rust +- **Tags** - `1.50.0` and `1.64.0` + +### VScode + +The sourcecode of the vscode editor. + +- **Repo** - https://github.com/microsoft/vscode +- **Tags** - `1.41.0` and `1.72.2` + +### Helix + +The sourcecode of the helix editor. 
+ +- **Repo** - https://github.com/helix-editor/helix +- **Tags** - `v0.5.0` and `22.08.1` + + +## Stability Policy + +`imara-diff` uses [Semantic Versioning (SemVer)](https://semver.org/). +All non-breaking changes to the public rust API will cause a minor `SemVer` bump. +All breaking changes to the public rust API will cause a major `SemVer` bump. +Changes in the produced diffs are also considered breaking changes if the produced diff was valid. +If the produced diff was invalid the change will be considered a bugfix. + +Additionally all changes to the minimum stable rust version (MSRV) are also considered breaking changes. +The current **MSRV is 1.61**. +`imara-diff` will roughly follow the MSRV of Firefox (stable) to remain +compatible with many platforms that try to include its latest version. +To predict future changes to the MSRV the [Firefox documentation] can be consulted. + +[Firefox documentation]: https://firefox-source-docs.mozilla.org/writing-rust-code/update-policy.html diff --git a/gix-imara-diff-01/src/histogram.rs b/gix-imara-diff-01/src/histogram.rs new file mode 100644 index 00000000000..6e775dc3f53 --- /dev/null +++ b/gix-imara-diff-01/src/histogram.rs @@ -0,0 +1,122 @@ +use std::ops::Range; + +use crate::histogram::lcs::find_lcs; +use crate::histogram::list_pool::{ListHandle, ListPool}; +use crate::intern::Token; +use crate::util::{strip_common_postfix, strip_common_prefix}; +use crate::{myers, Sink}; + +mod lcs; +mod list_pool; + +const MAX_CHAIN_LEN: u32 = 63; + +struct Histogram { + token_occurrences: Vec, + pool: ListPool, +} + +pub fn diff(mut before: &[Token], mut after: &[Token], num_tokens: u32, mut sink: S) -> S::Out { + let mut histogram = Histogram::new(num_tokens); + let prefix = strip_common_prefix(&mut before, &mut after); + strip_common_postfix(&mut before, &mut after); + histogram.run(before, prefix, after, prefix, &mut sink); + sink.finish() +} + +impl Histogram { + fn new(num_buckets: u32) -> Histogram { + Histogram { + 
token_occurrences: vec![ListHandle::default(); num_buckets as usize], + pool: ListPool::new(2 * num_buckets), + } + } + + fn clear(&mut self) { + self.pool.clear(); + } + + fn token_occurrences(&self, token: Token) -> &[u32] { + self.token_occurrences[token.0 as usize].as_slice(&self.pool) + } + + fn num_token_occurrences(&self, token: Token) -> u32 { + self.token_occurrences[token.0 as usize].len(&self.pool) + } + + fn populate(&mut self, file: &[Token]) { + for (i, &token) in file.iter().enumerate() { + self.token_occurrences[token.0 as usize].push(i as u32, &mut self.pool); + } + } + + fn run( + &mut self, + mut before: &[Token], + mut before_off: u32, + mut after: &[Token], + mut after_off: u32, + sink: &mut impl Sink, + ) { + loop { + if before.is_empty() { + if !after.is_empty() { + sink.process_change(before_off..before_off, after_off..after_off + after.len() as u32); + } + return; + } else if after.is_empty() { + sink.process_change(before_off..before_off + before.len() as u32, after_off..after_off); + return; + } + + self.populate(before); + match find_lcs(before, after, self) { + // no lcs was found, that means that file1 and file2 two have nothing in common + Some(lcs) if lcs.len == 0 => { + sink.process_change( + before_off..before_off + before.len() as u32, + after_off..after_off + after.len() as u32, + ); + return; + } + Some(lcs) => { + self.run( + &before[..lcs.before_start as usize], + before_off, + &after[..lcs.after_start as usize], + after_off, + sink, + ); + + // this is equivalent to (tail) recursion but implement as a loop for efficeny reasons + let before_end = lcs.before_start + lcs.len; + before = &before[before_end as usize..]; + before_off += before_end; + + let after_end = lcs.after_start + lcs.len; + after = &after[after_end as usize..]; + after_off += after_end; + } + None => { + // we are diffing two extremely large repetitive files + // this is a worst case for histogram diff with O(N^2) performance + // fallback to myers to 
maintain linear time complexity + myers::diff( + before, + after, + 0, // not used by myers + |mut before: Range, mut after: Range| { + before.start += before_off; + before.end += before_off; + after.start += after_off; + after.end += after_off; + sink.process_change(before, after) + }, + false, + ); + return; + } + } + } + } +} diff --git a/gix-imara-diff-01/src/histogram/lcs.rs b/gix-imara-diff-01/src/histogram/lcs.rs new file mode 100644 index 00000000000..0534c8cc046 --- /dev/null +++ b/gix-imara-diff-01/src/histogram/lcs.rs @@ -0,0 +1,130 @@ +use crate::histogram::{Histogram, MAX_CHAIN_LEN}; +use crate::intern::Token; + +pub(super) fn find_lcs(before: &[Token], after: &[Token], histogram: &mut Histogram) -> Option { + let mut search = LcsSearch { + lcs: Lcs::default(), + min_occurrences: MAX_CHAIN_LEN + 1, + found_cs: false, + }; + search.run(before, after, histogram); + if search.success() { + Some(search.lcs) + } else { + None + } +} + +#[derive(Default, Debug)] +pub struct Lcs { + pub before_start: u32, + pub after_start: u32, + pub len: u32, +} + +pub struct LcsSearch { + lcs: Lcs, + min_occurrences: u32, + found_cs: bool, +} + +impl LcsSearch { + fn run(&mut self, before: &[Token], after: &[Token], histogram: &mut Histogram) { + let mut pos = 0; + while let Some(&token) = after.get(pos as usize) { + if histogram.num_token_occurrences(token) != 0 { + self.found_cs = true; + if histogram.num_token_occurrences(token) <= self.min_occurrences { + pos = self.update_lcs(pos, token, histogram, before, after); + continue; + } + } + + pos += 1; + } + + histogram.clear(); + } + + fn success(&mut self) -> bool { + !self.found_cs || self.min_occurrences <= MAX_CHAIN_LEN + } + + fn update_lcs( + &mut self, + after_pos: u32, + token: Token, + histogram: &Histogram, + before: &[Token], + after: &[Token], + ) -> u32 { + let mut next_token_idx2 = after_pos + 1; + let mut occurrences_iter = histogram.token_occurrences(token).iter().copied(); + let mut token_idx1 = 
occurrences_iter.next().unwrap(); + + 'occurrences_iter: loop { + let mut occurrences = histogram.num_token_occurrences(token); + let mut start1 = token_idx1; + let mut start2 = after_pos; + loop { + if start1 == 0 || start2 == 0 { + break; + } + let token1 = before.get(start1 as usize - 1); + let token2 = after.get(start2 as usize - 1); + if matches!((token1, token2), (Some(token1), Some(token2)) if token1 == token2) { + start1 -= 1; + start2 -= 1; + let new_occurrences = histogram.num_token_occurrences(before[start1 as usize]); + occurrences = occurrences.min(new_occurrences); + } else { + break; + } + } + + let mut end1 = token_idx1 + 1; + let mut end2 = after_pos + 1; + + loop { + let token1 = before.get(end1 as usize); + let token2 = after.get(end2 as usize); + if matches!((token1, token2), (Some(token1), Some(token2)) if token1 == token2) { + let new_occurrences = histogram.num_token_occurrences(before[end1 as usize]); + occurrences = occurrences.min(new_occurrences); + end1 += 1; + end2 += 1; + } else { + break; + } + } + + if next_token_idx2 < end2 { + next_token_idx2 = end2; + } + + let len = end2 - start2; + debug_assert_eq!(len, end1 - start1); + if self.lcs.len < len || self.min_occurrences > occurrences { + self.min_occurrences = occurrences; + self.lcs = Lcs { + before_start: start1, + after_start: start2, + len, + }; + } + + loop { + if let Some(next_token_idx) = occurrences_iter.next() { + if next_token_idx > end2 { + token_idx1 = next_token_idx; + break; + } + } else { + break 'occurrences_iter; + } + } + } + + next_token_idx2 + } +} diff --git a/gix-imara-diff-01/src/histogram/list_pool.rs b/gix-imara-diff-01/src/histogram/list_pool.rs new file mode 100644 index 00000000000..98472bcc580 --- /dev/null +++ b/gix-imara-diff-01/src/histogram/list_pool.rs @@ -0,0 +1,256 @@ +use crate::histogram::MAX_CHAIN_LEN; + +/// A small list of entity references allocated from a pool. 
+/// +/// An `ListHandle` type provides similar functionality to `Vec`, but with some important +/// differences in the implementation: +/// +/// 1. Memory is allocated from a `ListPool` instead of the global heap. +/// 2. The footprint of an entity list is 4 bytes, compared with the 24 bytes for `Vec`. +/// 3. An entity list doesn't implement `Drop`, leaving it to the pool to manage memory. +/// +/// The list pool is intended to be used as a LIFO allocator. After building up a larger data +/// structure with many list references, the whole thing can be discarded quickly by clearing the +/// pool. +/// +/// # Safety +/// +/// Entity lists are not as safe to use as `Vec`, but they never jeopardize Rust's memory safety +/// guarantees. These are the problems to be aware of: +/// +/// - If you lose track of an entity list, its memory won't be recycled until the pool is cleared. +/// This can cause the pool to grow very large with leaked lists. +/// - If entity lists are used after their pool is cleared, they may contain garbage data, and +/// modifying them may corrupt other lists in the pool. +/// - If an entity list is used with two different pool instances, both pools are likely to become +/// corrupted. +/// +/// Entity lists can be cloned, but that operation should only be used as part of cloning the whole +/// function they belong to. *Cloning an entity list does not allocate new memory for the clone*. +/// It creates an alias of the same memory. +/// +/// Entity lists cannot be hashed and compared for equality because it's not possible to compare the +/// contents of the list without the pool reference. +/// +/// # Implementation +/// +/// The `ListHandle` itself is designed to have the smallest possible footprint. This is important +/// because it is used inside very compact data structures like `InstructionData`. The list +/// contains only a 32-bit index into the pool's memory vector, pointing to the first element of +/// the list. 
+/// +/// The pool is just a single `Vec` containing all of the allocated lists. Each list is +/// represented as three contiguous parts: +/// +/// 1. The number of elements in the list. +/// 2. The list elements. +/// 3. Excess capacity elements. +/// +/// The total size of the three parts is always a power of two, and the excess capacity is always +/// as small as possible. This means that shrinking a list may cause the excess capacity to shrink +/// if a smaller power-of-two size becomes available. +/// +/// Both growing and shrinking a list may cause it to be reallocated in the pool vector. +/// +/// The index stored in an `ListHandle` points to part 2, the list elements. The value 0 is +/// reserved for the empty list which isn't allocated in the vector. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ListHandle { + index: u32, + generation: u32, + len: u32, +} + +/// Create an empty list. +impl Default for ListHandle { + fn default() -> Self { + Self { + index: 0, + generation: 0, + len: 0, + } + } +} + +const MAX_SIZE_CLASS: SizeClass = sclass_for_length(super::MAX_CHAIN_LEN - 1); +const NUM_SIZE_CLASS: usize = MAX_SIZE_CLASS as usize + 1; + +/// A memory pool for storing lists of `T`. +#[derive(Clone, Debug)] +pub struct ListPool { + // The main array containing the lists. + data: Vec, + + // Heads of the free lists, one for each size class. + free: [u32; NUM_SIZE_CLASS], + + generation: u32, +} + +/// Lists are allocated in sizes that are powers of two, starting from 4. +/// Each power of two is assigned a size class number, so the size is `4 << SizeClass`. +type SizeClass = u8; + +/// Get the size of a given size class. The size includes the length field, so the maximum list +/// length is one less than the class size. +#[inline] +const fn sclass_size(sclass: SizeClass) -> usize { + 4 << sclass +} + +/// Get the size class to use for a given list length. +/// This always leaves room for the length element in addition to the list elements. 
+#[inline] +const fn sclass_for_length(len: u32) -> SizeClass { + 30 - (len | 3).leading_zeros() as SizeClass +} + +/// Is `len` the minimum length in its size class? +#[inline] +fn is_sclass_max_length(len: u32) -> bool { + len > 3 && len.is_power_of_two() +} + +impl ListPool { + /// Create a new list pool. + pub fn new(capacity: u32) -> Self { + Self { + data: Vec::with_capacity(capacity as usize), + free: [u32::MAX; NUM_SIZE_CLASS], + generation: 1, + } + } + + /// Clear the pool, forgetting about all lists that use it. + /// + /// This invalidates any existing entity lists that used this pool to allocate memory. + /// + /// The pool's memory is not released to the operating system, but kept around for faster + /// allocation in the future. + pub fn clear(&mut self) { + self.data.clear(); + self.free.fill(u32::MAX); + self.generation += 1; + } + + /// Allocate a storage block with a size given by `sclass`. + /// + /// Returns the first index of an available segment of `self.data` containing + /// `sclass_size(sclass)` elements. The allocated memory is filled with reserved + /// values. + fn alloc(&mut self, sclass: SizeClass) -> usize { + let freelist_head = self.free[sclass as usize]; + // First try the free list for this size class. + if freelist_head == u32::MAX { + // Nothing on the free list. Allocate more memory. + let offset = self.data.len(); + self.data.resize(offset + sclass_size(sclass), u32::MAX); + offset + } else { + // take allocation of the free list (linked list) + self.free[sclass as usize] = self.data[freelist_head as usize]; + freelist_head as usize + } + } + + /// Free a storage block with a size given by `sclass`. + /// + /// This must be a block that was previously allocated by `alloc()` with the same size class. + fn free(&mut self, block: usize, sclass: SizeClass) { + let sclass = sclass as usize; + // Insert the block on the free list which is a single linked list. 
+ self.data[block] = self.free[sclass]; + self.free[sclass] = block as u32 + } + + /// Returns two mutable slices representing the two requested blocks. + /// + /// The two returned slices can be longer than the blocks. Each block is located at the front + /// of the respective slice. + fn mut_slices(&mut self, block0: usize, block1: usize) -> (&mut [u32], &mut [u32]) { + if block0 < block1 { + let (s0, s1) = self.data.split_at_mut(block1); + (&mut s0[block0..], s1) + } else { + let (s1, s0) = self.data.split_at_mut(block0); + (s0, &mut s1[block1..]) + } + } + + /// Reallocate a block to a different size class. + /// + /// Copy `elems_to_copy` elements from the old to the new block. + fn realloc(&mut self, block: usize, from_sclass: SizeClass, to_sclass: SizeClass, elems_to_copy: usize) -> usize { + debug_assert!(elems_to_copy <= sclass_size(from_sclass)); + debug_assert!(elems_to_copy <= sclass_size(to_sclass)); + let new_block = self.alloc(to_sclass); + + let (old, new) = self.mut_slices(block, new_block); + new[0..elems_to_copy].copy_from_slice(&old[0..elems_to_copy]); + + self.free(block, from_sclass); + new_block + } +} + +impl ListHandle { + /// Get the number of elements in the list. + #[allow(clippy::len_without_is_empty)] + pub fn len(&self, pool: &ListPool) -> u32 { + if self.generation == pool.generation { + self.len + } else { + 0 + } + } + + /// Get the list as a slice. + pub fn as_slice<'a>(&'a self, pool: &'a ListPool) -> &'a [u32] { + let idx = self.index as usize; + match self.len(pool) { + 0 => &[], + 1 => std::slice::from_ref(&self.index), + len => &pool.data[idx..idx + len as usize], + } + } + + /// Appends an element to the back of the list. + /// Returns the index where the element was inserted. + pub fn push(&mut self, element: u32, pool: &mut ListPool) { + let len = self.len(pool); + match len { + 0 => { + self.generation = pool.generation; + self.index = element; + self.len = 1; + } + 1 => { + // This is an empty list. 
Allocate a block and set length=1. + let block = pool.alloc(0); + pool.data[block] = self.index; + pool.data[block + 1] = element; + self.index = block as u32; + self.len = 2; + } + 2..=MAX_CHAIN_LEN => { + // Do we need to reallocate? + let block; + let idx = self.index as usize; + if is_sclass_max_length(len) { + // Reallocate, preserving length + all old elements. + let sclass = sclass_for_length(len); + block = pool.realloc(idx, sclass - 1, sclass, len as usize); + self.index = block as u32; + } else { + block = idx; + } + pool.data[block + len as usize] = element; + self.len += 1; + } + + // ignore elements longer then MAX_CHAIN_LEN + // these are rarely relevant and if they are we fall back to myers + _ => (), + } + } +} diff --git a/gix-imara-diff-01/src/intern.rs b/gix-imara-diff-01/src/intern.rs new file mode 100644 index 00000000000..a57b5186fa4 --- /dev/null +++ b/gix-imara-diff-01/src/intern.rs @@ -0,0 +1,183 @@ +use std::hash::{BuildHasher as _, Hash}; +use std::ops::Index; + +use hashbrown::hash_table::{Entry, HashTable}; +use hashbrown::DefaultHashBuilder as RandomState; + +/// A token represented as an interned integer. +/// +/// A token represents the smallest possible unit of change during a diff. +/// For text this is usually a line, a word or a single character. +/// All [algorithms](crate::Algorithm) operate on interned tokens instead +/// of using the token data directly. +/// This allows for much better performance by amortizing the cost of hashing/equality. +/// +/// While you can intern tokens yourself it is strongly recommended to use [`InternedInput`] module. 
+#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)] +#[repr(transparent)] +pub struct Token(pub u32); + +impl From for Token { + fn from(token: u32) -> Self { + Token(token) + } +} + +impl From for u32 { + fn from(token: Token) -> Self { + token.0 + } +} + +pub trait TokenSource { + type Token: Hash + Eq; + type Tokenizer: Iterator; + fn tokenize(&self) -> Self::Tokenizer; + fn estimate_tokens(&self) -> u32; +} + +/// Two lists of interned [tokens](crate::intern::Token) that can be compared with the [`diff`](crate::diff) function. +/// +/// A token represents the smallest possible unit of change during a diff. +/// For text this is usually a line, a word or a single character. +/// All [algorithms](crate::Algorithm) operate on interned tokens instead +/// of using the token data directly. +/// This allows for much better performance by amortizing the cost of hashing/equality. +/// +/// While you can intern tokens yourself it is strongly recommended to use [`InternedInput`] module. +#[derive(Default)] +pub struct InternedInput { + pub before: Vec, + pub after: Vec, + pub interner: Interner, +} + +impl InternedInput { + pub fn clear(&mut self) { + self.before.clear(); + self.after.clear(); + self.interner.clear(); + } +} + +impl InternedInput { + pub fn new>(before: I, after: I) -> Self { + let token_estimate_before = before.estimate_tokens() as usize; + let token_estimate_after = after.estimate_tokens() as usize; + let mut res = Self { + before: Vec::with_capacity(token_estimate_before), + after: Vec::with_capacity(token_estimate_after), + interner: Interner::new(token_estimate_before + token_estimate_after), + }; + res.update_before(before.tokenize()); + res.update_after(after.tokenize()); + res + } + + /// replaces `self.before` with the interned Tokens yielded by `input` + /// Note that this does not erase any tokens from the interner and might therefore be considered + /// a memory leak. 
If this function is called often over a long-running process + /// consider clearing the interner with [`clear`](crate::intern::Interner::clear). + pub fn update_before(&mut self, input: impl Iterator) { + self.before.clear(); + self.before.extend(input.map(|token| self.interner.intern(token))); + } + + /// replaces `self.after` with the interned Tokens yielded by `input` + /// Note that this does not erase any tokens from the interner and might therefore be considered + /// a memory leak. If this function is called often over a long-running process + /// consider clearing the interner with [`clear`](crate::intern::Interner::clear) or + /// [`erase_tokens_after`](crate::intern::Interner::erase_tokens_after). + pub fn update_after(&mut self, input: impl Iterator) { + self.after.clear(); + self.after.extend(input.map(|token| self.interner.intern(token))); + } +} + +/// An interner that allows for fast access of tokens produced by a [`TokenSource`]. +#[derive(Default)] +pub struct Interner { + tokens: Vec, + table: HashTable, + hasher: RandomState, +} + +impl Interner { + /// Create an Interner with an initial capacity calculated by summing the results of calling + /// [`estimate_tokens`](crate::intern::TokenSource::estimate_tokens) methods of `before` and `after`. + pub fn new_for_token_source>(before: &S, after: &S) -> Self { + Self::new(before.estimate_tokens() as usize + after.estimate_tokens() as usize) + } + + /// Create an Interner with initial capacity `capacity`. + pub fn new(capacity: usize) -> Interner { + Interner { + tokens: Vec::with_capacity(capacity), + table: HashTable::with_capacity(capacity), + hasher: RandomState::default(), + } + } + + /// Remove all interned tokens. + pub fn clear(&mut self) { + self.table.clear(); + self.tokens.clear(); + } + + /// Returns the total number of **distinct** tokens currently interned. 
+ pub fn num_tokens(&self) -> u32 { + self.tokens.len() as u32 + } +} + +impl Interner { + /// Intern `token` and return a the interned integer. + pub fn intern(&mut self, token: T) -> Token { + let hash = self.hasher.hash_one(&token); + match self.table.entry( + hash, + |&it| self.tokens[it.0 as usize] == token, + |&token| self.hasher.hash_one(&self.tokens[token.0 as usize]), + ) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let interned = Token(self.tokens.len() as u32); + entry.insert(interned); + self.tokens.push(token); + interned + } + } + } + + /// Erases `first_erased_token` and any tokens interned afterward from the interner. + pub fn erase_tokens_after(&mut self, first_erased_token: Token) { + assert!(first_erased_token.0 <= self.tokens.len() as u32); + let retained = first_erased_token.0 as usize; + let erased = self.tokens.len() - retained; + if retained <= erased { + self.table.clear(); + for (i, token) in self.tokens[0..retained].iter().enumerate() { + let hash = self.hasher.hash_one(token); + self.table.insert_unique(hash, Token(i as u32), |&token| { + self.hasher.hash_one(&self.tokens[token.0 as usize]) + }); + } + } else { + for (i, token) in self.tokens[retained..].iter().enumerate() { + let hash = self.hasher.hash_one(token); + match self.table.find_entry(hash, |token| token.0 == (retained + i) as u32) { + Ok(occupied) => drop(occupied.remove()), + Err(_absent) => unreachable!(), + } + } + } + self.tokens.truncate(first_erased_token.0 as usize); + } +} + +impl Index for Interner { + type Output = T; + fn index(&self, index: Token) -> &Self::Output { + &self.tokens[index.0 as usize] + } +} diff --git a/gix-imara-diff-01/src/lib.rs b/gix-imara-diff-01/src/lib.rs new file mode 100644 index 00000000000..9ff943cf5f6 --- /dev/null +++ b/gix-imara-diff-01/src/lib.rs @@ -0,0 +1,268 @@ +//! Imara-diff is a solid (imara in swahili) diff library for rust. +//! 
Solid refers to the fact that imara-diff provides very good runtime performance even +//! in pathologic cases so that your application never appears to freeze while waiting on a diff. +//! The performance improvements are achieved using battle tested heuristics used in gnu-diff and git +//! that are known to yield fast runtime and performance. +//! +//! Imara-diff is also designed to be flexible so that it can be used with arbitrary collections and +//! not just lists and strings and even allows reusing large parts of the computation when +//! comparing the same file to multiple different files. +//! +//! Imara-diff provides two diff algorithms: +//! +//! * The linear-space variant of the well known [**Myers** algorithm](http://www.xmailserver.org/diff2.pdf) +//! * The **Histogram** algorithm which is a variant of the patience diff algorithm. +//! +//! Myers algorithm has been enhanced with preprocessing and multiple heuristics to ensure fast runtime in pathological +//! cases to avoid quadratic time complexity and closely matches the behaviour of gnu-diff and git. +//! The Histogram algorithm was originally ported from git but has been heavily optimized. +//! The **Histogram algorithm outperforms Myers diff** by 10% - 100% across a **wide variety of workloads**. +//! +//! Imara-diffs algorithms have been benchmarked over a wide variety of real-world code. +//! For example while comparing multiple different linux kernel it performs up to 30 times better than the `similar` crate. +//! +//! # API Overview +//! +//! Imara-diff provides the [`UnifiedDiffBuilder`](crate::UnifiedDiffBuilder) for building +//! a human-readable diff similar to the output of `git diff` or `diff -u`. +//! This makes building a tool similar to gnu diff easy: +//! +//! ``` +//! use gix_imara_diff_01::intern::InternedInput; +//! use gix_imara_diff_01::{diff, Algorithm, UnifiedDiffBuilder}; +//! +//! let before = r#"fn foo() -> Bar { +//! let mut foo = 2; +//! foo *= 50; +//! 
println!("hello world") +//! }"#; +//! +//! let after = r#"// lorem ipsum +//! fn foo() -> Bar { +//! let mut foo = 2; +//! foo *= 50; +//! println!("hello world"); +//! println!("{foo}"); +//! } +//! // foo +//! "#; +//! +//! let input = InternedInput::new(before, after); +//! let diff = diff(Algorithm::Histogram, &input, UnifiedDiffBuilder::new(&input)); +//! assert_eq!( +//! diff, +//! r#"@@ -1,5 +1,8 @@ +//! +// lorem ipsum +//! fn foo() -> Bar { +//! let mut foo = 2; +//! foo *= 50; +//! - println!("hello world") +//! + println!("hello world"); +//! + println!("{foo}"); +//! } +//! +// foo +//! "# +//! ); +//! ``` +//! +//! If you want to process the diff in some way you can provide your own implementation of [`Sink`](crate::sink::Sink). +//! For closures [`Sink`](crate::sink::Sink) is already implemented, so simple [`Sink`]s can be easily added: +//! +//! ``` +//! use std::ops::Range; +//! +//! use gix_imara_diff_01::intern::InternedInput; +//! use gix_imara_diff_01::{diff, Algorithm, UnifiedDiffBuilder}; +//! +//! let before = r#"fn foo() -> Bar { +//! let mut foo = 2; +//! foo *= 50; +//! println!("hello world") +//! }"#; +//! +//! let after = r#"// lorem ipsum +//! fn foo() -> Bar { +//! let mut foo = 2; +//! foo *= 50; +//! println!("hello world"); +//! println!("{foo}"); +//! } +//! // foo +//! "#; +//! +//! let mut insertions = Vec::new(); +//! let mut removals = Vec::new(); +//! let mut replacements = Vec::new(); +//! +//! let input = InternedInput::new(before, after); +//! let sink = |before: Range, after: Range| { +//! let hunk_before: Vec<_> = input.before[before.start as usize..before.end as usize] +//! .iter() +//! .map(|&line| input.interner[line]) +//! .collect(); +//! let hunk_after: Vec<_> = input.after[after.start as usize..after.end as usize] +//! .iter() +//! .map(|&line| input.interner[line]) +//! .collect(); +//! if hunk_after.is_empty() { +//! removals.push(hunk_before) +//! } else if hunk_before.is_empty() { +//! 
insertions.push(hunk_after) +//! } else { +//! replacements.push((hunk_before, hunk_after)) +//! } +//! }; +//! let diff = diff(Algorithm::Histogram, &input, sink); +//! assert_eq!(&insertions, &[vec!["// lorem ipsum"], vec!["// foo"]]); +//! assert!(removals.is_empty()); +//! assert_eq!( +//! &replacements, +//! &[( +//! vec![" println!(\"hello world\")"], +//! vec![" println!(\"hello world\");", " println!(\"{foo}\");"] +//! )] +//! ); +//! ``` +//! +//! For `&str` and `&[u8]` imara-diff will compute a line diff by default. +//! To perform diffs of different tokenizations and collections you can implement the [`TokenSource`](crate::intern::TokenSource) trait. +//! For example the imara-diff provides an alternative tokenizer for line-diffs that includes the line terminator in the line: +//! +//! ``` +//! use gix_imara_diff_01::intern::InternedInput; +//! use gix_imara_diff_01::sink::Counter; +//! use gix_imara_diff_01::sources::lines_with_terminator; +//! use gix_imara_diff_01::{diff, Algorithm, UnifiedDiffBuilder}; +//! +//! let before = "foo"; +//! let after = "foo\n"; +//! +//! let input = InternedInput::new(before, after); +//! let changes = diff(Algorithm::Histogram, &input, Counter::default()); +//! assert_eq!(changes.insertions, 0); +//! assert_eq!(changes.removals, 0); +//! +//! let input = InternedInput::new(lines_with_terminator(before), lines_with_terminator(after)); +//! let changes = diff(Algorithm::Histogram, &input, Counter::default()); +//! assert_eq!(changes.insertions, 1); +//! assert_eq!(changes.removals, 1); +//! ``` + +#[cfg(feature = "unified_diff")] +pub use unified_diff::UnifiedDiffBuilder; + +use crate::intern::{InternedInput, Token, TokenSource}; +pub use crate::sink::Sink; +mod histogram; +pub mod intern; +mod myers; +pub mod sink; +pub mod sources; +#[cfg(feature = "unified_diff")] +mod unified_diff; +mod util; + +#[cfg(test)] +mod tests; + +/// `imara-diff` supports multiple different algorithms +/// for computing an edit sequence. 
+/// These algorithms have different performance and all produce different output. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)] +pub enum Algorithm { + /// A variation of the [`patience` diff algorithm described by Bram Cohen's blog post](https://bramcohen.livejournal.com/73318.html) + /// that uses a histogram to find the least common LCS. + /// Just like the `patience` diff algorithm, this algorithm usually produces + /// more human readable output than myers algorithm. + /// However compared to the `patience` diff algorithm (which is slower than myers algorithm), + /// the Histogram algorithm performs much better. + /// + /// The implementation here was originally ported from `git` but has been significantly + /// modified to improve performance. + /// As a result it consistently **performs better than myers algorithm** (5%-100%) over + /// a wide variety of test data. + /// + /// For pathological subsequences that only contain highly repeating tokens (64+ occurrences) + /// the algorithm falls back on Myers algorithm (with heuristics) to avoid quadratic behavior. + /// + /// Compared to Myers algorithm, the Histogram diff algorithm is more focused on providing + /// human readable diffs instead of minimal diffs. In practice this means that the edit-sequences + /// produced by the histogram diff are often longer than those produced by Myers algorithm. + /// + /// The heuristic used by the histogram diff does not work well for inputs with small (often repeated) + /// tokens. For example **character diffs do not work well** as most (english) text is made up of + /// a fairly small set of characters. The `Histogram` algorithm will automatically detect these cases and + /// fall back to Myers algorithm. However this detection has a nontrivial overhead, so + /// if it's known upfront that the set of tokens is very small `Myers` algorithm should + /// be used instead. 
+ #[default] + Histogram, + /// An implementation of the linear space variant of + /// [Myers `O((N+M)D)` algorithm](http://www.xmailserver.org/diff2.pdf). + /// The algorithm is enhanced with preprocessing that removes + /// tokens that don't occur in the other file at all. + /// Furthermore two heuristics to the middle snake search are implemented + /// that ensure reasonable runtime (mostly linear time complexity) even for large files. + /// + /// Due to the divide and conquer nature of the algorithm + /// the edit sequences produced are still fairly small even when the middle snake + /// search is aborted by a heuristic. + /// However, the produced edit sequences are not guaranteed to be fully minimal. + /// If that property is vital to you, use the `MyersMinimal` algorithm instead. + /// + /// The implementation (including the preprocessing) is mostly + /// ported from `git` and `gnu-diff` where Myers algorithm is used + /// as the default diff algorithm. + /// Therefore the used heuristics have been heavily battle tested and + /// are known to behave well over a large variety of inputs + Myers, + /// Same as `Myers` but the early abort heuristics are disabled to guarantee + /// a minimal edit sequence. + /// This can mean significant slowdown in pathological cases. + MyersMinimal, +} + +impl Algorithm { + #[cfg(test)] + const ALL: [Self; 2] = [Algorithm::Histogram, Algorithm::Myers]; +} + +/// Computes an edit-script that transforms `input.before` into `input.after` using +/// the specified `algorithm`. +/// The edit-script is passed to `sink.process_change` while it is produced. +pub fn diff(algorithm: Algorithm, input: &InternedInput, sink: S) -> S::Out { + diff_with_tokens( + algorithm, + &input.before, + &input.after, + input.interner.num_tokens(), + sink, + ) +} + +/// Computes an edit-script that transforms `before` into `after` using +/// the specified `algorithm`. +/// The edit-script is passed to `sink.process_change` while it is produced. 
+pub fn diff_with_tokens( + algorithm: Algorithm, + before: &[Token], + after: &[Token], + num_tokens: u32, + sink: S, +) -> S::Out { + assert!( + before.len() < i32::MAX as usize, + "imara-diff only supports up to {} tokens", + i32::MAX + ); + assert!( + after.len() < i32::MAX as usize, + "imara-diff only supports up to {} tokens", + i32::MAX + ); + match algorithm { + Algorithm::Histogram => histogram::diff(before, after, num_tokens, sink), + Algorithm::Myers => myers::diff(before, after, num_tokens, sink, false), + Algorithm::MyersMinimal => myers::diff(before, after, num_tokens, sink, true), + } +} diff --git a/gix-imara-diff-01/src/myers.rs b/gix-imara-diff-01/src/myers.rs new file mode 100644 index 00000000000..3407e729db3 --- /dev/null +++ b/gix-imara-diff-01/src/myers.rs @@ -0,0 +1,263 @@ +use std::ptr::NonNull; + +use crate::intern::Token; +use crate::myers::middle_snake::{MiddleSnakeSearch, SearchResult}; +use crate::myers::preprocess::PreprocessedFile; +use crate::myers::slice::FileSlice; +use crate::util::sqrt; +use crate::Sink; + +mod middle_snake; +mod preprocess; +mod slice; + +pub struct Myers { + kvec: NonNull<[i32]>, + kforward: NonNull, + kbackward: NonNull, + max_cost: u32, +} + +pub fn diff(before: &[Token], after: &[Token], _num_tokens: u32, mut sink: S, minimal: bool) -> S::Out { + // preprocess the files by removing parts of the file that are not contained in the other file at all + // this process remaps the token indices and therefore requires us to track changed files in a char array + // PERF use a bitset? 
+ let (mut before, mut after) = preprocess::preprocess(before, after); + + // Perform the actual diff + Myers::new(before.tokens.len(), after.tokens.len()).run( + FileSlice::new(&mut before), + FileSlice::new(&mut after), + minimal, + ); + + process_changes_with_sink(&before, &after, &mut sink); + sink.finish() +} + +const HEUR_MIN_COST: u32 = 256; +const MAX_COST_MIN: u32 = 256; + +impl Drop for Myers { + fn drop(&mut self) { + unsafe { drop(Box::from_raw(self.kvec.as_ptr())) } + } +} + +impl Myers { + fn new(len1: usize, len2: usize) -> Self { + let ndiags = len1 + len2 + 3; + let kvec: *mut [i32] = Box::into_raw(vec![0; 2 * ndiags + 2].into_boxed_slice()); + let (kforward, kbackward) = unsafe { + ( + NonNull::new_unchecked((kvec as *mut i32).add(len2 + 1)), + NonNull::new_unchecked((kvec as *mut i32).add(ndiags + len2 + 1)), + ) + }; + Self { + kvec: unsafe { NonNull::new_unchecked(kvec) }, + kforward, + kbackward, + max_cost: sqrt(ndiags).max(MAX_COST_MIN), + } + } + + fn run<'f>(&mut self, mut file1: FileSlice<'f>, mut file2: FileSlice<'f>, mut need_min: bool) { + loop { + file1.strip_common(&mut file2); + + if file1.is_empty() { + file2.mark_changed(); + return; + } else if file2.is_empty() { + file1.mark_changed(); + return; + } + + let split = self.split(&file1, &file2, need_min); + self.run( + file1.borrow().slice(..split.token_idx1 as u32), + file2.borrow().slice(..split.token_idx2 as u32), + split.minimized_lo, + ); + + file1 = file1.slice(split.token_idx1 as u32..); + file2 = file2.slice(split.token_idx2 as u32..); + need_min = split.minimized_hi + } + } + + /// See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers. + /// Basically considers a "box" (off1, off2, lim1, lim2) and scan from both + /// the forward diagonal starting from (off1, off2) and the backward diagonal + /// starting from (lim1, lim2). If the K values on the same diagonal crosses + /// returns the furthest point of reach. 
We might encounter expensive edge cases + /// using this algorithm, so a little bit of heuristic is needed to cut the + /// search and to return a suboptimal point. + fn split(&mut self, file1: &FileSlice, file2: &FileSlice, need_min: bool) -> Split { + let mut forward_search = unsafe { MiddleSnakeSearch::::new(self.kforward, file1, file2) }; + let mut backwards_search = unsafe { MiddleSnakeSearch::::new(self.kbackward, file1, file2) }; + let is_odd = file2.len().wrapping_sub(file1.len()) & 1 != 0; + + let mut ec = 0; + + while ec <= self.max_cost { + let mut found_snake = false; + forward_search.next_d(); + if is_odd { + if let Some(res) = forward_search.run(file1, file2, |k, token_idx1| { + backwards_search.contains(k) && backwards_search.x_pos_at_diagonal(k) <= token_idx1 + }) { + match res { + SearchResult::Snake => found_snake = true, + SearchResult::Found { token_idx1, token_idx2 } => { + return Split { + token_idx1, + token_idx2, + minimized_lo: true, + minimized_hi: true, + }; + } + } + } + } else { + found_snake |= forward_search.run(file1, file2, |_, _| false).is_some() + }; + + backwards_search.next_d(); + if !is_odd { + if let Some(res) = backwards_search.run(file1, file2, |k, token_idx1| { + forward_search.contains(k) && token_idx1 <= forward_search.x_pos_at_diagonal(k) + }) { + match res { + SearchResult::Snake => found_snake = true, + SearchResult::Found { token_idx1, token_idx2 } => { + return Split { + token_idx1, + token_idx2, + minimized_lo: true, + minimized_hi: true, + }; + } + } + } + } else { + found_snake |= backwards_search.run(file1, file2, |_, _| false).is_some() + }; + + if need_min { + continue; + } + + // If the edit cost is above the heuristic trigger and if + // we got a good snake, we sample current diagonals to see + // if some of them have reached an "interesting" path. Our + // measure is a function of the distance from the diagonal + // corner (i1 + i2) penalized with the distance from the + // mid diagonal itself. 
If this value is above the current + // edit cost times a magic factor (XDL_K_HEUR) we consider + // it interesting. + if found_snake && ec > HEUR_MIN_COST { + if let Some((token_idx1, token_idx2)) = forward_search.found_snake(ec, file1, file2) { + return Split { + token_idx1, + token_idx2, + minimized_lo: true, + minimized_hi: false, + }; + } + + if let Some((token_idx1, token_idx2)) = backwards_search.found_snake(ec, file1, file2) { + return Split { + token_idx1, + token_idx2, + minimized_lo: false, + minimized_hi: true, + }; + } + } + + ec += 1; + } + + let (distance_forward, token_idx1_forward) = forward_search.best_position(file1, file2); + let (distance_backwards, token_idx1_backwards) = backwards_search.best_position(file1, file2); + if distance_forward > file1.len() as isize + file2.len() as isize - distance_backwards { + Split { + token_idx1: token_idx1_forward, + token_idx2: (distance_forward - token_idx1_forward as isize) as i32, + minimized_lo: true, + minimized_hi: false, + } + } else { + Split { + token_idx1: token_idx1_backwards, + token_idx2: (distance_backwards - token_idx1_backwards as isize) as i32, + minimized_lo: false, + minimized_hi: true, + } + } + } +} + +#[derive(Debug)] +struct Split { + token_idx1: i32, + token_idx2: i32, + minimized_lo: bool, + minimized_hi: bool, +} + +/// the mapping performed during preprocessing makes it impossible to directly call +/// the `sink` during the diff itself. 
Instead `file.changed` is set to true for all +/// tokens that are changed +/// below these arrays are used to call the sink function +fn process_changes_with_sink(before: &PreprocessedFile, after: &PreprocessedFile, sink: &mut impl Sink) { + let before_end = before.is_changed.len() as u32 + before.offset; + let after_end = after.is_changed.len() as u32 + after.offset; + + let mut before = before + .is_changed + .iter() + .enumerate() + .map(|(i, removed)| (i as u32 + before.offset, *removed)); + + let mut after = after + .is_changed + .iter() + .enumerate() + .map(|(i, inserted)| (i as u32 + after.offset, *inserted)); + + let mut next1 = before.next(); + let mut next2 = after.next(); + + while let (Some((before_pos, removed)), Some((after_pos, inserted))) = (next1, next2) { + if !(removed | inserted) { + next1 = before.next(); + next2 = after.next(); + continue; + } + + let mut hunk_before = before_pos..before_pos; + let mut hunk_after = after_pos..after_pos; + if removed { + let end = before.find(|(_, changed)| !changed); + next1 = end.map(|(end, _)| (end, false)); + hunk_before.end = end.map_or(before_end, |(end, _)| end); + }; + + if inserted { + let end = after.find(|(_, changed)| !changed); + next2 = end.map(|(end, _)| (end, false)); + hunk_after.end = end.map_or(after_end, |(end, _)| end); + } + + sink.process_change(hunk_before, hunk_after); + } + + if let Some((before_pos, _)) = next1 { + sink.process_change(before_pos..before_end, after_end..after_end); + } else if let Some((after_pos, _)) = next2 { + sink.process_change(before_end..before_end, after_pos..after_end); + } +} diff --git a/gix-imara-diff-01/src/myers/middle_snake.rs b/gix-imara-diff-01/src/myers/middle_snake.rs new file mode 100644 index 00000000000..99fcf0c4d0a --- /dev/null +++ b/gix-imara-diff-01/src/myers/middle_snake.rs @@ -0,0 +1,252 @@ +use std::ptr::NonNull; + +use crate::myers::slice::FileSlice; +use crate::util::{common_postfix, common_prefix}; + +const SNAKE_CNT: u32 = 20; +const 
K_HEUR: u32 = 4; + +pub struct MiddleSnakeSearch { + kvec: NonNull, + kmin: i32, + kmax: i32, + dmin: i32, + dmax: i32, +} + +impl MiddleSnakeSearch { + /// # Safety + /// `data` must be valid for reads and writes between `-file2.len() - 1` and `file1.len() + 1` + pub unsafe fn new(data: NonNull, file1: &FileSlice, file2: &FileSlice) -> Self { + let dmin = -(file2.len() as i32); + let dmax = file1.len() as i32; + let kmid = if BACK { dmin + dmax } else { 0 }; + let mut res = Self { + kvec: data, + kmin: kmid, + kmax: kmid, + dmin, + dmax, + }; + let init = if BACK { file1.len() as i32 } else { 0 }; + res.write_xpos_at_diagonal(kmid, init); + res + } + + pub fn contains(&self, k: i32) -> bool { + (self.kmin..=self.kmax).contains(&k) + } + + pub fn bounds_check(&self, k: i32) { + debug_assert!((self.dmin - 1..=self.dmax + 1).contains(&k)); + } + + fn write_xpos_at_diagonal(&mut self, k: i32, token_idx1: i32) { + self.bounds_check(k); + unsafe { self.kvec.as_ptr().offset(k as isize).write(token_idx1) } + } + + pub fn x_pos_at_diagonal(&self, diagonal: i32) -> i32 { + self.bounds_check(diagonal); + unsafe { self.kvec.as_ptr().offset(diagonal as isize).read() } + } + + pub fn pos_at_diagonal(&self, diagonal: i32) -> (i32, i32) { + self.bounds_check(diagonal); + let token_idx1 = unsafe { self.kvec.as_ptr().offset(diagonal as isize).read() }; + let token_idx2 = token_idx1 - diagonal; + (token_idx1, token_idx2) + } + + /// We need to extend the diagonal "domain" by one. If the next + /// values exits the box boundaries we need to change it in the + /// opposite direction because (max - min) must be a power of + /// two. + /// + /// Also we initialize the external K value to -1 so that we can + /// avoid extra conditions in the check inside the core loop. 
+ pub fn next_d(&mut self) { + let init_val = if BACK { + // value should always be larger then bounds + i32::MAX + } else { + // value should always be smaller then bounds + i32::MIN + }; + + if self.kmin > self.dmin { + self.kmin -= 1; + self.write_xpos_at_diagonal(self.kmin - 1, init_val); + } else { + self.kmin += 1; + } + + if self.kmax < self.dmax { + self.kmax += 1; + self.write_xpos_at_diagonal(self.kmax + 1, init_val); + } else { + self.kmax -= 1; + } + } + + pub fn run( + &mut self, + file1: &FileSlice, + file2: &FileSlice, + mut f: impl FnMut(i32, i32) -> bool, + ) -> Option { + let mut res = None; + let mut k = self.kmax; + while k >= self.kmin { + let mut token_idx1 = if BACK { + if self.x_pos_at_diagonal(k - 1) < self.x_pos_at_diagonal(k + 1) { + self.x_pos_at_diagonal(k - 1) + } else { + self.x_pos_at_diagonal(k + 1) - 1 + } + } else if self.x_pos_at_diagonal(k - 1) >= self.x_pos_at_diagonal(k + 1) { + self.x_pos_at_diagonal(k - 1) + 1 + } else { + self.x_pos_at_diagonal(k + 1) + }; + + let mut token_idx2 = token_idx1 - k; + let off = if BACK { + if token_idx1 > 0 && token_idx2 > 0 { + let tokens1 = &file1.tokens[..token_idx1 as usize]; + let tokens2 = &file2.tokens[..token_idx2 as usize]; + common_postfix(tokens1, tokens2) + } else { + 0 + } + } else if token_idx1 < file1.len() as i32 && token_idx2 < file2.len() as i32 { + let tokens1 = &file1.tokens[token_idx1 as usize..]; + let tokens2 = &file2.tokens[token_idx2 as usize..]; + common_prefix(tokens1, tokens2) + } else { + 0 + }; + + if off > SNAKE_CNT { + res = Some(SearchResult::Snake) + } + + if BACK { + token_idx1 -= off as i32; + token_idx2 -= off as i32; + } else { + token_idx1 += off as i32; + token_idx2 += off as i32; + } + self.write_xpos_at_diagonal(k, token_idx1); + + if f(k, token_idx1) { + return Some(SearchResult::Found { token_idx1, token_idx2 }); + } + + k -= 2; + } + + res + } + + pub fn best_position(&self, file1: &FileSlice, file2: &FileSlice) -> (isize, i32) { + let mut 
best_distance: isize = if BACK { isize::MAX } else { -1 }; + let mut best_token_idx1 = if BACK { i32::MAX } else { -1 }; + let mut k = self.kmax; + while k >= self.kmin { + let mut token_idx1 = self.x_pos_at_diagonal(k); + if BACK { + token_idx1 = token_idx1.max(0); + } else { + token_idx1 = token_idx1.min(file1.len() as i32); + } + let mut token_idx2 = token_idx1 - k; + if BACK { + if token_idx2 < 0 { + token_idx1 = k; + token_idx2 = 0; + } + } else if token_idx2 > file2.len() as i32 { + token_idx1 = file2.len() as i32 + k; + token_idx2 = file2.len() as i32; + } + + let distance = token_idx1 as isize + token_idx2 as isize; + if BACK && distance < best_distance || !BACK && distance > best_distance { + best_distance = distance; + best_token_idx1 = token_idx1; + } + + k -= 2; + } + (best_distance, best_token_idx1) + } + + pub fn found_snake(&self, ec: u32, file1: &FileSlice, file2: &FileSlice) -> Option<(i32, i32)> { + let mut best_score = 0; + let mut best_token_idx1 = 0; + let mut best_token_idx2 = 0; + let mut k = self.kmax; + while k >= self.kmin { + let (token_idx1, token_idx2) = self.pos_at_diagonal(k); + if BACK { + if !(0..file1.len() as i32 - SNAKE_CNT as i32).contains(&token_idx1) { + k -= 2; + continue; + } + if !(0..file2.len() as i32 - SNAKE_CNT as i32).contains(&token_idx2) { + k -= 2; + continue; + } + } else { + if !(SNAKE_CNT as i32..file1.len() as i32).contains(&token_idx1) { + k -= 2; + continue; + } + if !(SNAKE_CNT as i32..file2.len() as i32).contains(&token_idx2) { + k -= 2; + continue; + } + } + + let main_diagonal_distance = k.unsigned_abs() as usize; + let distance = if BACK { + (file1.len() - token_idx1 as u32) + (file2.len() - token_idx2 as u32) + } else { + token_idx1 as u32 + token_idx2 as u32 + }; + let score = distance as usize + main_diagonal_distance; + if score > (K_HEUR * ec) as usize && score > best_score { + let is_snake = if BACK { + file1.tokens[token_idx1 as usize..] 
+ .iter() + .zip(&file2.tokens[token_idx2 as usize..]) + .take(SNAKE_CNT as usize) + .all(|(token1, token2)| token1 == token2) + } else { + file1.tokens[..token_idx1 as usize] + .iter() + .zip(&file2.tokens[..token_idx2 as usize]) + .rev() + .take(SNAKE_CNT as usize) + .all(|(token1, token2)| token1 == token2) + }; + if is_snake { + best_token_idx1 = token_idx1; + best_token_idx2 = token_idx2; + best_score = score + } + } + + k -= 2; + } + + (best_score > 0).then_some((best_token_idx1, best_token_idx2)) + } +} + +pub enum SearchResult { + Snake, + Found { token_idx1: i32, token_idx2: i32 }, +} diff --git a/gix-imara-diff-01/src/myers/preprocess.rs b/gix-imara-diff-01/src/myers/preprocess.rs new file mode 100644 index 00000000000..a0267fa56ef --- /dev/null +++ b/gix-imara-diff-01/src/myers/preprocess.rs @@ -0,0 +1,195 @@ +use crate::intern::Token; +use crate::myers::sqrt; +use crate::util::{strip_common_postfix, strip_common_prefix}; + +pub fn preprocess(mut file1: &[Token], mut file2: &[Token]) -> (PreprocessedFile, PreprocessedFile) { + let common_prefix = strip_common_prefix(&mut file1, &mut file2); + strip_common_postfix(&mut file1, &mut file2); + let (hdiff1, hdiff2) = token_occurrences(file1, file2); + let file1 = PreprocessedFile::new(common_prefix, &hdiff1, file1); + let file2 = PreprocessedFile::new(common_prefix, &hdiff2, file2); + (file1, file2) +} + +/// computes how +fn token_occurrences(file1: &[Token], file2: &[Token]) -> (Vec, Vec) { + const MAX_EQLIMIT: u32 = 1024; + + // compute the limit after which tokens are treated as `Occurrences::COMMON` + let eqlimit1 = sqrt(file1.len()).min(MAX_EQLIMIT); + let eqlimit2 = sqrt(file2.len()).min(MAX_EQLIMIT); + + // first collect how often each token occurs in a file + let mut occurrences1 = Vec::new(); + for token in file1 { + let bucket = token.0 as usize; + if bucket >= occurrences1.len() { + occurrences1.resize(bucket + 1, 0u32); + } + occurrences1[bucket] += 1; + } + + // do the same thing for + let mut 
occurrences2 = Vec::new(); + let token_occurrences2: Vec<_> = file2 + .iter() + .map(|token| { + let bucket = token.0 as usize; + if bucket >= occurrences2.len() { + occurrences2.resize(bucket + 1, 0); + } + occurrences2[bucket] += 1; + let occurrences1 = *occurrences1.get(bucket).unwrap_or(&0); + Occurrences::from_occurrences(occurrences1, eqlimit2) + }) + .collect(); + + let token_occurrences1: Vec<_> = file1 + .iter() + .map(|token| { + let bucket = token.0 as usize; + let occurrences2 = *occurrences2.get(bucket).unwrap_or(&0); + Occurrences::from_occurrences(occurrences2, eqlimit1) + }) + .collect(); + + (token_occurrences1, token_occurrences2) +} + +#[derive(Clone, Copy, Debug)] +enum Occurrences { + /// Token does not occur in this file + None, + /// Token occurs at least once + Some, + /// Token occurs very frequently (exact number depends on file size). + /// Such tokens are usually empty lines or braces and are often not meaningful to a diff + Common, +} + +impl Occurrences { + pub fn from_occurrences(occurrences: u32, eqlimit: u32) -> Occurrences { + if occurrences == 0 { + Occurrences::None + } else if occurrences >= eqlimit { + Occurrences::Common + } else { + Occurrences::Some + } + } +} + +#[derive(Debug)] +pub struct PreprocessedFile { + pub offset: u32, + pub is_changed: Vec, + pub indices: Vec, + pub tokens: Vec, +} + +impl PreprocessedFile { + fn new(offset: u32, token_diff: &[Occurrences], tokens: &[Token]) -> PreprocessedFile { + let mut changed = vec![false; tokens.len()]; + let (tokens, indices) = prune_unmatched_tokens(tokens, token_diff, &mut changed); + PreprocessedFile { + offset, + is_changed: changed, + indices, + tokens, + } + } +} + +fn prune_unmatched_tokens( + file: &[Token], + token_status: &[Occurrences], + changed: &mut [bool], +) -> (Vec, Vec) { + assert_eq!(token_status.len(), file.len()); + file.iter() + .zip(token_status) + .enumerate() + .filter_map(|(i, (&token, &status))| { + let prune = match status { + Occurrences::None 
=> true, + Occurrences::Some => false, + Occurrences::Common => should_prune_common_line(token_status, i), + }; + if prune { + changed[i] = true; + None + } else { + Some((token, i as u32)) + } + }) + .unzip() +} + +// TODO do not unnecessarily rescan lines +fn should_prune_common_line(token_status: &[Occurrences], pos: usize) -> bool { + const WINDOW_SIZE: usize = 100; + + let mut unmatched_before = 0; + let mut common_before = 0; + + let start = pos.saturating_sub(WINDOW_SIZE); + for status in token_status[start..pos].iter().rev() { + match status { + Occurrences::None => { + unmatched_before += 1; + } + Occurrences::Common => { + common_before += 1; + } + Occurrences::Some => break, + } + } + + if unmatched_before == 0 { + return false; + } + + let end = token_status.len().min(pos + WINDOW_SIZE); + let mut unmatched_after = 0; + let mut common_after = 0; + for status in token_status[pos..end].iter() { + match status { + Occurrences::None => { + unmatched_after += 1; + } + Occurrences::Common => { + common_after += 1; + } + Occurrences::Some => break, + } + } + + if unmatched_after == 0 { + return false; + } + + let common = common_before + common_after; + let unmatched = unmatched_before + unmatched_after; + + unmatched > 3 * common +} + +#[cfg(test)] +mod tests { + use super::{should_prune_common_line, Occurrences}; + + #[test] + fn common_line_pruning_ignores_distant_context() { + let mut token_status = vec![Occurrences::Some; 700]; + token_status[100..400].fill(Occurrences::None); + token_status[400..450].fill(Occurrences::None); + token_status[450..500].fill(Occurrences::Common); + token_status[500..550].fill(Occurrences::Common); + token_status[550..600].fill(Occurrences::None); + + assert!( + !should_prune_common_line(&token_status, 500), + "only the last 100 items before the current line should contribute to the backward scan" + ); + } +} diff --git a/gix-imara-diff-01/src/myers/slice.rs b/gix-imara-diff-01/src/myers/slice.rs new file mode 100644 index 
00000000000..526b61505fd --- /dev/null +++ b/gix-imara-diff-01/src/myers/slice.rs @@ -0,0 +1,73 @@ +use std::mem::take; +use std::ops::RangeBounds; + +use crate::intern::Token; +use crate::myers::preprocess::PreprocessedFile; +use crate::util::common_edges; + +#[derive(Default)] +pub struct FileSlice<'a> { + pub tokens: &'a [Token], + indices: &'a [u32], + changed: &'a mut [bool], +} + +impl<'a> FileSlice<'a> { + pub fn new(file: &'a mut PreprocessedFile) -> Self { + Self { + tokens: &file.tokens, + indices: &file.indices, + changed: &mut file.is_changed, + } + } + + pub fn mark_changed(&mut self) { + for &i in self.indices { + self.changed[i as usize] = true; + } + } + + pub fn borrow(&mut self) -> FileSlice<'_> { + FileSlice { + tokens: self.tokens, + changed: self.changed, + indices: self.indices, + } + } + + pub fn slice>(self, range: R) -> Self { + let start = match range.start_bound() { + std::ops::Bound::Included(&start) => start, + std::ops::Bound::Excluded(&start) => start + 1, + std::ops::Bound::Unbounded => 0, + }; + + let end = match range.end_bound() { + std::ops::Bound::Included(&end) => end + 1, + std::ops::Bound::Excluded(&end) => end, + std::ops::Bound::Unbounded => self.len(), + }; + + Self { + tokens: &self.tokens[start as usize..end as usize], + changed: self.changed, + indices: &self.indices[start as usize..end as usize], + } + } + + pub fn strip_common(&mut self, other: &mut Self) { + let (start, common_postfix) = common_edges(self.tokens, other.tokens); + let end = self.len() - common_postfix; + *self = take(self).slice(start..end); + let end = other.len() - common_postfix; + *other = take(other).slice(start..end) + } + + pub fn len(&self) -> u32 { + self.tokens.len() as u32 + } + + pub fn is_empty(&self) -> bool { + self.tokens.is_empty() + } +} diff --git a/gix-imara-diff-01/src/sink.rs b/gix-imara-diff-01/src/sink.rs new file mode 100644 index 00000000000..baa00ed131f --- /dev/null +++ b/gix-imara-diff-01/src/sink.rs @@ -0,0 +1,114 @@ +use 
std::ops::Range; + +/// Trait for processing the edit-scripts computed with [`diff`](crate::diff) +pub trait Sink: Sized { + type Out; + + /// This method is called whenever a diff [`algorithm`](crate::Algorithm) + /// finds a change between the two processed input files. + /// A change is a continuous subsequence of [tokens](crate::intern::Token) `before` that needs + /// to be replaced by a different continuous subsequence of tokens `after` to construct the second file from the first. + /// + /// These token subsequences are passed to this function in **strictly monotonically increasing order**. + /// That means that for two subsequent calls `process_change(before1, after1)` and `process_change(before2, after2)` + /// the following always holds: + /// + /// ``` no_compile + /// assert!(before1.end < before2.start); + /// assert!(after1.end < after2.start); + /// ``` + /// + /// # Parameters + /// - **`before`** - the **position** of the removed token subsequence in the original file. + /// - **`after`** - the **position** of the inserted token subsequence in the destination file. + /// + /// # Notes + //// + /// A `Sink` has no function to indicate that a section of a file remains unchanged. + /// However due to the monotonically increasing calls, implementations can easily determine + /// which subsequences remain unchanged by saving `before.end`/`after.end`. + /// The range between `before.start`/`after.end` and the previous `before.end`/`after.end` + /// is always unchanged. + fn process_change(&mut self, before: Range, after: Range); + + /// This function is called after all calls to `process_change` are complete + /// to obtain the final diff result + fn finish(self) -> Self::Out; + + /// Utility method that constructs a [`Counter`] that tracks the total number + /// of inserted and removed tokens in the changes passed to [`process_change`](crate::Sink::process_change). 
+ fn with_counter(self) -> Counter { + Counter::new(self) + } +} + +impl, Range)> Sink for T { + type Out = (); + + fn process_change(&mut self, before: Range, after: Range) { + self(before, after) + } + + fn finish(self) -> Self::Out {} +} + +impl Sink for () { + type Out = (); + fn process_change(&mut self, _before: Range, _after: Range) {} + fn finish(self) -> Self::Out {} +} + +/// A [`Sink`] which wraps a different sink +/// and counts the number of `removed` and `inserted` [tokens](crate::intern::Token). +pub struct Counter { + /// Total number of recorded inserted [`tokens`](crate::intern::Token). + /// Computed by summing the lengths of the `after` subsequences pass to [`process_change`](crate::Sink::process_change). + pub removals: u32, + /// Total number of recorded inserted [`tokens`](crate::intern::Token). + /// Computed by summing the lengths of the `after` subsequences pass to [`process_change`](crate::Sink::process_change). + pub insertions: u32, + /// The [`Sink`] for which the counter records [`tokens`](crate::intern::Token). + /// All calls to [`process_change`](crate::Sink::process_change) are forwarded to the `sink` by the counter. + /// After [`finish`](crate::Sink::finish) is called, this field contains the output returned by the [`finish`](crate::Sink::finish) + /// method of the wrapped [`Sink`]. 
+ pub wrapped: T, +} + +impl Counter { + pub fn new(sink: S) -> Self { + Self { + insertions: 0, + removals: 0, + wrapped: sink, + } + } +} + +impl Sink for Counter { + type Out = Counter; + fn process_change(&mut self, before: Range, after: Range) { + self.removals += before.end - before.start; + self.insertions += after.end - after.start; + self.wrapped.process_change(before, after) + } + + fn finish(self) -> Self::Out { + Counter { + removals: self.removals, + insertions: self.insertions, + wrapped: self.wrapped.finish(), + } + } +} + +impl Counter { + pub fn total(&self) -> usize { + self.insertions as usize + self.removals as usize + } +} + +impl Default for Counter<()> { + fn default() -> Self { + Counter::new(()) + } +} diff --git a/gix-imara-diff-01/src/sources.rs b/gix-imara-diff-01/src/sources.rs new file mode 100644 index 00000000000..865912d3c43 --- /dev/null +++ b/gix-imara-diff-01/src/sources.rs @@ -0,0 +1,149 @@ +use std::mem::take; +use std::str::from_utf8_unchecked; + +use crate::TokenSource; + +/// Returns a [`TokenSource`] that uses +/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is +/// not included in the emitted tokens. +/// This means that changing the newline separator from `\r\n` to `\n` +/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff). +pub fn lines(data: &str) -> Lines<'_, false> { + Lines(ByteLines(data.as_bytes())) +} + +/// Returns a [`TokenSource`] that uses +/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is +/// included in the emitted tokens. +/// This means that changing the newline separator from `\r\n` to `\n` +/// (or omitting it fully on the last line) is detected by [`diff`](crate::diff). +pub fn lines_with_terminator(data: &str) -> Lines<'_, true> { + Lines(ByteLines(data.as_bytes())) +} + +/// Returns a [`TokenSource`] that uses +/// the lines in `data` as Tokens. 
A line is a contiguous subslice of +/// `data` which does not contain `\n` (or `\r\n`). +/// The newline separator (`\r\n` or `\n`) is included in the emitted tokens. +/// This means that changing the newline separator from `\r\n` to `\n` +/// (or omitting it fully on the last line) is detected by [`diff`](crate::diff). +pub fn byte_lines_with_terminator(data: &[u8]) -> ByteLines<'_, true> { + ByteLines(data) +} + +/// Returns a [`TokenSource`] that uses +/// the lines in `data` as Tokens. The newline separator (`\r\n` or `\n`) is +/// not included in the emitted tokens. +/// This means that changing the newline separator from `\r\n` to `\n` +/// (or omitting it fully on the last line) is not detected by [`diff`](crate::diff). +pub fn byte_lines(data: &[u8]) -> ByteLines<'_, false> { + ByteLines(data) +} + +/// By default, a line diff is produced for a string +impl<'a> TokenSource for &'a str { + type Token = &'a str; + + type Tokenizer = Lines<'a, false>; + + fn tokenize(&self) -> Self::Tokenizer { + lines(self) + } + + fn estimate_tokens(&self) -> u32 { + lines_with_terminator(self).estimate_tokens() + } +} + +/// By default, a line diff is produced for a byte slice +impl<'a> TokenSource for &'a [u8] { + type Token = Self; + type Tokenizer = ByteLines<'a, false>; + + fn tokenize(&self) -> Self::Tokenizer { + byte_lines(self) + } + + fn estimate_tokens(&self) -> u32 { + byte_lines(self).estimate_tokens() + } +} + +/// A [`TokenSource`] that returns the lines of a `str` as tokens. 
+/// See [`lines`] and [`lines_with_terminator`] for details +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct Lines<'a, const INCLUDE_LINE_TERMINATOR: bool>(ByteLines<'a, INCLUDE_LINE_TERMINATOR>); + +impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for Lines<'a, INCLUDE_LINE_TERMINATOR> { + type Item = &'a str; + + fn next(&mut self) -> Option { + // safety invariant: this struct may only contain valid utf8 + // dividing valid utf8 bytes by ascii characters always produces valid utf-8 + self.0.next().map(|it| unsafe { from_utf8_unchecked(it) }) + } +} + +/// By default a line diff is produced for a string +impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource for Lines<'a, INCLUDE_LINE_TERMINATOR> { + type Token = &'a str; + + type Tokenizer = Self; + + fn tokenize(&self) -> Self::Tokenizer { + *self + } + + fn estimate_tokens(&self) -> u32 { + self.0.estimate_tokens() + } +} + +/// A [`TokenSource`] that returns the lines of a byte slice as tokens. +/// See [`byte_lines`] and [`byte_lines_with_terminator`] for details +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct ByteLines<'a, const INCLUDE_LINE_TERMINATOR: bool>(&'a [u8]); + +impl<'a, const INCLUDE_LINE_TERMINATOR: bool> Iterator for ByteLines<'a, INCLUDE_LINE_TERMINATOR> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + let mut saw_carriage_return = false; + let mut iter = self.0.iter().enumerate(); + let line_len = loop { + match iter.next() { + Some((i, b'\n')) => break i + 1, + None => { + return (!self.0.is_empty()).then(|| take(&mut self.0)); + } + Some((_, &it)) => saw_carriage_return = it == b'\r', + } + }; + let (mut line, rem) = self.0.split_at(line_len); + self.0 = rem; + if !INCLUDE_LINE_TERMINATOR { + line = &line[..line_len - 1 - saw_carriage_return as usize]; + } + Some(line) + } +} + +/// By default a line diff is produced for a string +impl<'a, const INCLUDE_LINE_TERMINATOR: bool> TokenSource for ByteLines<'a, INCLUDE_LINE_TERMINATOR> { + type Token = &'a [u8]; + + 
type Tokenizer = Self; + + fn tokenize(&self) -> Self::Tokenizer { + *self + } + + fn estimate_tokens(&self) -> u32 { + let len: usize = self.take(20).map(|line| line.len()).sum(); + if len == 0 { + 100 + } else { + (self.0.len() * 20 / len) as u32 + } + } +} diff --git a/gix-imara-diff-01/src/tests.rs b/gix-imara-diff-01/src/tests.rs new file mode 100644 index 00000000000..c7e4d3e17e9 --- /dev/null +++ b/gix-imara-diff-01/src/tests.rs @@ -0,0 +1,271 @@ +use std::mem::swap; + +use expect_test::expect; + +use crate::intern::InternedInput; +use crate::{diff, Algorithm, UnifiedDiffBuilder}; + +#[test] +fn replace() { + let before = r#"fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + println!("hello world") +}"#; + + let after = r#"const TEST: i32 = 0; +fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + println!("hello world"); + println!("hello foo {TEST}"); +} + +"#; + let input = InternedInput::new(before, after); + for algorithm in Algorithm::ALL { + println!("{algorithm:?}"); + let diff = diff(algorithm, &input, UnifiedDiffBuilder::new(&input)); + expect![[r#" + @@ -1,5 +1,8 @@ + +const TEST: i32 = 0; + fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + - println!("hello world") + + println!("hello world"); + + println!("hello foo {TEST}"); + } + + + "#]] + .assert_eq(&diff); + } +} + +#[test] +fn identical_files() { + let file = r#"fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; +}"#; + + for algorithm in Algorithm::ALL { + println!("{algorithm:?}"); + let input = InternedInput::new(file, file); + let diff = diff(algorithm, &input, UnifiedDiffBuilder::new(&input)); + assert_eq!(diff, ""); + } +} + +#[test] +fn simple_insert() { + let before = r#"fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; +}"#; + + let after = r#"fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + println("hello world") +}"#; + + let mut input = InternedInput::new(before, after); + for algorithm in Algorithm::ALL { + println!("{algorithm:?}"); + 
let res = diff(algorithm, &input, UnifiedDiffBuilder::new(&input)); + expect![[r#" + @@ -1,4 +1,5 @@ + fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + + println("hello world") + } + "#]] + .assert_eq(&res); + + swap(&mut input.before, &mut input.after); + + let res = diff(algorithm, &input, UnifiedDiffBuilder::new(&input)); + expect![[r#" + @@ -1,5 +1,4 @@ + fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + - println("hello world") + } + "#]] + .assert_eq(&res); + + swap(&mut input.before, &mut input.after); + } +} + +#[test] +#[cfg(not(miri))] +fn hand_checked_udiffs() { + let before = r#"use crate::{ + alpha::Alpha, + beta::Beta, + gamma::Gamma, +}; + +use std::{ + collections::{HashMap, HashSet}, + path::Path, +}; + +pub struct Engine { + cache: HashMap, + steps: Vec<&'static str>, +} + +impl Engine { + pub fn new() -> Self { + Self { + cache: HashMap::new(), + steps: vec!["parse", "render"], + } + } + + pub fn update(&mut self, path: &Path) { + let _ = path; + self.steps.push("scan"); + } +} + +fn unchanged_one() { + println!("one"); +} + +fn unchanged_two() { + println!("two"); +} + +pub enum Error { + InvalidPath, + Unknown, +} + +pub struct Layer { + pub depth: usize, +} + +impl Layer { + pub fn parse(&self) -> Result<(), Error> { + Ok(()) + } +} +"#; + let after = r#"use crate::{ + alpha::Alpha, + beta::Beta, + gamma::Gamma, +}; + +use std::{ + collections::HashMap, + mem::replace, + path::Path, +}; + +pub struct Engine { + cache: HashMap, + steps: Vec<&'static str>, + dirty: bool, +} + +impl Engine { + pub fn new() -> Self { + Self { + cache: HashMap::new(), + steps: vec!["parse", "render"], + dirty: false, + } + } + + pub fn update(&mut self, path: &Path) { + let _previous = replace(&mut self.dirty, true); + let _ = path; + self.steps.push("scan"); + } +} + +fn unchanged_one() { + println!("one"); +} + +fn unchanged_two() { + println!("two"); +} + +pub enum Error { + InvalidPath, + InvalidState, + Unknown, +} + +pub struct Layer { + pub 
depth: u32, +} + +impl Layer { + pub fn parse(&self) -> Result<(), Error> { + Ok(()) + } +} +"#; + + for algorithm in Algorithm::ALL { + println!("{algorithm:?}"); + let input = InternedInput::new(before, after); + let diff = diff(algorithm, &input, UnifiedDiffBuilder::new(&input)); + expect![[r#" +@@ -5,13 +5,15 @@ + }; + + use std::{ +- collections::{HashMap, HashSet}, ++ collections::HashMap, ++ mem::replace, + path::Path, + }; + + pub struct Engine { + cache: HashMap, + steps: Vec<&'static str>, ++ dirty: bool, + } + + impl Engine { +@@ -19,10 +21,12 @@ + Self { + cache: HashMap::new(), + steps: vec!["parse", "render"], ++ dirty: false, + } + } + + pub fn update(&mut self, path: &Path) { ++ let _previous = replace(&mut self.dirty, true); + let _ = path; + self.steps.push("scan"); + } +@@ -38,11 +42,12 @@ + + pub enum Error { + InvalidPath, ++ InvalidState, + Unknown, + } + + pub struct Layer { +- pub depth: usize, ++ pub depth: u32, + } + + impl Layer { +"#]] + .assert_eq(&diff); + } +} diff --git a/gix-imara-diff-01/src/unified_diff.rs b/gix-imara-diff-01/src/unified_diff.rs new file mode 100644 index 00000000000..4087d383682 --- /dev/null +++ b/gix-imara-diff-01/src/unified_diff.rs @@ -0,0 +1,135 @@ +use std::fmt::{Display, Write}; +use std::ops::Range; + +use crate::intern::{InternedInput, Interner, Token}; +use crate::Sink; + +/// A [`Sink`] that creates a textual diff +/// in the format typically output by git or gnu-diff if the `-u` option is used +pub struct UnifiedDiffBuilder<'a, W, T> +where + W: Write, + T: Display, +{ + before: &'a [Token], + after: &'a [Token], + interner: &'a Interner, + + pos: u32, + before_hunk_start: u32, + after_hunk_start: u32, + before_hunk_len: u32, + after_hunk_len: u32, + + buffer: String, + dst: W, +} + +impl<'a, T> UnifiedDiffBuilder<'a, String, T> +where + T: Display, +{ + /// Create a new `UnifiedDiffBuilder` for the given `input`, + /// that will return a [`String`]. 
+ pub fn new(input: &'a InternedInput) -> Self { + Self { + before_hunk_start: 0, + after_hunk_start: 0, + before_hunk_len: 0, + after_hunk_len: 0, + buffer: String::with_capacity(8), + dst: String::new(), + interner: &input.interner, + before: &input.before, + after: &input.after, + pos: 0, + } + } +} + +impl<'a, W, T> UnifiedDiffBuilder<'a, W, T> +where + W: Write, + T: Display, +{ + /// Create a new `UnifiedDiffBuilder` for the given `input`, + /// that will write its output to the provided implementation of [`Write`]. + pub fn with_writer(input: &'a InternedInput, writer: W) -> Self { + Self { + before_hunk_start: 0, + after_hunk_start: 0, + before_hunk_len: 0, + after_hunk_len: 0, + buffer: String::with_capacity(8), + dst: writer, + interner: &input.interner, + before: &input.before, + after: &input.after, + pos: 0, + } + } + + fn print_tokens(&mut self, tokens: &[Token], prefix: char) { + for &token in tokens { + writeln!(&mut self.buffer, "{prefix}{}", self.interner[token]).unwrap(); + } + } + + fn flush(&mut self) { + if self.before_hunk_len == 0 && self.after_hunk_len == 0 { + return; + } + + let end = (self.pos + 3).min(self.before.len() as u32); + self.update_pos(end, end); + + writeln!( + &mut self.dst, + "@@ -{},{} +{},{} @@", + self.before_hunk_start + 1, + self.before_hunk_len, + self.after_hunk_start + 1, + self.after_hunk_len, + ) + .unwrap(); + write!(&mut self.dst, "{}", &self.buffer).unwrap(); + self.buffer.clear(); + self.before_hunk_len = 0; + self.after_hunk_len = 0 + } + + fn update_pos(&mut self, print_to: u32, move_to: u32) { + self.print_tokens(&self.before[self.pos as usize..print_to as usize], ' '); + let len = print_to - self.pos; + self.pos = move_to; + self.before_hunk_len += len; + self.after_hunk_len += len; + } +} + +impl Sink for UnifiedDiffBuilder<'_, W, T> +where + W: Write, + T: Display, +{ + type Out = W; + + fn process_change(&mut self, before: Range, after: Range) { + if before.start - self.pos > 6 { + self.flush(); + 
self.pos = before.start - 3; + self.before_hunk_start = self.pos; + self.after_hunk_start = after.start - 3; + } + self.update_pos(before.start, before.end); + self.before_hunk_len += before.end - before.start; + self.after_hunk_len += after.end - after.start; + self.print_tokens(&self.before[before.start as usize..before.end as usize], '-'); + self.print_tokens(&self.after[after.start as usize..after.end as usize], '+'); + } + + fn finish(mut self) -> Self::Out { + self.flush(); + self.dst + } +} diff --git a/gix-imara-diff-01/src/util.rs b/gix-imara-diff-01/src/util.rs new file mode 100644 index 00000000000..fc944a1f4ec --- /dev/null +++ b/gix-imara-diff-01/src/util.rs @@ -0,0 +1,48 @@ +use crate::intern::Token; + +pub fn common_prefix(file1: &[Token], file2: &[Token]) -> u32 { + let mut off = 0; + for (token1, token2) in file1.iter().zip(file2) { + if token1 != token2 { + break; + } + off += 1; + } + off +} + +pub fn common_postfix(file1: &[Token], file2: &[Token]) -> u32 { + let mut off = 0; + for (token1, token2) in file1.iter().rev().zip(file2.iter().rev()) { + if token1 != token2 { + break; + } + off += 1; + } + off +} + +pub fn common_edges(file1: &[Token], file2: &[Token]) -> (u32, u32) { + let prefix = common_prefix(file1, file2); + let postfix = common_postfix(&file1[prefix as usize..], &file2[prefix as usize..]); + (prefix, postfix) +} + +pub fn strip_common_prefix(file1: &mut &[Token], file2: &mut &[Token]) -> u32 { + let off = common_prefix(file1, file2); + *file1 = &file1[off as usize..]; + *file2 = &file2[off as usize..]; + off +} + +pub fn strip_common_postfix(file1: &mut &[Token], file2: &mut &[Token]) -> u32 { + let off = common_postfix(file1, file2); + *file1 = &file1[..file1.len() - off as usize]; + *file2 = &file2[..file2.len() - off as usize]; + off +} + +pub fn sqrt(val: usize) -> u32 { + let nbits = (usize::BITS - val.leading_zeros()) / 2; + 1 << nbits +} diff --git a/gix-imara-diff/.gitattributes b/gix-imara-diff/.gitattributes new file 
mode 100644 index 00000000000..92b18997c33 --- /dev/null +++ b/gix-imara-diff/.gitattributes @@ -0,0 +1,3 @@ +*.before text eol=lf +*.after text eol=lf +*.diff text eol=lf diff --git a/gix-imara-diff/.gitignore b/gix-imara-diff/.gitignore new file mode 100644 index 00000000000..b9810132ad0 --- /dev/null +++ b/gix-imara-diff/.gitignore @@ -0,0 +1,2 @@ +/target +/bench_data diff --git a/gix-imara-diff/Cargo.toml b/gix-imara-diff/Cargo.toml new file mode 100644 index 00000000000..c1dd67246f8 --- /dev/null +++ b/gix-imara-diff/Cargo.toml @@ -0,0 +1,36 @@ +[package] +name = "gix-imara-diff" +version = "0.2.0" +edition = "2021" +authors = ["pascalkuthe "] +rust-version = "1.71" +license = "Apache-2.0" + +description = "A high performance library for computing diffs." +repository = "https://github.com/pascalkuthe/imara-diff" +keywords = ["diff", "difference", "myers", "compare", "changes"] +readme = "README.md" +exclude = [ + "tests", + "bench_data", + "plt.py", +] + +[dependencies] +hashbrown = { version = ">=0.15,<=0.16", default-features = false, features = ["default-hasher", "inline-more"] } +memchr = "2.7.4" + +[features] +default = ["unified_diff"] +unified_diff = [] + +[dev-dependencies] +cov-mark = "2.1.0" +expect-test = "1.4.0" + +[profile.release] +debug = true + +# [[bench]] +# name = "git_repo" +# harness = false diff --git a/gix-imara-diff/LICENSE b/gix-imara-diff/LICENSE new file mode 100644 index 00000000000..16fe87b06e8 --- /dev/null +++ b/gix-imara-diff/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the 
following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright [yyyy] [name of copyright owner] + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/gix-imara-diff/README.md b/gix-imara-diff/README.md new file mode 100644 index 00000000000..2e3f251bca5 --- /dev/null +++ b/gix-imara-diff/README.md @@ -0,0 +1,115 @@ +# imara-diff + +[![crates.io](https://img.shields.io/crates/v/imara-diff?style=flat-square)](https://crates.io/crates/imara-diff) +[![crates.io](https://img.shields.io/docsrs/imara-diff?style=flat-square)](https://docs.rs/imara-diff/latest/imara_diff/) +![crates.io](https://img.shields.io/crates/l/imara-diff?style=flat-square) + +`imara-diff` is a solid (imara in swahili) diff library for rust. +Solid refers to the fact that imara-diff provides very good runtime performance even +in pathological cases so that your application never appears to freeze while waiting on a diff. 
+The performance improvements are achieved using battle tested heuristics used in gnu-diff and git +that are known to perform well while still providing good results. + +`imara-diff` is also designed to be flexible so that it can be used with arbitrary collections and +not just lists and strings and even allows reusing large parts of the computation when +comparing the same file to multiple different files. + +`imara-diff` provides two diff algorithms: + +* The linear-space variant of the well known [Myers algorithm](http://www.xmailserver.org/diff2.pdf) +* The **Histogram** algorithm which is a variant of the patience diff algorithm. + +Myers algorithm has been enhanced with preprocessing and multiple heuristics to ensure fast runtime in pathological +cases to avoid quadratic time complexity and closely matches the behavior of gnu-diff and git. +The histogram algorithm was originally ported from git but has been heavily optimized. +The **Histogram algorithm outperforms Myers algorithm** by 10% - 100% across a **wide variety of workloads**. + +## Limitations + +Even with the optimizations in this crate, performing a large diff without any tokenization (like character diff for a string) does not perform well. +To work around this problem a diff of the entire file with large tokens (like lines for a string) can be performed first. +The `Sink` implementation can then perform fine-grained diff on changed regions. +Note that this fine-grained diff should not be performed for pure insertions, pure deletions and very large changes. + +In an effort to improve performance, `imara-diff` makes heavy use of pointer compression. +That means that it can only support files with at most `2^31 - 2` tokens. +This should be rarely an issue in practice for textual diffs, because most (large) real-world files +have an average line-length of at least 8. +That means that this limitation only becomes a problem for files above 16GB while performing line-diffs. 
+
+## Benchmarks
+
+The most used diffing libraries in the rust ecosystem are [similar](https://crates.io/crates/similar) and [dissimilar](https://crates.io/crates/dissimilar).
+The fastest diff implementation both of these offer is a simple implementation of Myers algorithm
+without preprocessing or additional heuristics.
+As these implementations are very similar only `similar` was included in the benchmark.
+
+To provide a benchmark that reflects real-world workloads, the git history of different open source projects was used.
+For each repo two (fairly different) tags were chosen.
+A tree diff is performed with [gitoxide](https://github.com/Byron/gitoxide) and the pairs of files that should be saved are stored in memory.
+The diffs collected using this method are often fairly large, because the repositories are compared over a large span of time.
+Therefore, the tree diffs of the last 30 commits before the tag (equivalent of `git diff TAG^ TAG`, `git diff TAG^^ TAG^`) were also used to include smaller diffs.
+
+The benchmark measures the runtime of performing a **line diff** between the collected files.
+As a measure of complexity for each change `(M + N) D` was used where `M` and `N` are the lengths of the two compared files
+and `D` is the length of the edit script required to transform these files into each other (determined with Myers algorithm).
+This complexity measure is used to divide the changes into 10 batches.
+The time to compute the line diffs in each batch was benchmarked.
+
+The plots below show the runtime for each **average** complexity (runtime is normalized by the number of diffs).
+Note that these plots are shown in logarithmic scale due to the large runtime of `similar` for complex diffs.
+Furthermore, to better highlight the performance of the Histogram algorithm, the speedup of the Histogram algorithm
+compared to the Myers algorithm is shown separately. 
+
+* [Linux](#linux)
+* [Rust](#rust)
+* [VSCode](#vscode)
+* [Helix](#helix)
+
+### Linux
+
+The sourcecode of the linux kernel.
+
+- **Repo** - https://kernel.org
+- **Tags** - `v5.7` and `v6.0`
+
+### Rust
+
+The sourcecode of the rust compiler, standard library and various related tooling.
+
+- **Repo** - https://github.com/rust-lang/rust
+- **Tags** - `1.50.0` and `1.64.0`
+
+### VSCode
+
+The sourcecode of the vscode editor.
+
+- **Repo** - https://github.com/microsoft/vscode
+- **Tags** - `1.41.0` and `1.72.2`
+
+### Helix
+
+The sourcecode of the helix editor.
+
+- **Repo** - https://github.com/helix-editor/helix
+- **Tags** - `v0.5.0` and `22.08.1`
+
+## Testing
+
+`imara-diff` includes comprehensive fuzz testing using [cargo-fuzz](https://rust-fuzz.github.io/book/cargo-fuzz.html) to ensure robustness against arbitrary inputs. Fuzz tests cover all major algorithms (Myers, Histogram, MyersMinimal), postprocessing with different heuristics, and unified diff printing. See the [fuzz/README.md](fuzz/README.md) for more details on running fuzz tests locally.
+
+## Stability Policy
+
+`imara-diff` uses [Semantic Versioning (SemVer)](https://semver.org/).
+All non-breaking changes to the public rust API will cause a minor `SemVer` bump.
+All breaking changes to the public rust API will cause a major `SemVer` bump.
+Changes in the produced diffs are also considered breaking changes if the produced diff was valid.
+If the produced diff was invalid the change will be considered a bugfix.
+
+Additionally all changes to the minimum stable rust version (MSRV) are also considered breaking changes.
+The current **MSRV is 1.71**.
+`imara-diff` will roughly follow the MSRV of Firefox (stable) to remain
+compatible with many platforms that try to include its latest version.
+To predict future changes to the MSRV the [Firefox documentation] can be consulted. 
+ +[Firefox documentation]: https://firefox-source-docs.mozilla.org/writing-rust-code/update-policy.html diff --git a/gix-imara-diff/fuzz/.gitignore b/gix-imara-diff/fuzz/.gitignore new file mode 100644 index 00000000000..1a45eee7760 --- /dev/null +++ b/gix-imara-diff/fuzz/.gitignore @@ -0,0 +1,4 @@ +target +corpus +artifacts +coverage diff --git a/gix-imara-diff/fuzz/Cargo.toml b/gix-imara-diff/fuzz/Cargo.toml new file mode 100644 index 00000000000..b629af139b0 --- /dev/null +++ b/gix-imara-diff/fuzz/Cargo.toml @@ -0,0 +1,48 @@ +[package] +name = "imara-diff-fuzz" +version = "0.0.0" +publish = false +edition = "2021" + +[package.metadata] +cargo-fuzz = true + +[dependencies] +libfuzzer-sys = { version = "0.4", features = ["arbitrary-derive"] } + +[dependencies.imara-diff] +package = "gix-imara-diff" +path = ".." +features = ["unified_diff"] + +[[bin]] +name = "comprehensive_diff" +path = "fuzz_targets/comprehensive_diff.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "diff_compute_with" +path = "fuzz_targets/diff_compute_with.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "postprocess_heuristics" +path = "fuzz_targets/postprocess_heuristics.rs" +test = false +doc = false +bench = false + +[[bin]] +name = "unified_diff_printer" +path = "fuzz_targets/unified_diff_printer.rs" +test = false +doc = false +bench = false + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] diff --git a/gix-imara-diff/fuzz/README.md b/gix-imara-diff/fuzz/README.md new file mode 100644 index 00000000000..dae0670d109 --- /dev/null +++ b/gix-imara-diff/fuzz/README.md @@ -0,0 +1,41 @@ +# Fuzz Testing + +This directory contains fuzz tests for imara-diff using [cargo-fuzz](https://rust-fuzz.github.io/book/cargo-fuzz.html). 
+ +## Running Fuzz Tests + +### Prerequisites +- Nightly Rust toolchain: `rustup install nightly` +- cargo-fuzz: `cargo install cargo-fuzz` + +### Running a specific target +```bash +# Run for a specific time (e.g., 60 seconds) +cargo +nightly fuzz run comprehensive_diff -- -max_total_time=60 + +# Run with a specific number of runs +cargo +nightly fuzz run comprehensive_diff -- -runs=1000000 +``` + +### Running all targets +```bash +for target in comprehensive_diff diff_compute_with postprocess_heuristics unified_diff_printer; do + cargo +nightly fuzz run --release $target -- -max_total_time=60 +done +``` + +### Analyzing coverage +```bash +cargo +nightly fuzz coverage comprehensive_diff +``` + +## CI Integration + +Fuzz tests are automatically run in CI for 3 minutes total (45 seconds per target × 4 targets) to ensure no regressions in robustness. + +## Adding New Fuzz Targets + +1. Create a new file in `fuzz_targets/` directory +2. Add a new `[[bin]]` entry in `Cargo.toml` +3. Update the CI workflow to run the new target +4. 
Update this README with a description of the new target diff --git a/gix-imara-diff/fuzz/fuzz_targets/comprehensive_diff.rs b/gix-imara-diff/fuzz/fuzz_targets/comprehensive_diff.rs new file mode 100644 index 00000000000..26642dcb61f --- /dev/null +++ b/gix-imara-diff/fuzz/fuzz_targets/comprehensive_diff.rs @@ -0,0 +1,72 @@ +#![no_main] + +use imara_diff::{Algorithm, Diff, InternedInput}; +use libfuzzer_sys::fuzz_target; + +use libfuzzer_sys::arbitrary; + +#[derive(arbitrary::Arbitrary, Debug)] +struct Input<'a> { + before: &'a [u8], + before_str: &'a str, + after: &'a [u8], + after_str: &'a str, +} + +/// Tests all three diff algorithms (Myers, Histogram, MyersMinimal) with: +/// - Computing diffs on arbitrary string inputs +/// - Postprocessing with no heuristic and line heuristic +/// - Unified diff printing +/// - Basic queries (count_additions, count_removals, is_added, is_removed) +/// - Hunks iteration +fn do_fuzz( + Input { + before, + before_str, + after, + after_str, + }: Input<'_>, +) { + // Create interned input + let input = InternedInput::new(before, after); + + // Test all three diff algorithms + for algorithm in [ + Algorithm::Histogram, + Algorithm::Myers, + Algorithm::MyersMinimal, + ] { + // Compute diff + let mut diff = Diff::compute(algorithm, &input); + + // Test basic queries + let _ = diff.count_additions(); + let _ = diff.count_removals(); + + // Test hunks iteration + for hunk in diff.hunks() { + let _ = hunk.is_pure_insertion(); + let _ = hunk.is_pure_removal(); + let _ = hunk.invert(); + } + + // Test postprocessing with no heuristic + diff.postprocess_no_heuristic(&input); + + // Test postprocessing with line heuristic + diff.postprocess_lines(&input); + } + + let input = InternedInput::new(before_str, after_str); + let mut word_input = InternedInput::default(); + let mut word_diff = Diff::default(); + + let diff = Diff::compute(Algorithm::Myers, &input); + for hunk in diff.hunks() { + hunk.latin_word_diff(&input, &mut word_input, &mut 
word_diff); + } +} + +fuzz_target!(|input: Input<'_>| { + do_fuzz(input); +}); diff --git a/gix-imara-diff/fuzz/fuzz_targets/diff_compute_with.rs b/gix-imara-diff/fuzz/fuzz_targets/diff_compute_with.rs new file mode 100644 index 00000000000..c81300b82d6 --- /dev/null +++ b/gix-imara-diff/fuzz/fuzz_targets/diff_compute_with.rs @@ -0,0 +1,68 @@ +#![no_main] + +use imara_diff::{Algorithm, Diff, Token}; +use libfuzzer_sys::fuzz_target; + +/// Tests the lower-level `compute_with` API that works directly with Token sequences: +/// - Creating arbitrary token sequences +/// - Computing diffs with all algorithms +/// - Querying individual token states +/// - Iterating through hunks +fn do_fuzz(data: &[u8]) { + // Test the lower-level compute_with API that works directly with tokens + if data.len() < 4 { + return; + } + + // Use first two bytes to determine before/after lengths + let before_len = (data[0] as usize % 100).min(data.len() / 2); + let after_len = (data[1] as usize % 100).min(data.len() / 2); + + // Create token sequences from remaining bytes + let mut before_tokens = Vec::new(); + let mut after_tokens = Vec::new(); + + for i in 0..before_len { + if i + 2 < data.len() { + before_tokens.push(Token::from(data[i + 2] as u32 % 256)); + } + } + + for i in 0..after_len { + if i + 2 + before_len < data.len() { + after_tokens.push(Token::from(data[i + 2 + before_len] as u32 % 256)); + } + } + + // Test all algorithms with compute_with + for algorithm in [ + Algorithm::Histogram, + Algorithm::Myers, + Algorithm::MyersMinimal, + ] { + let mut diff = Diff::default(); + diff.compute_with(algorithm, &before_tokens, &after_tokens, 256); + + // Test basic queries + let _ = diff.count_additions(); + let _ = diff.count_removals(); + + // Test is_removed and is_added for valid indices + for i in 0..before_tokens.len() as u32 { + let _ = diff.is_removed(i); + } + for i in 0..after_tokens.len() as u32 { + let _ = diff.is_added(i); + } + + // Test hunks + for hunk in diff.hunks() { + 
let _ = hunk.is_pure_insertion(); + let _ = hunk.is_pure_removal(); + } + } +} + +fuzz_target!(|data: &[u8]| { + do_fuzz(data); +}); diff --git a/gix-imara-diff/fuzz/fuzz_targets/postprocess_heuristics.rs b/gix-imara-diff/fuzz/fuzz_targets/postprocess_heuristics.rs new file mode 100644 index 00000000000..5e8e01b12bd --- /dev/null +++ b/gix-imara-diff/fuzz/fuzz_targets/postprocess_heuristics.rs @@ -0,0 +1,68 @@ +#![no_main] + +use imara_diff::{Algorithm, Diff, IndentHeuristic, IndentLevel, InternedInput}; +use libfuzzer_sys::fuzz_target; + +use libfuzzer_sys::arbitrary; + +#[derive(arbitrary::Arbitrary, Debug)] +struct Input<'a> { + before: &'a str, + after: &'a str, + ident_level: u8, +} + +/// Tests postprocessing with different heuristics: +/// - No heuristic +/// - Line heuristic (default indent-based) +/// - Custom indent heuristic with different tab sizes +/// - Validates hunk ranges are valid after postprocessing +fn do_fuzz( + Input { + before, + after, + ident_level, + }: Input<'_>, +) { + let input = InternedInput::new(before, after); + + // Test with different algorithms + for algorithm in [Algorithm::Histogram, Algorithm::Myers] { + let mut diff = Diff::compute(algorithm, &input); + + // Test postprocess with no heuristic + diff.postprocess_no_heuristic(&input); + let _ = diff.count_additions(); + let _ = diff.count_removals(); + + // Test postprocess with line heuristic + let mut diff2 = Diff::compute(algorithm, &input); + diff2.postprocess_lines(&input); + let _ = diff2.count_additions(); + let _ = diff2.count_removals(); + + // Test postprocess with custom indent heuristic + let mut diff3 = Diff::compute(algorithm, &input); + diff3.postprocess_with_heuristic( + &input, + IndentHeuristic::new(|token| { + IndentLevel::for_ascii_line( + input.interner[token].as_bytes().iter().copied(), + ident_level, + ) + }), + ); + let _ = diff3.count_additions(); + let _ = diff3.count_removals(); + + // Verify hunks are valid after postprocessing + for hunk in 
diff.hunks() { + assert!(hunk.before.start <= hunk.before.end); + assert!(hunk.after.start <= hunk.after.end); + } + } +} + +fuzz_target!(|input: Input<'_>| { + do_fuzz(input); +}); diff --git a/gix-imara-diff/fuzz/fuzz_targets/unified_diff_printer.rs b/gix-imara-diff/fuzz/fuzz_targets/unified_diff_printer.rs new file mode 100644 index 00000000000..1ab3a316e19 --- /dev/null +++ b/gix-imara-diff/fuzz/fuzz_targets/unified_diff_printer.rs @@ -0,0 +1,70 @@ +#![no_main] + +use imara_diff::{Algorithm, BasicLineDiffPrinter, Diff, InternedInput, UnifiedDiffConfig}; +use libfuzzer_sys::arbitrary; +use libfuzzer_sys::fuzz_target; + +/// Valid prefixes for unified diff output lines +const VALID_DIFF_LINE_PREFIXES: [char; 4] = [' ', '+', '-', '@']; + +#[derive(arbitrary::Arbitrary, Debug)] +struct Input<'a> { + before: &'a str, + after: &'a str, + context_len: u32, +} + +/// Tests unified diff printing with: +/// - Different context lengths (0-10) +/// - Various input combinations +/// - Validates output format (lines start with ' ', '+', '-', or '@') +fn do_fuzz( + Input { + before, + after, + context_len, + }: Input<'_>, +) { + let input = InternedInput::new(before, after); + + // Test with different algorithms + for algorithm in [ + Algorithm::Histogram, + Algorithm::Myers, + Algorithm::MyersMinimal, + ] { + let mut diff = Diff::compute(algorithm, &input); + + // Postprocess before printing + diff.postprocess_lines(&input); + + // Create printer and config + let printer = BasicLineDiffPrinter(&input.interner); + let mut config = UnifiedDiffConfig::default(); + config.context_len(context_len); + + // Generate unified diff + let unified = diff.unified_diff(&printer, config, &input); + let output = unified.to_string(); + + // Basic sanity checks on output + // It should be valid UTF-8 (already guaranteed by to_string) + // Lines should start with valid diff prefixes + for line in output.lines() { + if !line.is_empty() { + let first_char = line.chars().next().unwrap(); + // 
Should be a valid diff line prefix + assert!( + VALID_DIFF_LINE_PREFIXES.contains(&first_char), + "Invalid diff line prefix: '{}' in line: '{}'", + first_char, + line + ); + } + } + } +} + +fuzz_target!(|input: Input<'_>| { + do_fuzz(input); +}); diff --git a/gix-imara-diff/src/histogram.rs b/gix-imara-diff/src/histogram.rs new file mode 100644 index 00000000000..4bb88151e8d --- /dev/null +++ b/gix-imara-diff/src/histogram.rs @@ -0,0 +1,106 @@ +use crate::histogram::lcs::find_lcs; +use crate::histogram::list_pool::{ListHandle, ListPool}; +use crate::intern::Token; +use crate::myers; + +mod lcs; +mod list_pool; + +/// Maximum number of occurrences tracked for a single token. +/// Tokens appearing more frequently fall back to Myers algorithm. +const MAX_CHAIN_LEN: u32 = 63; + +/// State for computing histogram-based diffs. +struct Histogram { + /// Tracks where each token appears in the "before" sequence. + token_occurrences: Vec, + /// Memory pool for efficiently storing occurrence lists. + pool: ListPool, +} + +/// Computes a diff using the histogram algorithm. +/// +/// # Parameters +/// +/// * `before` - The token sequence from the first file, before changes. +/// * `after` - The token sequence from the second file, after changes. 
+/// * `removed` - Output array marking removed tokens +/// * `added` - Output array marking added tokens +/// * `num_tokens` - The total number of distinct tokens +pub fn diff(before: &[Token], after: &[Token], removed: &mut [bool], added: &mut [bool], num_tokens: u32) { + let mut histogram = Histogram::new(num_tokens); + histogram.run(before, after, removed, added); +} + +impl Histogram { + fn new(num_buckets: u32) -> Histogram { + Histogram { + token_occurrences: vec![ListHandle::default(); num_buckets as usize], + pool: ListPool::new(2 * num_buckets), + } + } + + fn clear(&mut self) { + self.pool.clear(); + } + + fn token_occurrences(&self, token: Token) -> &[u32] { + self.token_occurrences[token.0 as usize].as_slice(&self.pool) + } + + fn num_token_occurrences(&self, token: Token) -> u32 { + self.token_occurrences[token.0 as usize].len(&self.pool) + } + + fn populate(&mut self, file: &[Token]) { + for (i, &token) in file.iter().enumerate() { + self.token_occurrences[token.0 as usize].push(i as u32, &mut self.pool); + } + } + + fn run(&mut self, mut before: &[Token], mut after: &[Token], mut removed: &mut [bool], mut added: &mut [bool]) { + loop { + if before.is_empty() { + added.fill(true); + return; + } else if after.is_empty() { + removed.fill(true); + return; + } + + self.populate(before); + match find_lcs(before, after, self) { + // no lcs was found, that means that file1 and file2 two have nothing in common + Some(lcs) if lcs.len == 0 => { + added.fill(true); + removed.fill(true); + return; + } + Some(lcs) => { + self.run( + &before[..lcs.before_start as usize], + &after[..lcs.after_start as usize], + &mut removed[..lcs.before_start as usize], + &mut added[..lcs.after_start as usize], + ); + + // this is equivalent to (tail) recursion but implement as a loop for efficiency reasons + let before_end = lcs.before_start + lcs.len; + before = &before[before_end as usize..]; + removed = &mut removed[before_end as usize..]; + + let after_end = lcs.after_start + 
lcs.len; + after = &after[after_end as usize..]; + added = &mut added[after_end as usize..]; + } + None => { + // we are diffing two extremely large repetitive files + // this is a worst case for histogram diff with O(N^2) performance + // fallback to myers to maintain linear time complexity + myers::diff(before, after, removed, added, false); + return; + } + } + } + } +} diff --git a/gix-imara-diff/src/histogram/lcs.rs b/gix-imara-diff/src/histogram/lcs.rs new file mode 100644 index 00000000000..7c797ffc726 --- /dev/null +++ b/gix-imara-diff/src/histogram/lcs.rs @@ -0,0 +1,140 @@ +use crate::histogram::{Histogram, MAX_CHAIN_LEN}; +use crate::intern::Token; + +/// Finds the longest common subsequence (LCS) using a histogram-based approach. +/// +/// Returns `None` if the sequences are highly repetitive and should fall back to Myers. +pub(super) fn find_lcs(before: &[Token], after: &[Token], histogram: &mut Histogram) -> Option { + let mut search = LcsSearch { + lcs: Lcs::default(), + min_occurrences: MAX_CHAIN_LEN + 1, + found_cs: false, + }; + search.run(before, after, histogram); + if search.success() { + Some(search.lcs) + } else { + None + } +} + +/// Represents a longest common subsequence found by the histogram algorithm. +#[derive(Default, Debug)] +pub struct Lcs { + /// Starting position in the "before" sequence. + pub before_start: u32, + /// Starting position in the "after" sequence. + pub after_start: u32, + /// Length of the common subsequence. + pub len: u32, +} + +/// State for searching for the longest common subsequence. +pub struct LcsSearch { + /// The best LCS found so far. + lcs: Lcs, + /// The minimum occurrence count of tokens in the best LCS. + min_occurrences: u32, + /// Whether any common subsequence was found. 
+ found_cs: bool, +} + +impl LcsSearch { + fn run(&mut self, before: &[Token], after: &[Token], histogram: &mut Histogram) { + let mut pos = 0; + while let Some(&token) = after.get(pos as usize) { + if histogram.num_token_occurrences(token) != 0 { + self.found_cs = true; + if histogram.num_token_occurrences(token) <= self.min_occurrences { + pos = self.update_lcs(pos, token, histogram, before, after); + continue; + } + } + + pos += 1; + } + + histogram.clear(); + } + + fn success(&mut self) -> bool { + !self.found_cs || self.min_occurrences <= MAX_CHAIN_LEN + } + + fn update_lcs( + &mut self, + after_pos: u32, + token: Token, + histogram: &Histogram, + before: &[Token], + after: &[Token], + ) -> u32 { + let mut next_token_idx2 = after_pos + 1; + let mut occurrences_iter = histogram.token_occurrences(token).iter().copied(); + let mut token_idx1 = occurrences_iter.next().unwrap(); + + 'occurrences_iter: loop { + let mut occurrences = histogram.num_token_occurrences(token); + let mut start1 = token_idx1; + let mut start2 = after_pos; + loop { + if start1 == 0 || start2 == 0 { + break; + } + let token1 = before.get(start1 as usize - 1); + let token2 = after.get(start2 as usize - 1); + if matches!((token1, token2), (Some(token1), Some(token2)) if token1 == token2) { + start1 -= 1; + start2 -= 1; + let new_occurrences = histogram.num_token_occurrences(before[start1 as usize]); + occurrences = occurrences.min(new_occurrences); + } else { + break; + } + } + + let mut end1 = token_idx1 + 1; + let mut end2 = after_pos + 1; + loop { + let token1 = before.get(end1 as usize); + let token2 = after.get(end2 as usize); + if matches!((token1, token2), (Some(token1), Some(token2)) if token1 == token2) { + let new_occurrences = histogram.num_token_occurrences(before[end1 as usize]); + occurrences = occurrences.min(new_occurrences); + end1 += 1; + end2 += 1; + } else { + break; + } + } + + if next_token_idx2 < end2 { + next_token_idx2 = end2; + } + + let len = end2 - start2; + 
debug_assert_eq!(len, end1 - start1); + if self.lcs.len < len || self.min_occurrences > occurrences { + self.min_occurrences = occurrences; + self.lcs = Lcs { + before_start: start1, + after_start: start2, + len, + }; + } + + loop { + if let Some(next_token_idx) = occurrences_iter.next() { + if next_token_idx > end2 { + token_idx1 = next_token_idx; + break; + } + } else { + break 'occurrences_iter; + } + } + } + + next_token_idx2 + } +} diff --git a/gix-imara-diff/src/histogram/list_pool.rs b/gix-imara-diff/src/histogram/list_pool.rs new file mode 100644 index 00000000000..98472bcc580 --- /dev/null +++ b/gix-imara-diff/src/histogram/list_pool.rs @@ -0,0 +1,256 @@ +use crate::histogram::MAX_CHAIN_LEN; + +/// A small list of entity references allocated from a pool. +/// +/// An `ListHandle` type provides similar functionality to `Vec`, but with some important +/// differences in the implementation: +/// +/// 1. Memory is allocated from a `ListPool` instead of the global heap. +/// 2. The footprint of an entity list is 4 bytes, compared with the 24 bytes for `Vec`. +/// 3. An entity list doesn't implement `Drop`, leaving it to the pool to manage memory. +/// +/// The list pool is intended to be used as a LIFO allocator. After building up a larger data +/// structure with many list references, the whole thing can be discarded quickly by clearing the +/// pool. +/// +/// # Safety +/// +/// Entity lists are not as safe to use as `Vec`, but they never jeopardize Rust's memory safety +/// guarantees. These are the problems to be aware of: +/// +/// - If you lose track of an entity list, its memory won't be recycled until the pool is cleared. +/// This can cause the pool to grow very large with leaked lists. +/// - If entity lists are used after their pool is cleared, they may contain garbage data, and +/// modifying them may corrupt other lists in the pool. +/// - If an entity list is used with two different pool instances, both pools are likely to become +/// corrupted. 
+/// +/// Entity lists can be cloned, but that operation should only be used as part of cloning the whole +/// function they belong to. *Cloning an entity list does not allocate new memory for the clone*. +/// It creates an alias of the same memory. +/// +/// Entity lists cannot be hashed and compared for equality because it's not possible to compare the +/// contents of the list without the pool reference. +/// +/// # Implementation +/// +/// The `ListHandle` itself is designed to have the smallest possible footprint. This is important +/// because it is used inside very compact data structures like `InstructionData`. The list +/// contains only a 32-bit index into the pool's memory vector, pointing to the first element of +/// the list. +/// +/// The pool is just a single `Vec` containing all of the allocated lists. Each list is +/// represented as three contiguous parts: +/// +/// 1. The number of elements in the list. +/// 2. The list elements. +/// 3. Excess capacity elements. +/// +/// The total size of the three parts is always a power of two, and the excess capacity is always +/// as small as possible. This means that shrinking a list may cause the excess capacity to shrink +/// if a smaller power-of-two size becomes available. +/// +/// Both growing and shrinking a list may cause it to be reallocated in the pool vector. +/// +/// The index stored in an `ListHandle` points to part 2, the list elements. The value 0 is +/// reserved for the empty list which isn't allocated in the vector. +#[derive(Clone, Debug, PartialEq, Eq)] +pub struct ListHandle { + index: u32, + generation: u32, + len: u32, +} + +/// Create an empty list. +impl Default for ListHandle { + fn default() -> Self { + Self { + index: 0, + generation: 0, + len: 0, + } + } +} + +const MAX_SIZE_CLASS: SizeClass = sclass_for_length(super::MAX_CHAIN_LEN - 1); +const NUM_SIZE_CLASS: usize = MAX_SIZE_CLASS as usize + 1; + +/// A memory pool for storing lists of `T`. 
+#[derive(Clone, Debug)] +pub struct ListPool { + // The main array containing the lists. + data: Vec, + + // Heads of the free lists, one for each size class. + free: [u32; NUM_SIZE_CLASS], + + generation: u32, +} + +/// Lists are allocated in sizes that are powers of two, starting from 4. +/// Each power of two is assigned a size class number, so the size is `4 << SizeClass`. +type SizeClass = u8; + +/// Get the size of a given size class. The size includes the length field, so the maximum list +/// length is one less than the class size. +#[inline] +const fn sclass_size(sclass: SizeClass) -> usize { + 4 << sclass +} + +/// Get the size class to use for a given list length. +/// This always leaves room for the length element in addition to the list elements. +#[inline] +const fn sclass_for_length(len: u32) -> SizeClass { + 30 - (len | 3).leading_zeros() as SizeClass +} + +/// Is `len` the minimum length in its size class? +#[inline] +fn is_sclass_max_length(len: u32) -> bool { + len > 3 && len.is_power_of_two() +} + +impl ListPool { + /// Create a new list pool. + pub fn new(capacity: u32) -> Self { + Self { + data: Vec::with_capacity(capacity as usize), + free: [u32::MAX; NUM_SIZE_CLASS], + generation: 1, + } + } + + /// Clear the pool, forgetting about all lists that use it. + /// + /// This invalidates any existing entity lists that used this pool to allocate memory. + /// + /// The pool's memory is not released to the operating system, but kept around for faster + /// allocation in the future. + pub fn clear(&mut self) { + self.data.clear(); + self.free.fill(u32::MAX); + self.generation += 1; + } + + /// Allocate a storage block with a size given by `sclass`. + /// + /// Returns the first index of an available segment of `self.data` containing + /// `sclass_size(sclass)` elements. The allocated memory is filled with reserved + /// values. 
+ fn alloc(&mut self, sclass: SizeClass) -> usize { + let freelist_head = self.free[sclass as usize]; + // First try the free list for this size class. + if freelist_head == u32::MAX { + // Nothing on the free list. Allocate more memory. + let offset = self.data.len(); + self.data.resize(offset + sclass_size(sclass), u32::MAX); + offset + } else { + // take allocation of the free list (linked list) + self.free[sclass as usize] = self.data[freelist_head as usize]; + freelist_head as usize + } + } + + /// Free a storage block with a size given by `sclass`. + /// + /// This must be a block that was previously allocated by `alloc()` with the same size class. + fn free(&mut self, block: usize, sclass: SizeClass) { + let sclass = sclass as usize; + // Insert the block on the free list which is a single linked list. + self.data[block] = self.free[sclass]; + self.free[sclass] = block as u32 + } + + /// Returns two mutable slices representing the two requested blocks. + /// + /// The two returned slices can be longer than the blocks. Each block is located at the front + /// of the respective slice. + fn mut_slices(&mut self, block0: usize, block1: usize) -> (&mut [u32], &mut [u32]) { + if block0 < block1 { + let (s0, s1) = self.data.split_at_mut(block1); + (&mut s0[block0..], s1) + } else { + let (s1, s0) = self.data.split_at_mut(block0); + (s0, &mut s1[block1..]) + } + } + + /// Reallocate a block to a different size class. + /// + /// Copy `elems_to_copy` elements from the old to the new block. 
+ fn realloc(&mut self, block: usize, from_sclass: SizeClass, to_sclass: SizeClass, elems_to_copy: usize) -> usize { + debug_assert!(elems_to_copy <= sclass_size(from_sclass)); + debug_assert!(elems_to_copy <= sclass_size(to_sclass)); + let new_block = self.alloc(to_sclass); + + let (old, new) = self.mut_slices(block, new_block); + new[0..elems_to_copy].copy_from_slice(&old[0..elems_to_copy]); + + self.free(block, from_sclass); + new_block + } +} + +impl ListHandle { + /// Get the number of elements in the list. + #[allow(clippy::len_without_is_empty)] + pub fn len(&self, pool: &ListPool) -> u32 { + if self.generation == pool.generation { + self.len + } else { + 0 + } + } + + /// Get the list as a slice. + pub fn as_slice<'a>(&'a self, pool: &'a ListPool) -> &'a [u32] { + let idx = self.index as usize; + match self.len(pool) { + 0 => &[], + 1 => std::slice::from_ref(&self.index), + len => &pool.data[idx..idx + len as usize], + } + } + + /// Appends an element to the back of the list. + /// Returns the index where the element was inserted. + pub fn push(&mut self, element: u32, pool: &mut ListPool) { + let len = self.len(pool); + match len { + 0 => { + self.generation = pool.generation; + self.index = element; + self.len = 1; + } + 1 => { + // This is an empty list. Allocate a block and set length=1. + let block = pool.alloc(0); + pool.data[block] = self.index; + pool.data[block + 1] = element; + self.index = block as u32; + self.len = 2; + } + 2..=MAX_CHAIN_LEN => { + // Do we need to reallocate? + let block; + let idx = self.index as usize; + if is_sclass_max_length(len) { + // Reallocate, preserving length + all old elements. 
+ let sclass = sclass_for_length(len); + block = pool.realloc(idx, sclass - 1, sclass, len as usize); + self.index = block as u32; + } else { + block = idx; + } + pool.data[block + len as usize] = element; + self.len += 1; + } + + // ignore elements longer then MAX_CHAIN_LEN + // these are rarely relevant and if they are we fall back to myers + _ => (), + } + } +} diff --git a/gix-imara-diff/src/intern.rs b/gix-imara-diff/src/intern.rs new file mode 100644 index 00000000000..3435820b024 --- /dev/null +++ b/gix-imara-diff/src/intern.rs @@ -0,0 +1,255 @@ +use std::hash::{BuildHasher as _, Hash}; +use std::ops::Index; + +use hashbrown::hash_table::{Entry, HashTable}; +use hashbrown::DefaultHashBuilder as RandomState; + +/// A token represented as an interned integer. +/// +/// A token represents the smallest possible unit of change during a diff. +/// For text this is usually a line, a word or a single character. +/// All [algorithms](crate::Algorithm) operate on interned tokens instead +/// of using the token data directly. +/// This allows for much better performance by amortizing the cost of hashing/equality. +/// +/// While you can intern tokens yourself it is strongly recommended to use [`InternedInput`] module. +#[derive(PartialEq, Eq, Hash, Clone, Copy, Debug)] +#[repr(transparent)] +pub struct Token(pub u32); + +impl From for Token { + fn from(token: u32) -> Self { + Token(token) + } +} + +impl From for u32 { + fn from(token: Token) -> Self { + token.0 + } +} + +/// A trait for types that can be split into tokens for diffing. +/// +/// Implementing this trait allows a type to be used with [`InternedInput`] to create +/// interned token sequences for computing diffs. For example, `&str` implements this trait +/// by default to split text into lines. +pub trait TokenSource { + /// The type of token this source produces. Must be hashable and comparable for equality. + type Token: Hash + Eq; + /// An iterator that yields tokens from this source. 
+ type Tokenizer: Iterator; + /// Creates an iterator that yields all tokens from this source. + fn tokenize(&self) -> Self::Tokenizer; + /// Provides an estimate of the number of tokens this source will produce. + /// + /// This is used to pre-allocate memory for better performance. The estimate + /// does not need to be exact. + fn estimate_tokens(&self) -> u32; +} + +/// Two lists of interned [tokens](Token) that a [`Diff`](crate::Diff) can be computed from. +/// +/// A token represents the smallest possible unit of change during a diff. +/// For text this is usually a line, a word or a single character. +/// All [algorithms](crate::Algorithm) operate on interned tokens instead +/// of using the token data directly. +/// This allows for much better performance by amortizing the cost of hashing/equality. +/// +/// While you can intern tokens yourself it is strongly recommended to use [`InternedInput`] module. +#[derive(Default)] +pub struct InternedInput { + /// The list of interned tokens from the first sequence (before changes). + pub before: Vec, + /// The list of interned tokens from the second sequence (after changes). + pub after: Vec, + /// The interner that stores the actual token data and maps tokens to their interned IDs. + pub interner: Interner, +} + +impl InternedInput { + /// Clears all token sequences and the interner. + /// + /// This removes all tokens from both the before and after sequences, as well as + /// clearing the interner's storage. + /// + /// Note that this will not free the allocated memory. + pub fn clear(&mut self) { + self.before.clear(); + self.after.clear(); + self.interner.clear(); + } +} + +impl InternedInput { + /// Creates a new `InternedInput` by tokenizing and interning two token sources. 
+ /// + /// # Parameters + /// + /// * `before` - The token source for the first sequence + /// * `after` - The token source for the second sequence + /// + /// # Returns + /// + /// An `InternedInput` containing interned token sequences ready for diffing + pub fn new>(before: I, after: I) -> Self { + let token_estimate_before = before.estimate_tokens() as usize; + let token_estimate_after = after.estimate_tokens() as usize; + let mut res = Self { + before: Vec::with_capacity(token_estimate_before), + after: Vec::with_capacity(token_estimate_after), + interner: Interner::new(token_estimate_before + token_estimate_after), + }; + res.update_before(before.tokenize()); + res.update_after(after.tokenize()); + res + } + + /// Reserve capacity so that `before` and `after` would not need + /// to allocate if their [`estimate_tokens`](TokenSource::estimate_tokens) + /// would represent an exact match of their actual tokens. + /// + /// Useful for minimization of allocation before calls to + /// [`update_before`](InternedInput::update_before) and + /// [`update_after`](InternedInput::update_after). + pub fn reserve_for_token_source + ?Sized>(&mut self, before: &S, after: &S) { + self.reserve(before.estimate_tokens(), after.estimate_tokens()) + } + + /// Reserves capacity for the specified number of tokens in each sequence. + /// + /// # Parameters + /// + /// * `capacity_before` - The number of tokens to reserve for the "before" sequence + /// * `capacity_after` - The number of tokens to reserve for the "after" sequence + pub fn reserve(&mut self, capacity_before: u32, capacity_after: u32) { + self.before.reserve(capacity_before as usize); + self.after.reserve(capacity_after as usize); + self.interner + .reserve(capacity_before as usize + capacity_after as usize); + } + + /// replaces `self.before` with the interned Tokens yielded by `input` + /// Note that this does not erase any tokens from the interner and might therefore be considered + /// a memory leak. 
If this function is called often over a long-running process + /// consider clearing the interner with [`clear`](Interner::clear). + pub fn update_before(&mut self, input: impl Iterator) { + self.before.clear(); + self.before.extend(input.map(|token| self.interner.intern(token))); + } + + /// replaces `self.before` with the interned Tokens yielded by `input` + /// Note that this does not erase any tokens from the interner and might therefore be considered + /// a memory leak. If this function is called often over a long-running process + /// consider clearing the interner with [`clear`](Interner::clear) or + /// [`erase_tokens_after`](Interner::erase_tokens_after). + pub fn update_after(&mut self, input: impl Iterator) { + self.after.clear(); + self.after.extend(input.map(|token| self.interner.intern(token))); + } +} + +/// An interner that allows for fast access of tokens produced by a [`TokenSource`]. +#[derive(Default)] +pub struct Interner { + tokens: Vec, + table: HashTable, + hasher: RandomState, +} + +impl Interner { + /// Create an Interner with an initial capacity calculated by summing the results of calling + /// [`estimate_tokens`](TokenSource::estimate_tokens) methods of `before` and `after`. + pub fn new_for_token_source>(before: &S, after: &S) -> Self { + Self::new(before.estimate_tokens() as usize + after.estimate_tokens() as usize) + } + + /// Create an Interner with initial capacity `capacity`. + pub fn new(capacity: usize) -> Interner { + Interner { + tokens: Vec::with_capacity(capacity), + table: HashTable::with_capacity(capacity), + hasher: RandomState::default(), + } + } + + /// Remove all interned tokens. + pub fn clear(&mut self) { + self.table.clear(); + self.tokens.clear(); + } + + /// Returns to total number of **distinct** tokens currently interned. 
+ pub fn num_tokens(&self) -> u32 { + self.tokens.len() as u32 + } +} + +impl Interner { + /// Create an Interner with an initial capacity calculated by calling + /// [`estimate_tokens`](TokenSource::estimate_tokens) methods of `before` and `after` + pub fn reserve_for_token_source>(&mut self, before: &S, after: &S) { + self.reserve(before.estimate_tokens() as usize + after.estimate_tokens() as usize) + } + + /// Reserves capacity for at least the specified number of additional tokens. + /// + /// # Parameters + /// + /// * `capacity` - The number of additional tokens to reserve space for + pub fn reserve(&mut self, capacity: usize) { + self.table + .reserve(capacity, |&token| self.hasher.hash_one(&self.tokens[token.0 as usize])); + self.tokens.reserve(capacity); + } + + /// Intern `token` and return the interned integer. + pub fn intern(&mut self, token: T) -> Token { + let hash = self.hasher.hash_one(&token); + match self.table.entry( + hash, + |&it| self.tokens[it.0 as usize] == token, + |&token| self.hasher.hash_one(&self.tokens[token.0 as usize]), + ) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let interned = Token(self.tokens.len() as u32); + entry.insert(interned); + self.tokens.push(token); + interned + } + } + } + + /// Erases `first_erased_token` and any tokens interned afterward from the interner. 
+ pub fn erase_tokens_after(&mut self, first_erased_token: Token) { + assert!(first_erased_token.0 <= self.tokens.len() as u32); + let retained = first_erased_token.0 as usize; + let erased = self.tokens.len() - retained; + if retained <= erased { + self.table.clear(); + for (i, token) in self.tokens[0..retained].iter().enumerate() { + let hash = self.hasher.hash_one(token); + self.table.insert_unique(hash, Token(i as u32), |&token| { + self.hasher.hash_one(&self.tokens[token.0 as usize]) + }); + } + } else { + for (i, token) in self.tokens[retained..].iter().enumerate() { + let hash = self.hasher.hash_one(token); + match self.table.find_entry(hash, |token| token.0 == (retained + i) as u32) { + Ok(occupied) => drop(occupied.remove()), + Err(_absent) => unreachable!(), + } + } + } + self.tokens.truncate(first_erased_token.0 as usize); + } +} + +impl Index for Interner { + type Output = T; + fn index(&self, index: Token) -> &Self::Output { + &self.tokens[index.0 as usize] + } +} diff --git a/gix-imara-diff/src/lib.rs b/gix-imara-diff/src/lib.rs new file mode 100644 index 00000000000..4b31f2d800f --- /dev/null +++ b/gix-imara-diff/src/lib.rs @@ -0,0 +1,521 @@ +#![deny(missing_docs)] +//! Imara-diff is a solid (imara in Swahili) diff library for Rust. +//! Solid refers to the fact that imara-diff provides very good runtime performance even +//! in pathological cases so that your application never appears to freeze while waiting on a diff. +//! The performance improvements are achieved using battle tested heuristics used in gnu-diff and git +//! that are known to yield fast runtime and performance. +//! +//! Imara-diff is also designed to be flexible so that it can be used with arbitrary collections and +//! not just lists and strings and even allows reusing large parts of the computation when +//! comparing the same file to multiple different files. +//! +//! Imara-diff provides two diff algorithms: +//! +//! 
* The linear-space variant of the well known [**Myers** algorithm](http://www.xmailserver.org/diff2.pdf) +//! * The **Histogram** algorithm which is a variant of the patience diff algorithm. +//! +//! Myers algorithm has been enhanced with preprocessing and multiple heuristics to ensure fast runtime in pathological +//! cases to avoid quadratic time complexity and closely matches the behavior of gnu-diff and git. +//! The Histogram algorithm was originally ported from git but has been heavily optimized. +//! The **Histogram algorithm outperforms Myers diff** by 10% - 100% across a **wide variety of workloads**. +//! +//! Imara-diffs algorithms have been benchmarked over a wide variety of real-world code. +//! For example, while comparing multiple different Linux kernel versions, it performs up to 30 times better than the `similar` crate. +//! +//! # API Overview +//! +//! ## Preparing the input +//! To compute a diff, an input sequence is required. `imara-diff` computes diffs on abstract +//! sequences represented as a slice of IDs/tokens: [`Token`]. To create +//! such a sequence from your input type (for example, text), the input needs to be interned. +//! For that `imara-diff` provides utilities in the form of the [`InternedInput`] struct and +//! the `TokenSource` trait to construct it. [`InternedInput`] contains the two sides of +//! the diff (used while computing the diff). As well as the interner that allows mapping +//! back tokens to their original data. +//! +//! The most common use case for diff is comparing text. `&str` implements `TokenSource` +//! by default to segment the text into lines. So creating an input for a text-based diff usually +//! looks something like the following: +//! +//! ``` +//! # use gix_imara_diff::InternedInput; +//! # +//! let before = "abc\ndef"; +//! let after = "abc\ndefg"; +//! let input = InternedInput::new(before, after); +//! assert_eq!(input.interner[input.before[0]], "abc\n"); +//! ``` +//! +//! 
Note that interning inputs is optional, and you could choose a different strategy +//! for creating a sequence of tokens. Instead of using the [`Diff::compute`] function, +//! [`Diff::compute_with`] can be used to provide a list of tokens directly, entirely +//! bypassing the interning step. +//! +//! ## Computing the Diff +//! +//! A diff of two sequences is represented by the [`Diff`] struct and computed by +//! [`Diff::compute`] / [`Diff::compute_with`]. An algorithm can also be chosen here. +//! In most situations, [`Algorithm::Histogram`] is a good choice; refer to the docs +//! of [`Algorithm`] for more details. +//! +//! After the initial computation, the diff can be *postprocessed*. If the diff is shown +//! to a human in some way (even indirectly), you always want to use this. +//! +//! However, when only counting the number of changed tokens quickly, this can be skipped. +//! The postprocessing allows you to provide your own +//! heuristic for selecting a slider position. An indentation-based heuristic is provided, +//! which is a good fit for all text-based line diffs. The internals of the heuristic are +//! public, so a tweaked heuristic can be built on top. +//! +//! ``` +//! # use gix_imara_diff::{InternedInput, Diff, Algorithm}; +//! # +//! let before = "abc\ndef"; +//! let after = "abc\ndefg"; +//! let input = InternedInput::new(before, after); +//! let mut diff = Diff::compute(Algorithm::Histogram, &input); +//! diff.postprocess_lines(&input); +//! assert!(!diff.is_removed(0) && !diff.is_added(0)); +//! assert!(diff.is_removed(1) && diff.is_added(1)); +//! ``` +//! +//! ## Accessing results +//! +//! [`Diff`] allows querying whether a particular position was removed/added on either +//! side of the diff with [`Diff::is_removed`] / [`Diff::is_added`]. The number +//! of additions/removals can be quickly counted with [`Diff::count_removals`] / +//! [`Diff::count_additions`]. The most powerful/useful interface is the hunk iterator +//! 
[`Diff::hunks`], which returns a list of additions/removals/modifications in the +//! order that they appear in the input. +//! +//! Finally, if the `unified_diff` feature is enabled, a diff can be printed with +//! [`Diff::unified_diff`] to print a unified diff/patch as shown by `git diff` or `diff +//! -u`. Note that while the unified diff has a decent amount of flexibility, it is fairly +//! simplistic and not every formatting may be possible. It's meant to cover common +//! situations but not cover every advanced use case. Instead, if you need more advanced +//! printing, build your own printer on top of the [`Diff::hunks`] iterator; for that, you can +//! take inspiration from the built-in printer. +//! +//! ``` +//! # use gix_imara_diff::{InternedInput, Diff, Algorithm, BasicLineDiffPrinter, UnifiedDiffConfig}; +//! # +//! +//! let before = r#"fn foo() -> Bar { +//! let mut foo = 2; +//! foo *= 50; +//! println!("hello world") +//! } +//! "#; +//! +//! let after = r#"// lorem ipsum +//! fn foo() -> Bar { +//! let mut foo = 2; +//! foo *= 50; +//! println!("hello world"); +//! println!("{foo}"); +//! } +//! // foo +//! "#; +//! let input = InternedInput::new(before, after); +//! let mut diff = Diff::compute(Algorithm::Histogram, &input); +//! diff.postprocess_lines(&input); +//! +//! assert_eq!( +//! diff.unified_diff( +//! &BasicLineDiffPrinter(&input.interner), +//! UnifiedDiffConfig::default(), +//! &input, +//! ) +//! .to_string(), +//! r#"@@ -1,5 +1,8 @@ +//! +// lorem ipsum +//! fn foo() -> Bar { +//! let mut foo = 2; +//! foo *= 50; +//! - println!("hello world") +//! + println!("hello world"); +//! + println!("{foo}"); +//! } +//! +// foo +//! "# +//! ); +//! 
``` + +use std::ops::Range; +use std::slice; + +use crate::{ + sources::words, + util::{strip_common_postfix, strip_common_prefix}, +}; + +pub use crate::slider_heuristic::{IndentHeuristic, IndentLevel, NoSliderHeuristic, SliderHeuristic}; +pub use intern::{InternedInput, Interner, Token, TokenSource}; +#[cfg(feature = "unified_diff")] +pub use unified_diff::{BasicLineDiffPrinter, UnifiedDiff, UnifiedDiffConfig, UnifiedDiffPrinter}; + +mod histogram; +mod intern; +mod myers; +mod postprocess; +mod slider_heuristic; +pub mod sources; +#[cfg(test)] +mod tests; +#[cfg(feature = "unified_diff")] +mod unified_diff; +mod util; + +/// `imara-diff` supports multiple different algorithms +/// for computing an edit sequence. +/// These algorithms have different performance and all produce different output. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Default)] +pub enum Algorithm { + /// A variation of the [`patience` diff algorithm described by Bram Cohen's blog post](https://bramcohen.livejournal.com/73318.html) + /// that uses a histogram to find the least common LCS. + /// Just like the `patience` diff algorithm, this algorithm usually produces + /// more human-readable output than Myers algorithm. + /// However, compared to the `patience` diff algorithm (which is slower than Myers algorithm), + /// the Histogram algorithm performs much better. + /// + /// The implementation here was originally ported from `git` but has been significantly + /// modified to improve performance. + /// As a result, it consistently **performs better than Myers algorithm** (5%-100%) over + /// a wide variety of test data. + /// + /// For pathological subsequences that only contain highly repeating tokens (64+ occurrences) + /// the algorithm falls back on Myers algorithm (with heuristics) to avoid quadratic behavior. + /// + /// Compared to Myers algorithm, the Histogram diff algorithm is more focused on providing + /// human-readable diffs instead of minimal diffs. 
In practice, this means that the edit sequences + /// produced by the histogram diff are often longer than those produced by Myers algorithm. + /// + /// The heuristic used by the histogram diff does not work well for inputs with small (often repeated) + /// tokens. For example, **character diffs do not work well** as most (English) text is made up of + /// a fairly small set of characters. The `Histogram` algorithm will automatically detect these cases and + /// fall back to Myers algorithm. However, this detection has a nontrivial overhead, so + /// if it's known upfront that the sort of tokens is very small, `Myers` algorithm should + /// be used instead. + #[default] + Histogram, + /// An implementation of the linear space variant of + /// [Myers `O((N+M)D)` algorithm](http://www.xmailserver.org/diff2.pdf). + /// The algorithm is enhanced with preprocessing that removes + /// tokens that don't occur in the other file at all. + /// Furthermore, two heuristics for the middle snake search are implemented + /// that ensure reasonable runtime (mostly linear time complexity) even for large files. + /// + /// Due to the divide-and-conquer nature of the algorithm, + /// the edit sequences produced are still fairly small even when the middle snake + /// search is aborted by a heuristic. + /// However, the produced edit sequences are not guaranteed to be fully minimal. + /// If that property is vital to you, use the `MyersMinimal` algorithm instead. + /// + /// The implementation (including the preprocessing) is mostly + /// ported from `git` and `gnu-diff`, where Myers algorithm is used + /// as the default diff algorithm. + /// Therefore, the used heuristics have been heavily battle-tested and + /// are known to behave well over a large variety of inputs. + Myers, + /// Same as `Myers` but the early abort heuristics are disabled to guarantee + /// a minimal edit sequence. + /// This can mean significant slowdown in pathological cases. 
+ MyersMinimal, +} + +/// Represents the difference between two sequences of tokens. +/// +/// A `Diff` stores which tokens were removed from the first sequence and which tokens were added to the second sequence. +#[derive(Default)] +pub struct Diff { + /// Tracks which tokens were removed from the first sequence (`before`), with + /// one entry for each one in the `before` sequence. + removed: Vec, + /// Tracks which tokens were added to the second sequence (`after`), with + /// one entry for each one in the `after` sequence. + added: Vec, +} + +impl std::fmt::Debug for Diff { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_list().entries(self.hunks()).finish() + } +} + +impl Diff { + /// Computes an edit-script that transforms `input.before` into `input.after` using + /// the specified `algorithm` + pub fn compute(algorithm: Algorithm, input: &InternedInput) -> Diff { + let mut diff = Diff::default(); + diff.compute_with(algorithm, &input.before, &input.after, input.interner.num_tokens()); + diff + } + + /// Computes an edit-script that transforms `before` into `after` using + /// the specified `algorithm`. 
+ pub fn compute_with(&mut self, algorithm: Algorithm, mut before: &[Token], mut after: &[Token], num_tokens: u32) { + assert!( + before.len() < i32::MAX as usize, + "imara-diff only supports up to {} tokens", + i32::MAX + ); + assert!( + after.len() < i32::MAX as usize, + "imara-diff only supports up to {} tokens", + i32::MAX + ); + self.removed.clear(); + self.added.clear(); + self.removed.resize(before.len(), false); + self.added.resize(after.len(), false); + let common_prefix = strip_common_prefix(&mut before, &mut after) as usize; + let common_postfix = strip_common_postfix(&mut before, &mut after); + let range = common_prefix..self.removed.len() - common_postfix as usize; + let removed = &mut self.removed[range]; + let range = common_prefix..self.added.len() - common_postfix as usize; + let added = &mut self.added[range]; + match algorithm { + Algorithm::Histogram => histogram::diff(before, after, removed, added, num_tokens), + Algorithm::Myers => myers::diff(before, after, removed, added, false), + Algorithm::MyersMinimal => myers::diff(before, after, removed, added, true), + } + } + + /// Returns the total number of tokens that were added in the second sequence. + pub fn count_additions(&self) -> u32 { + self.added.iter().map(|&added| added as u32).sum() + } + + /// Returns the total number of tokens that were removed from the first sequence (`before`). + pub fn count_removals(&self) -> u32 { + self.removed.iter().map(|&removed| removed as u32).sum() + } + + /// Returns `true` if the token at the given index was removed from the first sequence (`before`). + /// + /// # Panics + /// + /// Panics if `token_idx` is out of bounds for the first sequence. + pub fn is_removed(&self, token_idx: u32) -> bool { + self.removed[token_idx as usize] + } + + /// Returns `true` if the token at the given index was added to the second sequence (`after`). + /// + /// # Panics + /// + /// Panics if `token_idx` is out of bounds for the second sequence (`after`). 
+ pub fn is_added(&self, token_idx: u32) -> bool { + self.added[token_idx as usize] + } + + /// Postprocesses the diff to make it more human-readable. Certain hunks + /// have an ambiguous placement (even in a minimal diff) where they can move + /// downward or upward by removing a token (line) at the start and adding + /// one at the end (or the other way around). The postprocessing adjusts + /// these hunks according to a couple of rules: + /// + /// * Always merge multiple hunks if possible. + /// * Always try to create a single MODIFY hunk instead of multiple disjoint + /// ADDED/REMOVED hunks. + /// * Move sliders as far down as possible. + pub fn postprocess_no_heuristic(&mut self, input: &InternedInput) { + self.postprocess_with_heuristic(input, NoSliderHeuristic) + } + + /// Postprocesses the diff to make it more human-readable. Certain hunks + /// have an ambiguous placement (even in a minimal diff) where they can move + /// downward or upward by removing a token (line) at the start and adding + /// one at the end (or the other way around). The postprocessing adjusts + /// these hunks according to a couple of rules: + /// + /// * Always merge multiple hunks if possible. + /// * Always try to create a single MODIFY hunk instead of multiple disjoint + /// ADDED/REMOVED hunks. + /// * Based on a line's indentation level, heuristically compute the most + /// intuitive location to split lines. + /// * Move sliders as far down as possible. + pub fn postprocess_lines>(&mut self, input: &InternedInput) { + self.postprocess_with_heuristic( + input, + IndentHeuristic::new(|token| { + IndentLevel::for_ascii_line(input.interner[token].as_ref().iter().copied(), 8) + }), + ) + } + + /// Return an iterator that yields the changed hunks in this diff. 
+ pub fn hunks(&self) -> HunkIter<'_> { + HunkIter { + removed: self.removed.iter(), + added: self.added.iter(), + pos_before: 0, + pos_after: 0, + } + } +} + +/// A single change in a `Diff` that represents a range of tokens (`before`) +/// in the first sequence that were replaced by a different range of tokens +/// in the second sequence (`after`). +/// +/// Each hunk identifies a contiguous region of change, where tokens from the `before` range +/// should be replaced with tokens from the `after` range. +#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)] +pub struct Hunk { + /// The range of token indices in the first sequence (`before`) that were removed. + pub before: Range, + /// The range of token indices in the second sequence (`after`) that were added. + pub after: Range, +} + +impl Hunk { + /// Can be used instead of `Option::None` for better performance. + /// Because `imara-diff` does not support more than `i32::MAX` there is an unused bit pattern that can be used. + /// + /// It has some nice properties where it usually is not necessary to check for `None` separately: + /// Empty ranges fail contains checks and also fail smaller than checks. + pub const NONE: Hunk = Hunk { + before: u32::MAX..u32::MAX, + after: u32::MAX..u32::MAX, + }; + + /// Inverts a hunk so that it represents a change + /// that would undo this hunk. + pub fn invert(&self) -> Hunk { + Hunk { + before: self.after.clone(), + after: self.before.clone(), + } + } + + /// Returns whether tokens are only inserted and not removed in this hunk. + pub fn is_pure_insertion(&self) -> bool { + self.before.is_empty() + } + + /// Returns whether tokens are only removed and not inserted in this hunk. + pub fn is_pure_removal(&self) -> bool { + self.after.is_empty() + } + + /// Performs a word-diff on this hunk. + /// + /// This requires passing the original [`input`](InternedInput) in order to look up + /// the tokens of the current hunk, which typically are lines. 
+ /// Each token is split into words using the built-in [`words`] tokenizer. + /// The resulting word tokens are stored in a second [`diff_input`](InternedInput), + /// and a [`diff`](Diff) is computed on them, with basic post-processing applied. + /// + /// For performance reasons, this second [`diff_input`](InternedInput) as well as + /// the computed [`diff`](Diff) need to be passed as parameters so that they can be + /// re-used when iterating over hunks. Note that word tokens are always + /// added but never removed from the interner. Consider clearing it if you expect + /// your input to have a large vocabulary. + /// + /// # Examples + /// + /// ``` + /// # use gix_imara_diff::{InternedInput, Diff, Algorithm}; + /// // Compute diff normally + /// let before = "before text"; + /// let after = "after text"; + /// let mut lines = InternedInput::new(before, after); + /// let mut diff = Diff::compute(Algorithm::Histogram, &lines); + /// diff.postprocess_lines(&lines); + /// + /// // Compute word-diff per hunk, reusing allocations across iterations + /// let mut hunk_diff_input = InternedInput::default(); + /// let mut hunk_diff = Diff::default(); + /// for hunk in diff.hunks() { + /// hunk.latin_word_diff(&lines, &mut hunk_diff_input, &mut hunk_diff); + /// let added = hunk_diff.count_additions(); + /// let removed = hunk_diff.count_removals(); + /// println!("word-diff of this hunk has {added} additions and {removed} removals"); + /// // optionally, clear the interner: + /// hunk_diff_input.clear(); + /// } + /// ``` + pub fn latin_word_diff<'a>( + &self, + input: &InternedInput<&'a str>, + word_tokens: &mut InternedInput<&'a str>, + diff: &mut Diff, + ) { + let Hunk { before, after } = self.clone(); + word_tokens.update_before( + before + .map(|index| input.before[index as usize]) + .map(|token| input.interner[token]) + .flat_map(|line| words(line)), + ); + word_tokens.update_after( + after + .map(|index| input.after[index as usize]) + .map(|token| 
input.interner[token]) + .flat_map(|line| words(line)), + ); + diff.removed.clear(); + diff.removed.resize(word_tokens.before.len(), false); + diff.added.clear(); + diff.added.resize(word_tokens.after.len(), false); + if self.is_pure_removal() { + diff.removed.fill(true); + } else if self.is_pure_insertion() { + diff.added.fill(true); + } else { + diff.compute_with( + Algorithm::Myers, + &word_tokens.before, + &word_tokens.after, + word_tokens.interner.num_tokens(), + ); + diff.postprocess_no_heuristic(word_tokens); + } + } +} + +/// Yields all [`Hunk`]s in a file in monotonically increasing order. +/// Monotonically increasing means here that the following holds for any two +/// consecutive [`Hunk`]s `x` and `y`: +/// +/// ``` no_compile +/// assert!(x.before.end < y.before.start); +/// assert!(x.after.end < y.after.start); +/// ``` +/// +pub struct HunkIter<'diff> { + removed: slice::Iter<'diff, bool>, + added: slice::Iter<'diff, bool>, + pos_before: u32, + pos_after: u32, +} + +impl Iterator for HunkIter<'_> { + type Item = Hunk; + + fn next(&mut self) -> Option { + loop { + let removed = (&mut self.removed).take_while(|&&removed| removed).count() as u32; + let added = (&mut self.added).take_while(|&&added| added).count() as u32; + if removed != 0 || added != 0 { + let start_before = self.pos_before; + let start_after = self.pos_after; + self.pos_before += removed; + self.pos_after += added; + let hunk = Hunk { + before: start_before..self.pos_before, + after: start_after..self.pos_after, + }; + self.pos_before += 1; + self.pos_after += 1; + return Some(hunk); + } else if self.removed.len() == 0 && self.added.len() == 0 { + return None; + } else { + self.pos_before += 1; + self.pos_after += 1; + } + } + } +} diff --git a/gix-imara-diff/src/myers.rs b/gix-imara-diff/src/myers.rs new file mode 100644 index 00000000000..3fcf1a30404 --- /dev/null +++ b/gix-imara-diff/src/myers.rs @@ -0,0 +1,291 @@ +use std::ptr::NonNull; + +use crate::intern::Token; +use 
crate::myers::middle_snake::{MiddleSnakeSearch, SearchResult}; +use crate::myers::slice::FileSlice; +use crate::util::sqrt; + +mod middle_snake; +mod preprocess; +mod slice; + +/// Computes a diff using the Myers algorithm. +/// +/// # Parameters +/// +/// * `before` - The token sequence from the first file +/// * `after` - The token sequence from the second file +/// * `removed` - Output array marking removed tokens +/// * `added` - Output array marking added tokens +/// * `minimal` - If true, disables heuristics to guarantee a minimal diff +pub fn diff(before: &[Token], after: &[Token], removed: &mut [bool], added: &mut [bool], minimal: bool) { + // Preprocess the files by removing parts of the file that are not contained in the other file at all. + // This process remaps the token indices so we have to account for that during the rest of the diff + let (before, after) = preprocess::preprocess(before, after, removed, added); + + // Perform the actual diff + Myers::new(before.tokens.len(), after.tokens.len()).run( + FileSlice::new(&before, removed), + FileSlice::new(&after, added), + minimal, + ); +} + +/// Minimum edit cost before heuristics are applied to avoid quadratic behavior. +const HEUR_MIN_COST: u32 = 256; +/// Minimum value for the maximum cost threshold. +const MAX_COST_MIN: u32 = 256; + +/// The Myers diff algorithm implementation with linear space complexity. +/// +/// This structure maintains the internal state needed to compute diffs using +/// Myers' algorithm with forward and backward search. +pub struct Myers { + /// The backing memory for k-vectors. + kvec: NonNull<[i32]>, + /// Pointer to the forward search k-vector. + kforward: NonNull, + /// Pointer to the backward search k-vector. + kbackward: NonNull, + /// Maximum edit cost before applying heuristics. 
+ max_cost: u32, +} + +impl Drop for Myers { + fn drop(&mut self) { + unsafe { drop(Box::from_raw(self.kvec.as_ptr())) } + } +} + +impl Myers { + fn new(len1: usize, len2: usize) -> Self { + let ndiags = len1 + len2 + 3; + let kvec: *mut [i32] = Box::into_raw(vec![0; 2 * ndiags + 2].into_boxed_slice()); + let (kforward, kbackward) = unsafe { + ( + NonNull::new_unchecked((kvec as *mut i32).add(len2 + 1)), + NonNull::new_unchecked((kvec as *mut i32).add(ndiags + len2 + 1)), + ) + }; + Self { + kvec: unsafe { NonNull::new_unchecked(kvec) }, + kforward, + kbackward, + max_cost: sqrt(ndiags).max(MAX_COST_MIN), + } + } + + fn run<'f>(&mut self, mut file1: FileSlice<'f>, mut file2: FileSlice<'f>, mut need_min: bool) { + loop { + file1.strip_common(&mut file2); + + if file1.is_empty() { + file2.mark_changed(); + return; + } else if file2.is_empty() { + file1.mark_changed(); + return; + } + + let split = self.split(&file1, &file2, need_min); + self.run( + file1.borrow().slice(..split.token_idx1 as u32), + file2.borrow().slice(..split.token_idx2 as u32), + split.minimized_lo, + ); + + file1 = file1.slice(split.token_idx1 as u32..); + file2 = file2.slice(split.token_idx2 as u32..); + need_min = split.minimized_hi + } + } + + /// See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers. + /// Basically considers a "box" (off1, off2, lim1, lim2) and scan from both + /// the forward diagonal starting from (off1, off2) and the backward diagonal + /// starting from (lim1, lim2). If the K values on the same diagonal crosses + /// returns the furthest point of reach. We might encounter expensive edge cases + /// using this algorithm, so a little bit of heuristic is needed to cut the + /// search and to return a suboptimal point. 
+ fn split(&mut self, file1: &FileSlice, file2: &FileSlice, need_min: bool) -> Split { + let mut forward_search = unsafe { MiddleSnakeSearch::::new(self.kforward, file1, file2) }; + let mut backwards_search = unsafe { MiddleSnakeSearch::::new(self.kbackward, file1, file2) }; + let is_odd = file2.len().wrapping_sub(file1.len()) & 1 != 0; + + let mut ec = 0; + + while ec <= self.max_cost { + let mut found_snake = false; + forward_search.next_d(); + if is_odd { + if let Some(res) = forward_search.run(file1, file2, |k, token_idx1| { + backwards_search.contains(k) && backwards_search.x_pos_at_diagonal(k) <= token_idx1 + }) { + #[cfg(test)] + cov_mark::hit!(ODD_SPLIT); + match res { + SearchResult::Snake => found_snake = true, + SearchResult::Found { token_idx1, token_idx2 } => { + return Split { + token_idx1, + token_idx2, + minimized_lo: true, + minimized_hi: true, + }; + } + } + } + } else { + found_snake |= forward_search.run(file1, file2, |_, _| false).is_some() + }; + + backwards_search.next_d(); + if !is_odd { + if let Some(res) = backwards_search.run(file1, file2, |k, token_idx1| { + forward_search.contains(k) && token_idx1 <= forward_search.x_pos_at_diagonal(k) + }) { + #[cfg(test)] + cov_mark::hit!(EVEN_SPLIT); + match res { + SearchResult::Snake => found_snake = true, + SearchResult::Found { token_idx1, token_idx2 } => { + return Split { + token_idx1, + token_idx2, + minimized_lo: true, + minimized_hi: true, + }; + } + } + } + } else { + found_snake |= backwards_search.run(file1, file2, |_, _| false).is_some() + }; + + if need_min { + continue; + } + + // If the edit cost is above the heuristic trigger and if + // we got a good snake, we sample current diagonals to see + // if some of them have reached an "interesting" path. Our + // measure is a function of the distance from the diagonal + // corner (i1 + i2) penalized with the distance from the + // mid-diagonal itself. 
If this value is above the current + // edit cost times a magic factor (XDL_K_HEUR) we consider + // it interesting. + if found_snake && ec > HEUR_MIN_COST { + if let Some((token_idx1, token_idx2)) = forward_search.found_snake(ec, file1, file2) { + return Split { + token_idx1, + token_idx2, + minimized_lo: true, + minimized_hi: false, + }; + } + + if let Some((token_idx1, token_idx2)) = backwards_search.found_snake(ec, file1, file2) { + return Split { + token_idx1, + token_idx2, + minimized_lo: false, + minimized_hi: true, + }; + } + } + + ec += 1; + } + + let (distance_forward, token_idx1_forward) = forward_search.best_position(file1, file2); + let (distance_backwards, token_idx1_backwards) = backwards_search.best_position(file1, file2); + if distance_forward > file1.len() as isize + file2.len() as isize - distance_backwards { + Split { + token_idx1: token_idx1_forward, + token_idx2: (distance_forward - token_idx1_forward as isize) as i32, + minimized_lo: true, + minimized_hi: false, + } + } else { + Split { + token_idx1: token_idx1_backwards, + token_idx2: (distance_backwards - token_idx1_backwards as isize) as i32, + minimized_lo: false, + minimized_hi: true, + } + } + } +} + +/// Represents a split point in the divide-and-conquer approach. +/// +/// The split divides the problem into two subproblems at the given token positions. +#[derive(Debug)] +struct Split { + /// Token index in the first sequence where the split occurs. + token_idx1: i32, + /// Token index in the second sequence where the split occurs. + token_idx2: i32, + /// Whether the lower subproblem was minimized. + minimized_lo: bool, + /// Whether the upper subproblem was minimized. + minimized_hi: bool, +} + +// /// the mapping performed during preprocessing makes it impossible to directly call +// /// the `sink` during the diff itself. 
Instead, `file.changed` is set to true for all +// /// tokens that are changed +// /// below these arrays are used to call the sink function +// fn process_changes_with_sink( +// before: &PreprocessedFile, +// after: &PreprocessedFile, +// sink: &mut impl Sink, +// ) { +// let before_end = before.changed.len() as u32 + before.offset; +// let after_end = after.changed.len() as u32 + after.offset; + +// let mut before = before +// .changed +// .iter() +// .enumerate() +// .map(|(i, removed)| (i as u32 + before.offset, *removed)); + +// let mut after = after +// .changed +// .iter() +// .enumerate() +// .map(|(i, inserted)| (i as u32 + after.offset, *inserted)); + +// let mut next1 = before.next(); +// let mut next2 = after.next(); + +// while let (Some((before_pos, removed)), Some((after_pos, inserted))) = (next1, next2) { +// if !(removed | inserted) { +// next1 = before.next(); +// next2 = after.next(); +// continue; +// } + +// let mut hunk_before = before_pos..before_pos; +// let mut hunk_after = after_pos..after_pos; +// if removed { +// let end = before.find(|(_, changed)| !changed); +// next1 = end.map(|(end, _)| (end, false)); +// hunk_before.end = end.map_or(before_end, |(end, _)| end); +// }; + +// if inserted { +// let end = after.find(|(_, changed)| !changed); +// next2 = end.map(|(end, _)| (end, false)); +// hunk_after.end = end.map_or(after_end, |(end, _)| end); +// } + +// sink.process_change(hunk_before, hunk_after); +// } + +// if let Some((before_pos, _)) = next1 { +// sink.process_change(before_pos..before_end, after_end..after_end); +// } else if let Some((after_pos, _)) = next2 { +// sink.process_change(before_end..before_end, after_pos..after_end); +// } +// } diff --git a/gix-imara-diff/src/myers/middle_snake.rs b/gix-imara-diff/src/myers/middle_snake.rs new file mode 100644 index 00000000000..bb0d012c60f --- /dev/null +++ b/gix-imara-diff/src/myers/middle_snake.rs @@ -0,0 +1,275 @@ +use std::ptr::NonNull; + +use crate::myers::slice::FileSlice; 
+use crate::util::{common_postfix, common_prefix}; + +/// Minimum snake length to be considered a "good" snake for heuristics. +const SNAKE_CNT: u32 = 20; +/// Heuristic multiplier used to evaluate snake quality. +const K_HEUR: u32 = 4; + +/// Performs forward or backward search for the middle snake in Myers' algorithm. +/// +/// The `BACK` const generic parameter determines the search direction: +/// `false` for forward search, `true` for backward search. +#[derive(Debug)] +pub struct MiddleSnakeSearch { + /// Pointer to the k-vector storage for this search direction. + kvec: NonNull, + /// Minimum k-diagonal currently being searched. + kmin: i32, + /// Maximum k-diagonal currently being searched. + kmax: i32, + /// Minimum possible k-diagonal value. + dmin: i32, + /// Maximum possible k-diagonal value. + dmax: i32, +} + +impl MiddleSnakeSearch { + /// # Safety + /// `data` must be valid for reads and writes between `-file2.len() - 1` and `file1.len() + 1` + pub unsafe fn new(data: NonNull, file1: &FileSlice, file2: &FileSlice) -> Self { + let dmin = -(file2.len() as i32); + let dmax = file1.len() as i32; + let kmid = if BACK { dmin + dmax } else { 0 }; + let mut res = Self { + kvec: data, + kmin: kmid, + kmax: kmid, + dmin, + dmax, + }; + let init = if BACK { file1.len() as i32 } else { 0 }; + res.write_xpos_at_diagonal(kmid, init); + res + } + + pub fn contains(&self, k: i32) -> bool { + (self.kmin..=self.kmax).contains(&k) + } + + pub fn bounds_check(&self, k: i32) { + debug_assert!((self.dmin - 1..=self.dmax + 1).contains(&k)); + } + + fn write_xpos_at_diagonal(&mut self, k: i32, token_idx1: i32) { + self.bounds_check(k); + unsafe { self.kvec.as_ptr().offset(k as isize).write(token_idx1) } + } + + pub fn x_pos_at_diagonal(&self, diagonal: i32) -> i32 { + self.bounds_check(diagonal); + unsafe { self.kvec.as_ptr().offset(diagonal as isize).read() } + } + + pub fn pos_at_diagonal(&self, diagonal: i32) -> (i32, i32) { + self.bounds_check(diagonal); + let 
token_idx1 = unsafe { self.kvec.as_ptr().offset(diagonal as isize).read() }; + let token_idx2 = token_idx1 - diagonal; + (token_idx1, token_idx2) + } + + /// We need to extend the diagonal "domain" by one. If the next + /// values exits the box boundaries we need to change it in the + /// opposite direction because (max - min) must be a power of + /// two. + /// + /// Also we initialize the external K value to -1 so that we can + /// avoid extra conditions in the check inside the core loop. + pub fn next_d(&mut self) { + let init_val = if BACK { + // value should always be larger then bounds + i32::MAX + } else { + // value should always be smaller then bounds + i32::MIN + }; + + if self.kmin > self.dmin { + self.kmin -= 1; + self.write_xpos_at_diagonal(self.kmin - 1, init_val); + } else { + self.kmin += 1; + } + + if self.kmax < self.dmax { + self.kmax += 1; + self.write_xpos_at_diagonal(self.kmax + 1, init_val); + } else { + self.kmax -= 1; + } + } + + pub fn run( + &mut self, + file1: &FileSlice, + file2: &FileSlice, + mut f: impl FnMut(i32, i32) -> bool, + ) -> Option { + let mut res = None; + let mut k = self.kmax; + while k >= self.kmin { + #[cfg(test)] + cov_mark::hit!(SPLIT_SEARCH_ITER); + let mut token_idx1 = if BACK { + if self.x_pos_at_diagonal(k - 1) < self.x_pos_at_diagonal(k + 1) { + self.x_pos_at_diagonal(k - 1) + } else { + self.x_pos_at_diagonal(k + 1) - 1 + } + } else if self.x_pos_at_diagonal(k - 1) >= self.x_pos_at_diagonal(k + 1) { + self.x_pos_at_diagonal(k - 1) + 1 + } else { + self.x_pos_at_diagonal(k + 1) + }; + + let mut token_idx2 = token_idx1 - k; + let off = if BACK { + if token_idx1 > 0 && token_idx2 > 0 { + let tokens1 = &file1.tokens[..token_idx1 as usize]; + let tokens2 = &file2.tokens[..token_idx2 as usize]; + common_postfix(tokens1, tokens2) + } else { + 0 + } + } else if token_idx1 < file1.len() as i32 && token_idx2 < file2.len() as i32 { + let tokens1 = &file1.tokens[token_idx1 as usize..]; + let tokens2 = 
&file2.tokens[token_idx2 as usize..]; + common_prefix(tokens1, tokens2) + } else { + 0 + }; + + if off > SNAKE_CNT { + res = Some(SearchResult::Snake) + } + + if BACK { + token_idx1 -= off as i32; + token_idx2 -= off as i32; + } else { + token_idx1 += off as i32; + token_idx2 += off as i32; + } + self.write_xpos_at_diagonal(k, token_idx1); + + if f(k, token_idx1) { + return Some(SearchResult::Found { token_idx1, token_idx2 }); + } + + k -= 2; + } + + res + } + + pub fn best_position(&self, file1: &FileSlice, file2: &FileSlice) -> (isize, i32) { + let mut best_distance: isize = if BACK { isize::MAX } else { -1 }; + let mut best_token_idx1 = if BACK { i32::MAX } else { -1 }; + let mut k = self.kmax; + while k >= self.kmin { + let mut token_idx1 = self.x_pos_at_diagonal(k); + if BACK { + token_idx1 = token_idx1.max(0); + } else { + token_idx1 = token_idx1.min(file1.len() as i32); + } + let mut token_idx2 = token_idx1 - k; + if BACK { + if token_idx2 < 0 { + token_idx1 = k; + token_idx2 = 0; + } + } else if token_idx2 > file2.len() as i32 { + token_idx1 = file2.len() as i32 + k; + token_idx2 = file2.len() as i32; + } + + let distance = token_idx1 as isize + token_idx2 as isize; + if BACK && distance < best_distance || !BACK && distance > best_distance { + best_distance = distance; + best_token_idx1 = token_idx1; + } + + k -= 2; + } + (best_distance, best_token_idx1) + } + + pub fn found_snake(&self, ec: u32, file1: &FileSlice, file2: &FileSlice) -> Option<(i32, i32)> { + let mut best_score = 0; + let mut best_token_idx1 = 0; + let mut best_token_idx2 = 0; + let mut k = self.kmax; + while k >= self.kmin { + let (token_idx1, token_idx2) = self.pos_at_diagonal(k); + if BACK { + if !(0..file1.len() as i32 - SNAKE_CNT as i32).contains(&token_idx1) { + k -= 2; + continue; + } + if !(0..file2.len() as i32 - SNAKE_CNT as i32).contains(&token_idx2) { + k -= 2; + continue; + } + } else { + if !(SNAKE_CNT as i32..file1.len() as i32).contains(&token_idx1) { + k -= 2; + continue; + 
} + if !(SNAKE_CNT as i32..file2.len() as i32).contains(&token_idx2) { + k -= 2; + continue; + } + } + + let main_diagonal_distance = k.unsigned_abs() as usize; + let distance = if BACK { + (file1.len() - token_idx1 as u32) + (file2.len() - token_idx2 as u32) + } else { + token_idx1 as u32 + token_idx2 as u32 + }; + let score = distance as usize + main_diagonal_distance; + if score > (K_HEUR * ec) as usize && score > best_score { + let is_snake = if BACK { + file1.tokens[token_idx1 as usize..] + .iter() + .zip(&file2.tokens[token_idx2 as usize..]) + .take(SNAKE_CNT as usize) + .all(|(token1, token2)| token1 == token2) + } else { + file1.tokens[..token_idx1 as usize] + .iter() + .zip(&file2.tokens[..token_idx2 as usize]) + .rev() + .take(SNAKE_CNT as usize) + .all(|(token1, token2)| token1 == token2) + }; + if is_snake { + best_token_idx1 = token_idx1; + best_token_idx2 = token_idx2; + best_score = score + } + } + + k -= 2; + } + + (best_score > 0).then_some((best_token_idx1, best_token_idx2)) + } +} + +/// The result of a middle snake search iteration. +#[derive(Debug)] +pub enum SearchResult { + /// A good snake was found but not necessarily the middle snake. + Snake, + /// The middle snake was found at the specified token positions. + Found { + /// Token index in the first sequence. + token_idx1: i32, + /// Token index in the second sequence. + token_idx2: i32, + }, +} diff --git a/gix-imara-diff/src/myers/preprocess.rs b/gix-imara-diff/src/myers/preprocess.rs new file mode 100644 index 00000000000..71f902545d9 --- /dev/null +++ b/gix-imara-diff/src/myers/preprocess.rs @@ -0,0 +1,196 @@ +use crate::intern::Token; +use crate::myers::sqrt; + +/// Preprocesses token sequences by removing tokens that don't appear in the other sequence. +/// +/// This optimization reduces the problem size for the Myers algorithm, improving performance +/// for files with many unique tokens. 
+pub fn preprocess<'a>( + before: &[Token], + after: &[Token], + removed: &'a mut [bool], + added: &'a mut [bool], +) -> (PreprocessedFile, PreprocessedFile) { + let (occurrences_before, occurrences_after) = token_occurrences(before, after); + let file1 = PreprocessedFile::new(&occurrences_before, before, removed); + let file2 = PreprocessedFile::new(&occurrences_after, after, added); + (file1, file2) +} + +fn token_occurrences(file1: &[Token], file2: &[Token]) -> (Vec, Vec) { + const MAX_EQLIMIT: u32 = 1024; + + // compute the limit after which tokens are treated as `Occurrences::COMMON` + let eqlimit1 = sqrt(file1.len()).min(MAX_EQLIMIT); + let eqlimit2 = sqrt(file2.len()).min(MAX_EQLIMIT); + + // first collect how often each token occurs in a file + let mut occurrences1 = Vec::new(); + for token in file1 { + let bucket = token.0 as usize; + if bucket >= occurrences1.len() { + occurrences1.resize(bucket + 1, 0u32); + } + occurrences1[bucket] += 1; + } + + // do the same thing for + let mut occurrences2 = Vec::new(); + let token_occurrences2: Vec<_> = file2 + .iter() + .map(|token| { + let bucket = token.0 as usize; + if bucket >= occurrences2.len() { + occurrences2.resize(bucket + 1, 0); + } + occurrences2[bucket] += 1; + let occurrences1 = *occurrences1.get(bucket).unwrap_or(&0); + Occurrences::from_occurrences(occurrences1, eqlimit2) + }) + .collect(); + + let token_occurrences1: Vec<_> = file1 + .iter() + .map(|token| { + let bucket = token.0 as usize; + let occurrences2 = *occurrences2.get(bucket).unwrap_or(&0); + Occurrences::from_occurrences(occurrences2, eqlimit1) + }) + .collect(); + + (token_occurrences1, token_occurrences2) +} + +/// Categorizes how frequently a token appears in a file. +#[derive(Clone, Copy, Debug)] +enum Occurrences { + /// Token does not occur in the other file. + None, + /// Token occurs at least once in the other file. + Some, + /// Token occurs very frequently in the other file (exact threshold depends on file size). 
+ /// Such tokens are usually empty lines or braces and are often not meaningful to a diff. + Common, +} + +impl Occurrences { + pub fn from_occurrences(occurrences: u32, eqlimit: u32) -> Occurrences { + if occurrences == 0 { + Occurrences::None + } else if occurrences >= eqlimit { + Occurrences::Common + } else { + Occurrences::Some + } + } +} + +/// A file after preprocessing has removed unmatched tokens. +#[derive(Debug)] +pub struct PreprocessedFile { + /// Maps from new token positions to original positions in the unpreprocessed file. + pub indices: Vec, + /// The tokens that remain after preprocessing. + pub tokens: Vec, +} + +impl PreprocessedFile { + fn new(token_occurrences: &[Occurrences], tokens: &[Token], changed: &mut [bool]) -> PreprocessedFile { + let (tokens, indices) = prune_unmatched_tokens(tokens, token_occurrences, changed); + PreprocessedFile { indices, tokens } + } +} + +fn prune_unmatched_tokens( + file: &[Token], + token_status: &[Occurrences], + changed: &mut [bool], +) -> (Vec, Vec) { + assert_eq!(token_status.len(), file.len()); + file.iter() + .zip(token_status) + .enumerate() + .filter_map(|(i, (&token, &status))| { + let prune = match status { + Occurrences::None => true, + Occurrences::Some => false, + Occurrences::Common => should_prune_common_line(token_status, i), + }; + if prune { + changed[i] = true; + None + } else { + Some((token, i as u32)) + } + }) + .unzip() +} + +// TODO do not unnecessarily rescan lines +fn should_prune_common_line(token_status: &[Occurrences], pos: usize) -> bool { + const WINDOW_SIZE: usize = 100; + + let mut unmatched_before = 0; + let mut common_before = 0; + + let start = pos.saturating_sub(WINDOW_SIZE); + for status in token_status[start..pos].iter().rev() { + match status { + Occurrences::None => { + unmatched_before += 1; + } + Occurrences::Common => { + common_before += 1; + } + Occurrences::Some => break, + } + } + + if unmatched_before == 0 { + return false; + } + + let end = 
token_status.len().min(pos + WINDOW_SIZE); + let mut unmatched_after = 0; + let mut common_after = 0; + for status in token_status[pos..end].iter() { + match status { + Occurrences::None => { + unmatched_after += 1; + } + Occurrences::Common => { + common_after += 1; + } + Occurrences::Some => break, + } + } + + if unmatched_after == 0 { + return false; + } + + let common = common_before + common_after; + let unmatched = unmatched_before + unmatched_after; + + unmatched > 3 * common +} + +#[cfg(test)] +mod tests { + use super::{should_prune_common_line, Occurrences}; + + #[test] + fn common_line_pruning_ignores_distant_context() { + let mut token_status = vec![Occurrences::Some; 700]; + token_status[100..400].fill(Occurrences::None); + token_status[400..450].fill(Occurrences::None); + token_status[450..500].fill(Occurrences::Common); + token_status[500..550].fill(Occurrences::Common); + token_status[550..600].fill(Occurrences::None); + + assert!( + !should_prune_common_line(&token_status, 500), + "only the last 100 items before the current line should contribute to the backward scan" + ); + } +} diff --git a/gix-imara-diff/src/myers/slice.rs b/gix-imara-diff/src/myers/slice.rs new file mode 100644 index 00000000000..f266fa7e17f --- /dev/null +++ b/gix-imara-diff/src/myers/slice.rs @@ -0,0 +1,80 @@ +use std::mem::take; +use std::ops::RangeBounds; + +use crate::intern::Token; +use crate::myers::preprocess::PreprocessedFile; +use crate::util::common_edges; + +/// A slice of a preprocessed file used during the Myers algorithm's divide-and-conquer. +/// +/// This structure allows the algorithm to work on subproblems while maintaining +/// references to the original file's token indices and change tracking. +#[derive(Default)] +pub struct FileSlice<'a> { + /// The tokens in this slice. + pub tokens: &'a [Token], + /// Maps from slice positions back to original file positions. + indices: &'a [u32], + /// Tracks which tokens in the original file have changed. 
+ changed: &'a mut [bool], +} + +impl<'a> FileSlice<'a> { + pub fn new(file: &'a PreprocessedFile, changed: &'a mut [bool]) -> Self { + Self { + tokens: &file.tokens, + indices: &file.indices, + changed, + } + } + + pub fn mark_changed(&mut self) { + for &i in self.indices { + self.changed[i as usize] = true; + } + } + + pub fn borrow(&mut self) -> FileSlice<'_> { + FileSlice { + tokens: self.tokens, + changed: self.changed, + indices: self.indices, + } + } + + pub fn slice>(self, range: R) -> Self { + let start = match range.start_bound() { + std::ops::Bound::Included(&start) => start, + std::ops::Bound::Excluded(&start) => start + 1, + std::ops::Bound::Unbounded => 0, + }; + + let end = match range.end_bound() { + std::ops::Bound::Included(&end) => end + 1, + std::ops::Bound::Excluded(&end) => end, + std::ops::Bound::Unbounded => self.len(), + }; + + Self { + tokens: &self.tokens[start as usize..end as usize], + changed: self.changed, + indices: &self.indices[start as usize..end as usize], + } + } + + pub fn strip_common(&mut self, other: &mut Self) { + let (start, common_postfix) = common_edges(self.tokens, other.tokens); + let end = self.len() - common_postfix; + *self = take(self).slice(start..end); + let end = other.len() - common_postfix; + *other = take(other).slice(start..end) + } + + pub fn len(&self) -> u32 { + self.tokens.len() as u32 + } + + pub fn is_empty(&self) -> bool { + self.tokens.is_empty() + } +} diff --git a/gix-imara-diff/src/postprocess.rs b/gix-imara-diff/src/postprocess.rs new file mode 100644 index 00000000000..a846e33cd2a --- /dev/null +++ b/gix-imara-diff/src/postprocess.rs @@ -0,0 +1,159 @@ +use crate::intern::{InternedInput, Token}; +use crate::slider_heuristic::SliderHeuristic; +use crate::util::{find_hunk_end, find_hunk_start}; +use crate::{Diff, Hunk}; + +impl Diff { + /// Postprocesses the diff using explicit token sequences and a custom heuristic. 
+ /// + /// This is a lower-level method that works directly with token sequences rather than + /// an `InternedInput`. Use [`postprocess_with_heuristic`](Self::postprocess_with_heuristic) + /// for a more convenient API. + /// + /// # Parameters + /// + /// * `before` - The token sequence from the first file, before changes + /// * `after` - The token sequence from the second file, after changes + /// * `heuristic` - The slider heuristic to use for positioning hunks + pub fn postprocess_with(&mut self, before: &[Token], after: &[Token], mut heuristic: impl SliderHeuristic) { + Postprocessor { + added: &mut self.added, + removed: &mut self.removed, + tokens: after, + hunk: Hunk { + before: 0..0, + after: 0..0, + }, + heuristic: &mut heuristic, + } + .run(); + Postprocessor { + added: &mut self.removed, + removed: &mut self.added, + tokens: before, + hunk: Hunk { + before: 0..0, + after: 0..0, + }, + heuristic: &mut heuristic, + } + .run() + } + + /// Postprocesses the diff using an `InternedInput` and a custom heuristic. + /// + /// This is a convenience wrapper around [`postprocess_with`](Self::postprocess_with) + /// that extracts the token sequences from the input automatically. + /// + /// # Parameters + /// + /// * `input` - The interned input containing the token sequences + /// * `heuristic` - The slider heuristic to use for positioning hunks + pub fn postprocess_with_heuristic(&mut self, input: &InternedInput, heuristic: impl SliderHeuristic) { + self.postprocess_with(&input.before, &input.after, heuristic); + } +} + +/// Internal state for postprocessing a diff to improve readability. +struct Postprocessor<'a, H> { + /// The mutable array tracking which tokens were added. + added: &'a mut [bool], + /// The immutable array tracking which tokens were removed. + removed: &'a [bool], + /// The token sequence being processed. + tokens: &'a [Token], + /// The current hunk being processed in the iteration. 
+ hunk: Hunk, + /// The heuristic used to determine optimal hunk positions. + heuristic: &'a mut H, +} + +impl Postprocessor<'_, H> { + fn run(mut self) { + loop { + // find a hunk + if !self.hunk.next_hunk(self.removed, self.added) { + break; + } + + let mut earliest_end; + let mut is_modification; + loop { + // move hunk up as far as possible to possibly merge it with other hunks + // and discover if there are other possible positions + while self.slide_up() {} + earliest_end = self.hunk.after.end; + is_modification = self.hunk.before.start != self.hunk.before.end; + + let hunk_size_unexpanded = self.hunk.after.len(); + // move hunk down as far as possible (and merge with other hunks it if + // possible) sliding down is often the most preferred position + while self.slide_down() { + is_modification |= self.hunk.before.start != self.hunk.before.end; + } + // if this hunk was merged with another hunk while sliding down we might + // be able to slide up more otherwise we are done + if hunk_size_unexpanded == self.hunk.after.len() { + break; + } + } + + if self.hunk.after.end == earliest_end { + continue; + } + if is_modification { + // hunk can be moved and there is a removed hunk in the same region + // move the hunk so it align with the other hunk to produce a single + // MODIFIED hunk instead of two separate ADDED/REMOVED hunks + while self.hunk.before.start == self.hunk.before.end { + let success = self.slide_up(); + debug_assert!(success); + } + } else { + let slider_end = self + .heuristic + .best_slider_end(self.tokens, self.hunk.after.clone(), earliest_end); + for _ in slider_end..self.hunk.after.end { + let success = self.slide_up(); + debug_assert!(success); + } + } + } + } + + /// Slides a hunk down by one token/line, potentially merging it with a subsequent hunk. 
+ fn slide_down(&mut self) -> bool { + let Some(&next_token) = self.tokens.get(self.hunk.after.end as usize) else { + return false; + }; + if self.tokens[self.hunk.after.start as usize] != next_token { + return false; + } + self.added[self.hunk.after.start as usize] = false; + self.added[self.hunk.after.end as usize] = true; + self.hunk.after.start += 1; + self.hunk.after.end = find_hunk_end(self.added, self.hunk.after.end); + // move the end of the remove range one down to keep the unchanged lines aligned + self.hunk.before.start = self.hunk.before.end + 1; + self.hunk.before.end = find_hunk_end(self.removed, self.hunk.before.start); + true + } + + /// Slides a hunk up by one token/line, potentially merging it with a previous hunk. + fn slide_up(&mut self) -> bool { + if self.hunk.after.start == 0 { + return false; + } + if self.tokens[self.hunk.after.start as usize - 1] != self.tokens[self.hunk.after.end as usize - 1] { + return false; + } + self.added[self.hunk.after.start as usize - 1] = true; + self.added[self.hunk.after.end as usize - 1] = false; + self.hunk.after.end -= 1; + self.hunk.after.start = find_hunk_start(self.added, self.hunk.after.start - 1); + // move the start of the remove range one up to keep the unchanged lines aligned + self.hunk.before.end = self.hunk.before.start - 1; + self.hunk.before.start = find_hunk_start(self.removed, self.hunk.before.start - 1); + true + } +} diff --git a/gix-imara-diff/src/slider_heuristic.rs b/gix-imara-diff/src/slider_heuristic.rs new file mode 100644 index 00000000000..13828427e4a --- /dev/null +++ b/gix-imara-diff/src/slider_heuristic.rs @@ -0,0 +1,401 @@ +use std::cmp::Ordering; +use std::hash::Hash; +use std::ops::{Add, Range}; + +use crate::intern::Token; + +/// A trait for heuristics that determine the best position for ambiguous diff hunks. +/// +/// During postprocessing, some hunks can be moved up or down without changing the +/// minimal nature of the diff. 
This trait allows customizing the logic for choosing +/// the optimal position for such hunks. +pub trait SliderHeuristic { + /// Determines the best ending position for a hunk that can be slid. + /// + /// # Parameters + /// + /// * `tokens` - The token sequence being diffed + /// * `hunk` - The range representing the current hunk position + /// * `earliest_end` - The earliest valid ending position for the hunk + /// + /// # Returns + /// + /// The preferred ending position for the hunk + fn best_slider_end(&mut self, tokens: &[Token], hunk: Range, earliest_end: u32) -> u32; +} + +impl SliderHeuristic for F +where + F: FnMut(&[Token], Range, u32) -> u32, +{ + fn best_slider_end(&mut self, tokens: &[Token], hunk: Range, earliest_end: u32) -> u32 { + self(tokens, hunk, earliest_end) + } +} + +/// A slider heuristic that doesn't adjust hunk positions. +/// +/// This heuristic always places hunks at their lowest possible position without +/// applying any additional logic. +pub struct NoSliderHeuristic; + +impl SliderHeuristic for NoSliderHeuristic { + fn best_slider_end(&mut self, _tokens: &[Token], hunk: Range, _earliest_end: u32) -> u32 { + hunk.end + } +} + +/// A slider heuristic that uses indentation levels to determine the best hunk position. +/// +/// This heuristic analyzes the indentation of lines surrounding potential hunk positions +/// and chooses the position that results in the most intuitive diff for human readers. +/// It's particularly effective for code and other indented text. +pub struct IndentHeuristic { + /// A function that computes the indentation level for a given token. + indent_of_token: IndentOfToken, +} + +impl IndentHeuristic { + /// Creates a new `IndentHeuristic` with the given indentation function. 
+ /// + /// # Parameters + /// + /// * `indent_of_token` - A function that takes a token and returns its indentation level + pub fn new(indent_of_token: IndentOfToken) -> Self { + Self { indent_of_token } + } +} + +impl IndentLevel> SliderHeuristic for IndentHeuristic { + fn best_slider_end(&mut self, tokens: &[Token], hunk: Range, earliest_end: u32) -> u32 { + const MAX_SLIDING: u32 = 100; + // This is a pure insertion that can be moved freely up and down. + // To get more intuitive results, apply a heuristic. + let mut top_slider_end = earliest_end; + // TODO: why is this needed + if top_slider_end < hunk.start - 1 { + top_slider_end = hunk.start - 1; + } + if hunk.end > top_slider_end + MAX_SLIDING { + top_slider_end = hunk.end - MAX_SLIDING; + } + let group_size = hunk.end - hunk.start; + let mut best_score = Score::for_range( + top_slider_end - group_size..top_slider_end, + tokens, + &self.indent_of_token, + ); + let mut best_slider_end = top_slider_end; + for slider_end in (top_slider_end + 1)..=hunk.end { + let score = Score::for_range(slider_end - group_size..slider_end, tokens, &self.indent_of_token); + if score.is_improvement_over(best_score) { + best_score = score; + best_slider_end = slider_end; + } + } + best_slider_end + } +} + +/// Represents the indentation level of a line. +/// +/// Indentation is measured in spaces, with tabs expanded according to a configurable tab width. +/// Special values are used to represent blank lines and maximum indentation. +#[derive(Debug, PartialEq, Eq, Clone, Copy, Hash, PartialOrd)] +pub struct IndentLevel(u8); + +impl IndentLevel { + /// Represents a line that is empty or contains only whitespace (or EOF). + const BLANK: IndentLevel = IndentLevel(u8::MAX); + /// The maximum trackable indentation level. + const MAX: IndentLevel = IndentLevel(200); + + /// Computes the indentation level for an ASCII line. 
+ /// + /// # Parameters + /// + /// * `src` - An iterator over the bytes of the line + /// * `tab_width` - The number of spaces that a tab character represents (min is 1) + /// + /// # Returns + /// + /// The computed indentation level, or `BLANK` if the line contains only whitespace + pub fn for_ascii_line(src: impl IntoIterator, tab_width: u8) -> IndentLevel { + let mut indent_level = IndentLevel(0); + let tab_width = tab_width.max(1); + for c in src { + match c { + b' ' => indent_level = indent_level.increased_by(1), + b'\t' => indent_level = indent_level.increased_by(tab_width - indent_level.0 % tab_width), + b'\r' | b'\n' | b'\x0C' => (), + _ => return indent_level, + } + if indent_level >= Self::MAX { + return Self::MAX; + } + } + IndentLevel::BLANK + } + + /// Computes the indentation level for a Unicode line. + /// + /// # Parameters + /// + /// * `src` - An iterator over the characters of the line + /// * `tab_width` - The number of spaces that a tab character represents + /// + /// # Returns + /// + /// The computed indentation level, or `BLANK` if the line contains only whitespace + pub fn for_line(src: impl IntoIterator, tab_width: u8) -> IndentLevel { + let mut indent_level = IndentLevel(0); + let tab_width = tab_width.max(1); + for c in src { + match c { + ' ' => indent_level = indent_level.increased_by(1), + '\t' => indent_level = indent_level.increased_by(tab_width - indent_level.0 % tab_width), + '\r' | '\n' | '\x0C' => (), + _ => return indent_level, + } + if indent_level >= Self::MAX { + return Self::MAX; + } + } + IndentLevel::BLANK + } + + fn increased_by(self, amount: u8) -> Self { + IndentLevel(self.0.saturating_add(amount).min(Self::MAX.0)) + } + + fn map_or(self, default: T, f: impl FnOnce(u8) -> T) -> T { + if self == Self::BLANK { + default + } else { + f(self.0) + } + } + + fn or(self, default: Self) -> Self { + if self == Self::BLANK { + default + } else { + self + } + } +} + +/// Captures indentation information for a token and its 
surrounding context. +/// +/// This structure is used by the indent heuristic to evaluate different hunk positions. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct Indents { + /// Indentation level of the current line/token. + indent: IndentLevel, + /// Indentation level of the previous non-blank line. + prev_indent: IndentLevel, + /// Indentation level of the next non-blank line. + next_indent: IndentLevel, + /// The number of consecutive blank lines above the current position. + leading_blanks: u8, + /// The number of blank lines after the line following the current position. + trailing_blanks: u8, +} + +/// Maximum number of consecutive blank lines to consider when computing indentation context. +const MAX_BLANKS: usize = 20; + +impl Indents { + fn at_token(tokens: &[Token], token_idx: usize, indent_of_token: impl Fn(Token) -> IndentLevel) -> Indents { + let (leading_blank_lines, indent_previous_line) = tokens[..token_idx] + .iter() + .rev() + .enumerate() + .find_map(|(i, &token)| { + if i == MAX_BLANKS { + Some((i, IndentLevel(0))) + } else { + let level = indent_of_token(token); + if level == IndentLevel::BLANK { + None + } else { + Some((i, level)) + } + } + }) + .unwrap_or((token_idx, IndentLevel::BLANK)); + let at_eof = token_idx == tokens.len(); + let (trailing_blank_lines, indent_next_line) = if at_eof { + (0, IndentLevel::BLANK) + } else { + tokens[token_idx + 1..] 
+ .iter() + .enumerate() + .find_map(|(i, &token)| { + if i == MAX_BLANKS { + Some((i, IndentLevel(0))) + } else { + let level = indent_of_token(token); + if level == IndentLevel::BLANK { + None + } else { + Some((i, level)) + } + } + }) + .unwrap_or((token_idx, IndentLevel::BLANK)) + }; + let indent = tokens + .get(token_idx) + .map_or(IndentLevel::BLANK, |&token| indent_of_token(token)); + Indents { + indent, + prev_indent: indent_previous_line, + next_indent: indent_next_line, + leading_blanks: leading_blank_lines as u8, + trailing_blanks: trailing_blank_lines as u8, + } + } + + fn score(&self) -> Score { + let mut penalty = 0; + if self.prev_indent == IndentLevel::BLANK && self.leading_blanks == 0 { + penalty += START_OF_FILE_PENALTY; + } + if self.next_indent == IndentLevel::BLANK && self.trailing_blanks == 0 { + penalty += END_OF_FILE_PENALTY; + } + + let trailing_blank_lines = if self.indent == IndentLevel::BLANK { + self.trailing_blanks as i32 + 1 + } else { + 0 + }; + let total_blank_lines = trailing_blank_lines + self.leading_blanks as i32; + penalty += TOTAL_BLANK_LINE_WEIGHT * total_blank_lines + trailing_blank_lines * TRAILING_BLANK_LINES_WEIGHT; + let indent = self.indent.or(self.next_indent); + if indent != IndentLevel::BLANK && self.prev_indent != IndentLevel::BLANK { + match indent.0.cmp(&self.prev_indent.0) { + Ordering::Equal => {} + // self.next_indent != IndentLevel::BLANK follows for free here + // since indent != BLANK and therefore self.next_indent <= indent < BLANK + Ordering::Less if self.next_indent.0 <= indent.0 => { + penalty += if total_blank_lines != 0 { + RELATIVE_DEDENT_WITH_BLANK_PENALTY + } else { + RELATIVE_DEDENT_PENALTY + } + } + Ordering::Less => { + penalty += if total_blank_lines != 0 { + RELATIVE_OUTDENT_WITH_BLANK_PENALTY + } else { + RELATIVE_OUTDENT_PENALTY + } + } + Ordering::Greater => { + penalty += if total_blank_lines != 0 { + RELATIVE_INDENT_WITH_BLANK_PENALTY + } else { + RELATIVE_INDENT_PENALTY + } + } + } + } + 
Score { + indent: indent.map_or(-1, i32::from), + penalty, + } + } +} + +/// Penalty for placing a hunk at the start of a file. +const START_OF_FILE_PENALTY: i32 = 1; +/// Penalty for placing a hunk at the end of a file. +const END_OF_FILE_PENALTY: i32 = 21; +/// Weight applied to the total number of blank lines surrounding a hunk (negative means preferred). +const TOTAL_BLANK_LINE_WEIGHT: i32 = -30; +/// Additional weight for trailing blank lines. +const TRAILING_BLANK_LINES_WEIGHT: i32 = 6; + +/// Penalty for placing a hunk where indentation increases (negative means preferred). +const RELATIVE_INDENT_PENALTY: i32 = -4; +/// Penalty for placing a hunk where indentation increases with blank lines present. +const RELATIVE_INDENT_WITH_BLANK_PENALTY: i32 = 10; + +/// Penalty for placing a hunk where indentation decreases (outdent). +const RELATIVE_OUTDENT_PENALTY: i32 = 24; +/// Penalty for placing a hunk where indentation decreases with blank lines present. +const RELATIVE_OUTDENT_WITH_BLANK_PENALTY: i32 = 17; + +/// Penalty for placing a hunk where indentation decreases but stays aligned (dedent). +const RELATIVE_DEDENT_PENALTY: i32 = 23; +/// Penalty for placing a hunk where indentation decreases but stays aligned with blank lines present. +const RELATIVE_DEDENT_WITH_BLANK_PENALTY: i32 = 17; + +/// Weight factor for comparing indentation levels when scoring positions. +const INDENT_WEIGHT: i32 = 60; + +/// A score for evaluating the quality of a hunk position. +/// +/// Lower scores are better. The score considers both indentation level +/// and various penalties based on the surrounding context. +#[derive(PartialEq, Eq, Clone, Copy)] +struct Score { + /// The combined indentation level at the hunk boundaries. + indent: i32, + /// The total penalty from various heuristics. 
+ penalty: i32, +} + +impl Score { + fn for_range(range: Range, tokens: &[Token], indent_of_token: impl Fn(Token) -> IndentLevel) -> Score { + Indents::at_token(tokens, range.start as usize, &indent_of_token).score() + + Indents::at_token(tokens, range.end as usize, &indent_of_token).score() + } +} + +impl Add for Score { + type Output = Score; + + fn add(self, rhs: Self) -> Self::Output { + Score { + indent: self.indent + rhs.indent, + penalty: self.penalty + rhs.penalty, + } + } +} + +impl Score { + fn is_improvement_over(self, prev_score: Self) -> bool { + // smaller indentation level is preferred (with a weight) + let indent_score = match prev_score.indent.cmp(&self.indent) { + Ordering::Less => INDENT_WEIGHT, + Ordering::Greater => -INDENT_WEIGHT, + Ordering::Equal => 0, + }; + (indent_score + self.penalty - prev_score.penalty) <= 0 + } +} + +#[cfg(test)] +mod tests { + use super::IndentLevel; + + #[test] + fn ascii_indent_clamps_before_overflow() { + assert_eq!( + IndentLevel::for_ascii_line(std::iter::repeat(b' ').take(255), 1), + IndentLevel::MAX + ); + assert_eq!( + IndentLevel::for_ascii_line(std::iter::repeat(b'\t').take(8), u8::MAX), + IndentLevel::MAX + ); + } + + #[test] + fn unicode_indent_treats_zero_tab_width_as_one() { + assert_eq!(IndentLevel::for_line(['\t', 'x'], 0), IndentLevel(1)); + } +} diff --git a/gix-imara-diff/src/sources.rs b/gix-imara-diff/src/sources.rs new file mode 100644 index 00000000000..e8ddcfc592b --- /dev/null +++ b/gix-imara-diff/src/sources.rs @@ -0,0 +1,179 @@ +//! Utilities for creating token sources from common data types. +//! +//! This module provides implementations of [`TokenSource`] for +//! strings and byte slices, splitting them into lines by default. + +use std::str::from_utf8_unchecked; + +use memchr::memchr; + +use crate::TokenSource; + +/// Returns a [`TokenSource`] that uses the lines in `data` as Tokens. The newline +/// separator (`\r\n` or `\n`) is included in the emitted tokens. 
This means that changing +/// the newline separator from `\r\n` to `\n` (or omitting it fully on the last line) is +/// detected by [`Diff`](crate::Diff). +pub fn lines(data: &str) -> Lines<'_> { + Lines(ByteLines(data.as_bytes())) +} + +/// Returns a [`TokenSource`] that uses the words in `data` as Tokens. A word is +/// a sequence of alphanumeric characters as determined by +/// `char::is_alphanumeric`, or a sequence of just the space character ' '. Any +/// other characters are their own word. +pub fn words(data: &str) -> Words<'_> { + Words(data) +} + +/// Returns a [`TokenSource`] that uses the lines in `data` as Tokens. The newline +/// separator (`\r\n` or `\n`) is included in the emitted tokens. This means that changing +/// the newline separator from `\r\n` to `\n` (or omitting it fully on the last line) is +/// detected when computing a [`Diff`](crate::Diff). +pub fn byte_lines(data: &[u8]) -> ByteLines<'_> { + ByteLines(data) +} + +/// By default, a line diff is produced for a string +impl<'a> TokenSource for &'a str { + type Token = &'a str; + + type Tokenizer = Lines<'a>; + + fn tokenize(&self) -> Self::Tokenizer { + lines(self) + } + + fn estimate_tokens(&self) -> u32 { + lines(self).estimate_tokens() + } +} + +/// By default, a line diff is produced for a bytes +impl<'a> TokenSource for &'a [u8] { + type Token = Self; + type Tokenizer = ByteLines<'a>; + + fn tokenize(&self) -> Self::Tokenizer { + byte_lines(self) + } + + fn estimate_tokens(&self) -> u32 { + byte_lines(self).estimate_tokens() + } +} + +/// A [`TokenSource`] that returns the lines of a `str` as tokens. See [`lines`] for +/// details. 
+#[derive(Clone, Copy, PartialEq, Eq)] +pub struct Lines<'a>(ByteLines<'a>); + +impl<'a> Iterator for Lines<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option { + // safety invariant: this struct may only contain valid utf8 + // dividing valid utf8 bytes by ascii characters always produces valid utf-8 + self.0.next().map(|it| unsafe { from_utf8_unchecked(it) }) + } +} + +/// By default, a line diff is produced for a string +impl<'a> TokenSource for Lines<'a> { + type Token = &'a str; + + type Tokenizer = Self; + + fn tokenize(&self) -> Self::Tokenizer { + *self + } + + fn estimate_tokens(&self) -> u32 { + self.0.estimate_tokens() + } +} + +/// A [`TokenSource`] that returns the words of a string as tokens. See +/// [`words`] for details. +#[derive(Clone, Copy, PartialEq, Eq)] +pub struct Words<'a>(&'a str); + +impl<'a> Iterator for Words<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option { + if self.0.is_empty() { + return None; + } + + let initial = self.0.chars().next().unwrap(); + let word_len = if initial == ' ' { + self.0 + .char_indices() + .find(|(_, c)| *c != ' ') + .map_or(self.0.len(), |(index, _)| index) + } else if initial.is_alphanumeric() { + self.0 + .char_indices() + .find(|(_, c)| !c.is_alphanumeric() && *c != '_') + .map_or(self.0.len(), |(index, _)| index) + } else { + initial.len_utf8() + }; + + let (word, rem) = self.0.split_at(word_len); + self.0 = rem; + Some(word) + } +} +impl<'a> TokenSource for Words<'a> { + type Token = &'a str; + + type Tokenizer = Self; + + fn tokenize(&self) -> Self::Tokenizer { + *self + } + + fn estimate_tokens(&self) -> u32 { + (self.0.len() / 3) as u32 + } +} + +/// A [`TokenSource`] that returns the lines of a byte slice as tokens. See [`byte_lines`] +/// for details. 
+#[derive(Clone, Copy, PartialEq, Eq)] +pub struct ByteLines<'a>(&'a [u8]); + +impl<'a> Iterator for ByteLines<'a> { + type Item = &'a [u8]; + + fn next(&mut self) -> Option { + if self.0.is_empty() { + return None; + } + let line_len = memchr(b'\n', self.0).map_or(self.0.len(), |len| len + 1); + let (line, rem) = self.0.split_at(line_len); + self.0 = rem; + Some(line) + } +} + +/// By default, a line diff is produced for a string +impl<'a> TokenSource for ByteLines<'a> { + type Token = &'a [u8]; + + type Tokenizer = Self; + + fn tokenize(&self) -> Self::Tokenizer { + *self + } + + fn estimate_tokens(&self) -> u32 { + let len: usize = self.take(20).map(|line| line.len()).sum(); + if len == 0 { + 100 + } else { + (self.0.len() * 20 / len) as u32 + } + } +} diff --git a/gix-imara-diff/src/tests.rs b/gix-imara-diff/src/tests.rs new file mode 100644 index 00000000000..a0e93558a86 --- /dev/null +++ b/gix-imara-diff/src/tests.rs @@ -0,0 +1,68 @@ +use crate::{Algorithm, BasicLineDiffPrinter, Diff, InternedInput, UnifiedDiffConfig}; +use expect_test::expect; + +#[test] +fn myers_is_even() { + let before = "a\nb\nx\nx\ny\n"; + let after = "b\na\nx\ny\nx\n"; + + cov_mark::check!(EVEN_SPLIT); + // if the check for is_odd incorrectly always true then we take a fastpath + // when we shouldn't, which always leads to infinite iterations/recursion + // still we check the number of iterations here in case the search + // is buggy in more subtle ways + cov_mark::check_count!(SPLIT_SEARCH_ITER, 15); + let input = InternedInput::new(before, after); + let diff = Diff::compute(Algorithm::Myers, &input); + expect![[r#" + @@ -1,5 +1,5 @@ + -a + b + -x + +a + x + y + +x + "#]] + .assert_eq( + &diff + .unified_diff( + &BasicLineDiffPrinter(&input.interner), + UnifiedDiffConfig::default(), + &input, + ) + .to_string(), + ); +} + +#[test] +fn myers_is_odd() { + let before = "a\nb\nx\ny\nx\n"; + let after = "b\na\nx\ny\n"; + + cov_mark::check!(ODD_SPLIT); + // if the check for odd doesn't work 
then + // we still find the correct result but the number of search + // iterations increases + cov_mark::check_count!(SPLIT_SEARCH_ITER, 9); + let input = InternedInput::new(before, after); + let diff = Diff::compute(Algorithm::Myers, &input); + expect![[r#" + @@ -1,5 +1,4 @@ + -a + b + +a + x + y + -x + "#]] + .assert_eq( + &diff + .unified_diff( + &BasicLineDiffPrinter(&input.interner), + UnifiedDiffConfig::default(), + &input, + ) + .to_string(), + ); +} diff --git a/gix-imara-diff/src/unified_diff.rs b/gix-imara-diff/src/unified_diff.rs new file mode 100644 index 00000000000..703b3b5285a --- /dev/null +++ b/gix-imara-diff/src/unified_diff.rs @@ -0,0 +1,278 @@ +use std::fmt::{self, Display}; +use std::hash::Hash; + +use crate::intern::{InternedInput, Interner, Token}; +use crate::Diff; + +impl Diff { + /// Creates a unified diff output that can be formatted as a string. + /// + /// This is a convenience method that extracts the token sequences from the `input`. + /// + /// # Parameters + /// + /// * `printer` - A printer implementation that controls how tokens are displayed + /// * `config` - Configuration options for the unified diff format + /// * `input` - The interned input containing the token sequences + pub fn unified_diff<'a, P: UnifiedDiffPrinter, T: Hash + Eq>( + &'a self, + printer: &'a P, + config: UnifiedDiffConfig, + input: &'a InternedInput, + ) -> UnifiedDiff<'a, P> { + self.unified_diff_with(printer, config, &input.before, &input.after) + } + + /// Creates a unified diff output with explicit token sequences. + /// + /// # Parameters + /// + /// * `printer` - A printer implementation that controls how tokens are displayed + /// * `config` - Configuration options for the unified diff format + /// * `before` - The token sequence from the first file, before changes. + /// * `after` - The token sequence from the second file, after changes. 
+ pub fn unified_diff_with<'a, P: UnifiedDiffPrinter>( + &'a self, + printer: &'a P, + config: UnifiedDiffConfig, + before: &'a [Token], + after: &'a [Token], + ) -> UnifiedDiff<'a, P> { + UnifiedDiff { + printer, + diff: self, + config, + before, + after, + } + } +} + +/// A trait for customizing the output format of unified diffs. +/// +/// Implementations of this trait control how different parts of a unified diff are displayed, +/// including headers, context lines, and changed hunks. +pub trait UnifiedDiffPrinter { + /// Displays the header for a hunk in the unified diff format. + /// + /// The header typically includes the line numbers and lengths for both files. + /// + /// # Parameters + /// + /// * `f` - The formatter to write to + /// * `start_before` - The starting line number in the first file (0-indexed) + /// * `start_after` - The starting line number in the second file (0-indexed) + /// * `len_before` - The number of lines from the first file in this hunk + /// * `len_after` - The number of lines from the second file in this hunk + fn display_header( + &self, + f: impl fmt::Write, + start_before: u32, + start_after: u32, + len_before: u32, + len_after: u32, + ) -> fmt::Result; + /// Displays a context token (an unchanged line) in the unified diff. + /// + /// # Parameters + /// + /// * `f` - The formatter to write to + /// * `token` - The token to display + fn display_context_token(&self, f: impl fmt::Write, token: Token) -> fmt::Result; + /// Displays a hunk showing the changes between before and after tokens. + /// + /// # Parameters + /// + /// * `f` - The formatter to write to + /// * `before` - The tokens from the first file that were removed + /// * `after` - The tokens from the second file that were added + fn display_hunk(&self, f: impl fmt::Write, before: &[Token], after: &[Token]) -> fmt::Result; +} + +/// Configuration options for unified diff output. 
+/// +/// Controls aspects of the unified diff format such as the number of context lines. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct UnifiedDiffConfig { + /// The number of unchanged lines to show around each hunk for context. + context_len: u32, +} + +impl Default for UnifiedDiffConfig { + fn default() -> Self { + UnifiedDiffConfig { context_len: 3 } + } +} + +impl UnifiedDiffConfig { + /// Sets the number of context lines to display around each hunk. + /// + /// # Parameters + /// + /// * `len` - The number of unchanged lines to show before and after each change + /// + /// # Returns + /// + /// A mutable reference to self for method chaining + pub fn context_len(&mut self, len: u32) -> &mut Self { + self.context_len = len; + self + } +} + +/// A helper trait for determining if a token ends with a newline. +/// +/// This is used by the unified diff printer to decide whether to add newlines +/// when displaying tokens. +pub trait EndsWithNewline { + /// Returns `true` if the token ends with a newline character. + fn ends_with_newline(&self) -> bool; +} + +impl + ?Sized> EndsWithNewline for T { + fn ends_with_newline(&self) -> bool { + self.as_ref().ends_with(b"\n") + } +} + +/// A basic implementation of [`UnifiedDiffPrinter`] for line-based diffs. +/// +/// This printer formats diffs in the standard unified diff format commonly used by +/// tools like `git diff` and `diff -u`. It displays removed lines with a `-` prefix +/// and added lines with a `+` prefix. +pub struct BasicLineDiffPrinter<'a, T: EndsWithNewline + ?Sized + Hash + Eq + Display>( + /// A reference to the interner containing the line data. 
+ pub &'a Interner<&'a T>, +); + +impl UnifiedDiffPrinter for BasicLineDiffPrinter<'_, T> { + fn display_header( + &self, + mut f: impl fmt::Write, + start_before: u32, + start_after: u32, + len_before: u32, + len_after: u32, + ) -> fmt::Result { + writeln!( + f, + "@@ -{},{} +{},{} @@", + start_before + 1, + len_before, + start_after + 1, + len_after + ) + } + + fn display_context_token(&self, mut f: impl fmt::Write, token: Token) -> fmt::Result { + write!(f, " {}", &self.0[token])?; + if !&self.0[token].ends_with_newline() { + writeln!(f)?; + } + Ok(()) + } + + fn display_hunk(&self, mut f: impl fmt::Write, before: &[Token], after: &[Token]) -> fmt::Result { + if let Some(&last) = before.last() { + for &token in before { + let token = self.0[token]; + write!(f, "-{token}")?; + } + if !self.0[last].ends_with_newline() { + writeln!(f)?; + } + } + if let Some(&last) = after.last() { + for &token in after { + let token = self.0[token]; + write!(f, "+{token}")?; + } + if !self.0[last].ends_with_newline() { + writeln!(f)?; + } + } + Ok(()) + } +} + +/// A formatted unified diff that can be displayed as a string. +/// +/// This structure is created by [`Diff::unified_diff`] or [`Diff::unified_diff_with`] +/// and implements [`Display`] to produce standard unified diff output. +pub struct UnifiedDiff<'a, P: UnifiedDiffPrinter> { + /// The printer that controls output formatting. + printer: &'a P, + /// The computed diff to display. + diff: &'a Diff, + /// Configuration for the unified diff format. + config: UnifiedDiffConfig, + /// The token sequence from the first file, before changes. + before: &'a [Token], + /// The token sequence from the second file, after changes. 
+ after: &'a [Token], +} + +impl Display for UnifiedDiff<'_, P> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let first_hunk = self.diff.hunks().next().unwrap_or_default(); + let context_len = self.config.context_len.min(1024 * 1024); + let mut pos = first_hunk.before.start.saturating_sub(context_len); + let mut before_context_start = pos; + let mut after_context_start = first_hunk.after.start.saturating_sub(context_len); + let mut before_context_len = 0; + let mut after_context_len = 0; + let mut buffer = String::new(); + for hunk in self.diff.hunks() { + if hunk.before.start - pos > 2 * context_len { + if !buffer.is_empty() { + let end = (pos + context_len).min(self.before.len() as u32); + self.printer.display_header( + &mut *f, + before_context_start, + after_context_start, + before_context_len + end - pos, + after_context_len + end - pos, + )?; + write!(f, "{buffer}")?; + for &token in &self.before[pos as usize..end as usize] { + self.printer.display_context_token(&mut *f, token)?; + } + buffer.clear(); + } + pos = hunk.before.start - context_len; + before_context_start = pos; + after_context_start = hunk.after.start - context_len; + before_context_len = 0; + after_context_len = 0; + } + for &token in &self.before[pos as usize..hunk.before.start as usize] { + self.printer.display_context_token(&mut buffer, token)?; + } + let context_len = hunk.before.start - pos; + before_context_len += hunk.before.len() as u32 + context_len; + after_context_len += hunk.after.len() as u32 + context_len; + self.printer.display_hunk( + &mut buffer, + &self.before[hunk.before.start as usize..hunk.before.end as usize], + &self.after[hunk.after.start as usize..hunk.after.end as usize], + )?; + pos = hunk.before.end; + } + if !buffer.is_empty() { + let end = (pos + context_len).min(self.before.len() as u32); + self.printer.display_header( + &mut *f, + before_context_start, + after_context_start, + before_context_len + end - pos, + after_context_len + end - pos, + 
)?; + write!(f, "{buffer}")?; + for &token in &self.before[pos as usize..end as usize] { + self.printer.display_context_token(&mut *f, token)?; + } + buffer.clear(); + } + Ok(()) + } +} diff --git a/gix-imara-diff/src/util.rs b/gix-imara-diff/src/util.rs new file mode 100644 index 00000000000..359a50ba8e0 --- /dev/null +++ b/gix-imara-diff/src/util.rs @@ -0,0 +1,108 @@ +use crate::intern::Token; +use crate::Hunk; + +/// Computes the number of common tokens at the start of two sequences. +pub fn common_prefix(file1: &[Token], file2: &[Token]) -> u32 { + let mut off = 0; + for (token1, token2) in file1.iter().zip(file2) { + if token1 != token2 { + break; + } + off += 1; + } + off +} + +/// Computes the number of common tokens at the end of two sequences. +pub fn common_postfix(file1: &[Token], file2: &[Token]) -> u32 { + let mut off = 0; + for (token1, token2) in file1.iter().rev().zip(file2.iter().rev()) { + if token1 != token2 { + break; + } + off += 1; + } + off +} + +/// Computes both the common prefix and postfix lengths of two sequences. +pub fn common_edges(file1: &[Token], file2: &[Token]) -> (u32, u32) { + let prefix = common_prefix(file1, file2); + let postfix = common_postfix(&file1[prefix as usize..], &file2[prefix as usize..]); + (prefix, postfix) +} + +/// Removes the common prefix from both sequences and returns its length. +pub fn strip_common_prefix(file1: &mut &[Token], file2: &mut &[Token]) -> u32 { + let off = common_prefix(file1, file2); + *file1 = &file1[off as usize..]; + *file2 = &file2[off as usize..]; + off +} + +/// Removes the common postfix from both sequences and returns its length. +pub fn strip_common_postfix(file1: &mut &[Token], file2: &mut &[Token]) -> u32 { + let off = common_postfix(file1, file2); + *file1 = &file1[..file1.len() - off as usize]; + *file2 = &file2[..file2.len() - off as usize]; + off +} + +/// Computes an approximation of the square root using bit operations. 
+pub fn sqrt(val: usize) -> u32 { + let nbits = (usize::BITS - val.leading_zeros()) / 2; + 1 << nbits +} + +impl Hunk { + pub(crate) fn next_hunk(&mut self, removed: &[bool], added: &[bool]) -> bool { + let Some(off) = find_next_change(added, self.after.end) else { + return false; + }; + let mut off_before = 0; + loop { + debug_assert!( + removed.len() as u32 != self.before.end || off == 0, + "broken hunk alignment {self:?} " + ); + let unchanged_tokens = + find_next_change(removed, self.before.end).unwrap_or(removed.len() as u32 - self.before.end); + if off_before + unchanged_tokens > off { + self.before.start = self.before.end + (off - off_before); + self.before.end = self.before.start; + break; + } + off_before += unchanged_tokens; + self.before.start = self.before.end + unchanged_tokens; + self.before.end = find_hunk_end(removed, self.before.end + unchanged_tokens); + if off_before == off { + break; + } + } + self.after.start = self.after.end + off; + self.after.end = find_hunk_end(added, self.after.start); + true + } +} + +/// Finds the offset to the next changed token starting from the given position. +pub fn find_next_change(changes: &[bool], pos: u32) -> Option { + changes[pos as usize..] + .iter() + .position(|&changed| changed) + .map(|off| off as u32) +} + +/// Finds the end position of a hunk of changed tokens starting from the given position. +pub fn find_hunk_end(changes: &[bool], pos: u32) -> u32 { + pos + changes[pos as usize..].iter().take_while(|&&changed| changed).count() as u32 +} + +/// Finds the start position of a hunk of changed tokens ending at the given position. 
+pub fn find_hunk_start(changes: &[bool], pos: u32) -> u32 { + pos - changes[..pos as usize] + .iter() + .rev() + .take_while(|&&changed| changed) + .count() as u32 +} diff --git a/gix-imara-diff/tests/fixtures/make_git_diffs.sh b/gix-imara-diff/tests/fixtures/make_git_diffs.sh new file mode 100755 index 00000000000..4ca8a9d8d9e --- /dev/null +++ b/gix-imara-diff/tests/fixtures/make_git_diffs.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +set -e +num_revs=20 + +for repo_file in corpus/*.info; do + repo_url=$(cat "${repo_file}") + git clone "${repo_url}" + repo_name=$(basename -- "$repo_file") + repo_name="${repo_file_name%.*}" + cd "${repo_name}" + git revl + git diff --n $num_revs + cd .. +done diff --git a/gix-imara-diff/tests/integration/main.rs b/gix-imara-diff/tests/integration/main.rs new file mode 100644 index 00000000000..3aabe460763 --- /dev/null +++ b/gix-imara-diff/tests/integration/main.rs @@ -0,0 +1,596 @@ +use std::mem::swap; + +use expect_test::expect; +// use git::bstr::BStr; +// use git_repository as git; + +use gix_imara_diff::sources::words; +use gix_imara_diff::BasicLineDiffPrinter; +use gix_imara_diff::InternedInput; +use gix_imara_diff::{Algorithm, Diff, UnifiedDiffConfig}; + +const ALL_ALGORITHMS: [Algorithm; 2] = [Algorithm::Histogram, Algorithm::Myers]; + +#[test] +fn words_tokenizer() { + let text = "Hello, imara!\n (foo-bar_baz)"; + let tokens = words(text).collect::>(); + assert_eq!( + tokens, + vec!["Hello", ",", " ", "imara", "!", "\n", " ", "(", "foo", "-", "bar_baz", ")"] + ); +} + +#[test] +fn replace() { + let before = r#"fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + println!("hello world") +} +"#; + + let after = r#"const TEST: i32 = 0; +fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + println!("hello world"); + println!("hello foo {TEST}"); +} + +"#; + let input = InternedInput::new(before, after); + for algorithm in ALL_ALGORITHMS { + let mut diff = Diff::compute(algorithm, &input); + diff.postprocess_lines(&input); + 
expect![[r#" + @@ -1,5 +1,8 @@ + +const TEST: i32 = 0; + fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + - println!("hello world") + + println!("hello world"); + + println!("hello foo {TEST}"); + } + + + "#]] + .assert_eq( + &diff + .unified_diff( + &BasicLineDiffPrinter(&input.interner), + UnifiedDiffConfig::default(), + &input, + ) + .to_string(), + ); + } +} + +#[test] +fn identical_files() { + let file = r#"fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; +}"#; + + for algorithm in ALL_ALGORITHMS { + let input = InternedInput::new(file, file); + let mut diff = Diff::compute(algorithm, &input); + diff.postprocess_lines(&input); + assert_eq!( + diff.unified_diff( + &BasicLineDiffPrinter(&input.interner), + UnifiedDiffConfig::default(), + &input, + ) + .to_string(), + "" + ); + } +} + +#[test] +fn simple_insert() { + let before = r#"fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; +}"#; + + let after = r#"fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + println("hello world") +}"#; + + let mut input = InternedInput::new(before, after); + for algorithm in ALL_ALGORITHMS { + let mut diff = Diff::compute(algorithm, &input); + diff.postprocess_lines(&input); + expect![[r#" + @@ -1,4 +1,5 @@ + fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + + println("hello world") + } + "#]] + .assert_eq( + &diff + .unified_diff( + &BasicLineDiffPrinter(&input.interner), + UnifiedDiffConfig::default(), + &input, + ) + .to_string(), + ); + + swap(&mut input.before, &mut input.after); + + let mut diff = Diff::compute(algorithm, &input); + diff.postprocess_lines(&input); + expect![[r#" + @@ -1,5 +1,4 @@ + fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + - println("hello world") + } + "#]] + .assert_eq( + &diff + .unified_diff( + &BasicLineDiffPrinter(&input.interner), + UnifiedDiffConfig::default(), + &input, + ) + .to_string(), + ); + swap(&mut input.before, &mut input.after); + } +} + +#[test] +fn 
unified_diff_context_lines_near_input_start_and_end() { + let before = r#"a +b +c +d +e +f +g +h +i +"#; + + let after = r#"a +b +c +d +edit +f +g +h +i +"#; + + let input = InternedInput::new(before, after); + for algorithm in ALL_ALGORITHMS { + let mut diff = Diff::compute(algorithm, &input); + diff.postprocess_lines(&input); + expect![[r#" + @@ -2,7 +2,7 @@ + b + c + d + -e + +edit + f + g + h + "#]] + .assert_eq( + &diff + .unified_diff( + &BasicLineDiffPrinter(&input.interner), + UnifiedDiffConfig::default(), + &input, + ) + .to_string(), + ); + } +} + +mod latin_word_diff { + use crate::ALL_ALGORITHMS; + + use gix_imara_diff::sources::words; + use gix_imara_diff::{Diff, InternedInput, Token}; + use std::mem::swap; + use std::ops::Range; + + #[test] + fn pure_insertion_or_removal() { + let before = r#"fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; +}"#; + let after = r#"fn foo() -> Bar{ + let mut foo = 2.0; + foo *= 100 / 2; + println("hello world") +}"#; + let mut input = InternedInput::new(before, after); + for algorithm in ALL_ALGORITHMS { + let mut diff_input = InternedInput::default(); + let mut out = Diff::default(); + + let mut diff = Diff::compute(algorithm, &input); + diff.postprocess_lines(&input); + + let mut hunks = diff.hunks(); + let hunk = hunks.next().expect("missing first hunk"); + hunk.latin_word_diff(&input, &mut diff_input, &mut out); + hunks = out.hunks(); + + let first = hunks.next().expect("missing first inner hunk"); + assert!(first.is_pure_insertion()); + assert_eq!(first.before, 0..0); + assert_eq!(first.after, 0..words(" println(\"hello world\")\n").count() as u32); + assert_eq!(hunks.next(), None); + assert_eq!(hunks.next(), None); + assert_eq!( + visualise(&diff_input, &first.after, &diff_input.after), + " |println|(|\"|hello| |world|\"|)|\n" + ); + + swap(&mut input.before, &mut input.after); + + let mut diff = Diff::compute(algorithm, &input); + diff.postprocess_lines(&input); + + hunks = diff.hunks(); + let hunk = 
hunks.next().expect("missing first hunk"); + hunk.latin_word_diff(&input, &mut diff_input, &mut out); + hunks = out.hunks(); + + let first = hunks.next().expect("missing first inner hunk"); + assert!(first.is_pure_removal()); + assert_eq!(first.before, 0..words(" println(\"hello world\")\n").count() as u32); + assert_eq!(first.after, 0..0); + assert_eq!(hunks.next(), None); + assert_eq!(hunks.next(), None); + assert_eq!( + visualise(&diff_input, &first.before, &diff_input.before), + " |println|(|\"|hello| |world|\"|)|\n" + ); + + swap(&mut input.before, &mut input.after); + } + } + + #[test] + fn modification() { + let before = r#"fn foo() -> Bar { + let mut foo = 2.0; + foo *= 100 / 2; +}"#; + let after = r#"fn foo() -> Bar { + let mut foo = 3.0 * 2.0; + foo += 100 / 2; +}"#; + let mut input = InternedInput::new(before, after); + for algorithm in ALL_ALGORITHMS { + let mut diff_input = InternedInput::default(); + let mut out = Diff::default(); + + let mut diff = Diff::compute(algorithm, &input); + diff.postprocess_lines(&input); + + let mut hunks = diff.hunks(); + let hunk = hunks.next().expect("missing first hunk"); + hunk.latin_word_diff(&input, &mut diff_input, &mut out); + hunks = out.hunks(); + + let first = hunks.next().expect("missing first inner hunk"); + assert!(first.is_pure_insertion()); + let off = words(" let mut foo = ").count() as u32; + assert_eq!(first.before, off..off); + let ins = words("3.0 * ").count() as u32; + assert_eq!(first.after, off..ins + off); + assert_eq!(visualise(&diff_input, &first.before, &diff_input.before), ""); + assert_eq!(visualise(&diff_input, &first.after, &diff_input.after), "3|.|0| |*| "); + + let second = hunks.next().expect("missing second inner hunk"); + let off = words( + r#" let mut foo = 2.0; + foo "#, + ) + .count() as u32; + assert_eq!(second.before, off..1 + off); + assert_eq!(second.after, ins + off..1 + ins + off); + assert_eq!(visualise(&diff_input, &second.before, &diff_input.before), "*"); + 
assert_eq!(visualise(&diff_input, &second.after, &diff_input.after), "+"); + assert_eq!(hunks.next(), None); + assert_eq!(hunks.next(), None); + + swap(&mut input.before, &mut input.after); + + let mut diff = Diff::compute(algorithm, &input); + diff.postprocess_lines(&input); + + hunks = diff.hunks(); + let hunk = hunks.next().expect("missing first hunk"); + + hunk.latin_word_diff(&input, &mut diff_input, &mut out); + hunks = out.hunks(); + + let first = hunks.next().expect("missing first inner hunk"); + assert!(first.is_pure_removal()); + let off = words(" let mut foo = ").count() as u32; + let rem = words("3.0 * ").count() as u32; + assert_eq!(first.before, off..rem + off); + assert_eq!(first.after, off..off); + let second = hunks.next().expect("missing second inner hunk"); + let off = words( + r#" let mut foo = 2.0; + foo "#, + ) + .count() as u32; + assert_eq!(second.before, rem + off..1 + rem + off); + assert_eq!(second.after, off..1 + off); + assert_eq!(hunks.next(), None); + assert_eq!(hunks.next(), None); + + swap(&mut input.before, &mut input.after); + } + } + + fn visualise(diff_input: &InternedInput<&str>, token_ids: &Range, tokens: &[Token]) -> String { + token_ids + .clone() + .map(|id| { + let id = id as usize; + diff_input.interner[tokens[id]] + }) + .collect::>() + .join("|") + } +} + +#[test] +#[cfg(not(miri))] +fn hand_checked_unidiffs() { + let before = r#"use crate::{ + alpha::Alpha, + beta::Beta, + gamma::Gamma, +}; + +use std::{ + collections::{HashMap, HashSet}, + path::Path, +}; + +pub struct Engine { + cache: HashMap, + steps: Vec<&'static str>, +} + +impl Engine { + pub fn new() -> Self { + Self { + cache: HashMap::new(), + steps: vec!["parse", "render"], + } + } + + pub fn update(&mut self, path: &Path) { + let _ = path; + self.steps.push("scan"); + } +} + +fn unchanged_one() { + println!("one"); +} + +fn unchanged_two() { + println!("two"); +} + +pub enum Error { + InvalidPath, + Unknown, +} + +pub struct Layer { + pub depth: usize, +} + 
+impl Layer { + pub fn parse(&self) -> Result<(), Error> { + Ok(()) + } +} +"#; + let after = r#"use crate::{ + alpha::Alpha, + beta::Beta, + gamma::Gamma, +}; + +use std::{ + collections::HashMap, + mem::replace, + path::Path, +}; + +pub struct Engine { + cache: HashMap, + steps: Vec<&'static str>, + dirty: bool, +} + +impl Engine { + pub fn new() -> Self { + Self { + cache: HashMap::new(), + steps: vec!["parse", "render"], + dirty: false, + } + } + + pub fn update(&mut self, path: &Path) { + let _previous = replace(&mut self.dirty, true); + let _ = path; + self.steps.push("scan"); + } +} + +fn unchanged_one() { + println!("one"); +} + +fn unchanged_two() { + println!("two"); +} + +pub enum Error { + InvalidPath, + InvalidState, + Unknown, +} + +pub struct Layer { + pub depth: u32, +} + +impl Layer { + pub fn parse(&self) -> Result<(), Error> { + Ok(()) + } +} +"#; + + for algorithm in ALL_ALGORITHMS { + println!("{algorithm:?}"); + let input = InternedInput::new(before, after); + let mut diff = Diff::compute(algorithm, &input); + diff.postprocess_lines(&input); + expect![[r#" +@@ -5,13 +5,15 @@ + }; + + use std::{ +- collections::{HashMap, HashSet}, ++ collections::HashMap, ++ mem::replace, + path::Path, + }; + + pub struct Engine { + cache: HashMap, + steps: Vec<&'static str>, ++ dirty: bool, + } + + impl Engine { +@@ -19,10 +21,12 @@ + Self { + cache: HashMap::new(), + steps: vec!["parse", "render"], ++ dirty: false, + } + } + + pub fn update(&mut self, path: &Path) { ++ let _previous = replace(&mut self.dirty, true); + let _ = path; + self.steps.push("scan"); + } +@@ -38,11 +42,12 @@ + + pub enum Error { + InvalidPath, ++ InvalidState, + Unknown, + } + + pub struct Layer { +- pub depth: usize, ++ pub depth: u32, + } + + impl Layer { +"#]] + .assert_eq( + &diff + .unified_diff( + &BasicLineDiffPrinter(&input.interner), + UnifiedDiffConfig::default(), + &input, + ) + .to_string(), + ); + } +} + +#[test] +fn postprocess() { + let before = r#" + /* + * Stay on the 
safe side. if read_directory() has run once on + * "dir", some sticky flag may have been left. Clear them all. + */ + clear_sticky(dir); + + /* + * exclude patterns are treated like positive ones in + * create_simplify. Usually exclude patterns should be a + * subset of positive ones, which has no impacts on + * foo + * bar + * test + */ + foo + "#; + let after = r#" + /* + * exclude patterns are treated like positive ones in + * create_simplify. Usually exclude patterns should be a + * subset of positive ones, which has no impacts on + * foo + * bar + * test + */ + foo + "#; + + let input = InternedInput::new(before, after); + for algorithm in [Algorithm::Histogram, Algorithm::Myers] { + let mut diff = Diff::compute(algorithm, &input); + diff.postprocess_lines(&input); + let diff = diff + .unified_diff( + &BasicLineDiffPrinter(&input.interner), + UnifiedDiffConfig::default(), + &input, + ) + .to_string(); + expect![[r#" + @@ -1,10 +1,4 @@ + + - /* + - * Stay on the safe side. if read_directory() has run once on + - * "dir", some sticky flag may have been left. Clear them all. + - */ + - clear_sticky(dir); + - + /* + * exclude patterns are treated like positive ones in + * create_simplify. 
Usually exclude patterns should be a + "#]] + .assert_eq(&diff); + } +} diff --git a/gix-imara-diff/tests/sliders.rs b/gix-imara-diff/tests/sliders.rs new file mode 100644 index 00000000000..2453be764cd --- /dev/null +++ b/gix-imara-diff/tests/sliders.rs @@ -0,0 +1,26 @@ +// use git_repository::bstr::BStr; +// use git_repository::objs::Kind; +// use git_repository::Repository; +// use imara_diff::{intern::InternedInput, Algorithm, Diff}; + +// fn diff(algo: Algorithm, repo: &Repository, file_rev1: &BStr, file_rev2: &BStr) { +// let file1 = repo +// .rev_parse_single(file_rev1) +// .unwrap() +// .object() +// .unwrap() +// .peel_to_kind(Kind::Blob) +// .unwrap(); +// let file2 = repo +// .rev_parse_single(file_rev2) +// .unwrap() +// .object() +// .unwrap() +// .peel_to_kind(Kind::Blob) +// .unwrap().kind; + +// let input = InternedInput::new(&*file1.data, &*file2.data); +// let mut diff = Diff::compute(algo, &input); +// diff.postprocess(&input); +// diff.unified_diff(,) +// } diff --git a/gix-merge/Cargo.toml b/gix-merge/Cargo.toml index 8bef1943d0f..aa73e7eea92 100644 --- a/gix-merge/Cargo.toml +++ b/gix-merge/Cargo.toml @@ -35,9 +35,9 @@ gix-revision = { version = "^0.43.0", path = "../gix-revision", default-features gix-revwalk = { version = "^0.29.0", path = "../gix-revwalk" } gix-diff = { version = "^0.61.0", path = "../gix-diff", default-features = false, features = ["blob"] } gix-index = { version = "^0.49.0", path = "../gix-index" } +imara-diff = { package = "gix-imara-diff-01", version = "0.1.8", path = "../gix-imara-diff-01" } thiserror = "2.0.18" -imara-diff = { version = "0.1.8" } bstr = { version = "1.12.0", default-features = false } nonempty = "0.12.0" serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"] } @@ -50,6 +50,7 @@ gix-odb = { path = "../gix-odb" } gix-utils = { path = "../gix-utils" } termtree = "1.0.0" pretty_assertions = "1.4.0" +arbitrary = { version = "1.4.2", features = ["derive"] } 
[package.metadata.docs.rs] all-features = true diff --git a/gix-merge/fuzz/Cargo.toml b/gix-merge/fuzz/Cargo.toml index 643d0155511..46d307a937c 100644 --- a/gix-merge/fuzz/Cargo.toml +++ b/gix-merge/fuzz/Cargo.toml @@ -12,7 +12,7 @@ cargo-fuzz = true anyhow = "1.0.76" libfuzzer-sys = "0.4" arbitrary = { version = "1.3.2", features = ["derive"] } -imara-diff = { version = "0.1.8" } +imara-diff = { package = "gix-imara-diff-01", version = "0.1.8", path = "../../gix-imara-diff-01" } gix-merge = { path = "..", features = ["sha1"] } # Prevent this from interfering with workspaces diff --git a/gix-merge/tests/fixtures/clusterfuzz-testcase-minimized-gix-merge-blob-6377298803884032 b/gix-merge/tests/fixtures/clusterfuzz-testcase-minimized-gix-merge-blob-6377298803884032 new file mode 100644 index 00000000000..666825a10bb Binary files /dev/null and b/gix-merge/tests/fixtures/clusterfuzz-testcase-minimized-gix-merge-blob-6377298803884032 differ diff --git a/gix-merge/tests/merge/blob/builtin_driver.rs b/gix-merge/tests/merge/blob/builtin_driver.rs index 2172f86a5e5..1a6c8ec424f 100644 --- a/gix-merge/tests/merge/blob/builtin_driver.rs +++ b/gix-merge/tests/merge/blob/builtin_driver.rs @@ -27,6 +27,7 @@ fn binary() { } mod text { + use arbitrary::Arbitrary; use bstr::ByteSlice; use gix_merge::blob::{ builtin_driver, @@ -34,6 +35,7 @@ mod text { Resolution, }; use pretty_assertions::assert_str_eq; + use std::num::NonZero; const DIVERGING: &[&str] = &[ // Somehow, on in zdiff mode, it's different, and I wasn't able to figure out the rule properly. @@ -140,6 +142,76 @@ mod text { } } + /// This test reproduces what the fuzzer does, allowing it to accept `Arbitrary` input produced by the fuzzer. 
+ #[test] + fn clusterfuzz_timeout_regression() { + #[derive(Debug, Arbitrary)] + struct FuzzCtx<'a> { + base: &'a [u8], + ours: &'a [u8], + theirs: &'a [u8], + marker_size: NonZero, + } + fn run_fuzz_case(ours: &[u8], base: &[u8], theirs: &[u8], marker_size: NonZero) { + let mut out = Vec::new(); + let mut input = imara_diff::intern::InternedInput::default(); + for diff_algorithm in [ + imara_diff::Algorithm::Histogram, + imara_diff::Algorithm::Myers, + imara_diff::Algorithm::MyersMinimal, + ] { + let mut options = builtin_driver::text::Options { + diff_algorithm, + conflict: Default::default(), + }; + for (left, right) in [(ours, theirs), (theirs, ours)] { + let resolution = gix_merge::blob::builtin_driver::text( + &mut out, + &mut input, + Default::default(), + left, + base, + right, + options, + ); + if resolution == Resolution::Conflict { + for conflict in [ + Conflict::ResolveWithOurs, + Conflict::ResolveWithTheirs, + Conflict::ResolveWithUnion, + Conflict::Keep { + style: ConflictStyle::Diff3, + marker_size, + }, + Conflict::Keep { + style: ConflictStyle::ZealousDiff3, + marker_size, + }, + ] { + options.conflict = conflict; + gix_merge::blob::builtin_driver::text( + &mut out, + &mut input, + Default::default(), + left, + base, + right, + options, + ); + } + } + } + } + } + + let ctx = FuzzCtx::arbitrary(&mut arbitrary::Unstructured::new(include_bytes!( + "../../fixtures/clusterfuzz-testcase-minimized-gix-merge-blob-6377298803884032" + ))) + .expect("testcase matches the historical fuzz target input layout"); + + run_fuzz_case(ctx.ours, ctx.base, ctx.theirs, ctx.marker_size); + } + #[test] fn run_baseline() -> crate::Result { let root = gix_testtools::scripted_fixture_read_only("text-baseline.sh")?; diff --git a/justfile b/justfile index bd8e1a7afe0..a9a5d5fcff5 100755 --- a/justfile +++ b/justfile @@ -50,9 +50,8 @@ check: ! cargo check --features lean-async 2>/dev/null ! cargo check -p gitoxide-core --all-features --features gix/sha1 2>/dev/null ! 
cargo check -p gix-protocol --all-features 2>/dev/null - # warning happens if nothing found, no exit code :/ - cargo --color=never tree -p gix --no-default-features -e normal -i imara-diff \ - 2>&1 >/dev/null | grep '^warning: nothing to print\>' + tree="$(cargo --color=never tree -p gix --no-default-features -e normal --prefix none --format '{p}')"; \ + ! printf '%s\n' "$tree" | rg -q '^gix-imara-diff(-01)? v' cargo --color=never tree -p gix --no-default-features -e normal -i gix-submodule \ 2>&1 >/dev/null | grep '^warning: nothing to print\>' cargo --color=never tree -p gix --no-default-features -e normal -i gix-pathspec \