22//
33// SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
44
5- use std:: collections:: HashMap ;
5+ use std:: collections:: { HashMap , HashSet } ;
66use std:: path:: { Path , PathBuf } ;
77
88use crate :: domain:: { CodeSymbol , CommitType , FileCategory , FileChange , StagedChanges } ;
@@ -228,6 +228,44 @@ impl CommitSplitter {
228228 }
229229 }
230230
/// Compute the Jaccard similarity between two sets of tokens.
///
/// The result is `|A ∩ B| / |A ∪ B|`, a value in `[0.0, 1.0]`:
/// `1.0` when both sets hold exactly the same tokens and `0.0` when they
/// share none.
///
/// Two empty sets are defined as identical and yield `1.0`; this also avoids
/// a division by zero.
fn jaccard_index(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }

    let intersection = a.intersection(b).count();
    // |A ∪ B| = |A| + |B| - |A ∩ B|. At least one set is non-empty past the
    // guard above, so the union is always >= 1 and the division is safe.
    let union = a.len() + b.len() - intersection;

    intersection as f64 / union as f64
}
244+
/// Extract the significant tokens from a unified diff.
///
/// Captures the vocabulary of the change (variable names, keywords, types).
/// Only added (`+`) and removed (`-`) lines contribute; context lines, hunk
/// headers and the `+++` / `---` file headers are ignored.
///
/// Each changed line is split on every non-alphanumeric character, and words
/// of two characters or fewer are discarded as noise. Length is measured in
/// characters rather than bytes, so a single multi-byte character (e.g. one
/// CJK ideograph) is not mistaken for a three-letter word.
fn tokenize_diff(diff: &str) -> HashSet<String> {
    let mut tokens = HashSet::new();

    for line in diff.lines() {
        // File headers also start with '+' / '-', so rule them out first.
        if line.starts_with("+++") || line.starts_with("---") {
            continue;
        }

        // Keep only add/remove lines and drop the leading diff marker.
        let changed = match line
            .strip_prefix('+')
            .or_else(|| line.strip_prefix('-'))
        {
            Some(rest) => rest,
            None => continue,
        };

        // Split by non-alphanumeric characters to get words; skip tiny tokens.
        tokens.extend(
            changed
                .split(|c: char| !c.is_alphanumeric())
                .filter(|word| word.chars().count() > 2)
                .map(str::to_owned),
        );
    }

    tokens
}
268+
231269 /// Group source files by diff-shape similarity.
232270 ///
233271 /// Files with very similar fingerprints (same kind of transformation)
@@ -237,34 +275,47 @@ impl CommitSplitter {
237275 return vec ! [ files. to_vec( ) ] ;
238276 }
239277
240- let fingerprints: Vec < ( & FileChange , DiffFingerprint ) > = files
278+ // Pre-calculate features for all files
279+ let features: Vec < ( & FileChange , DiffFingerprint , HashSet < String > ) > = files
241280 . iter ( )
242- . map ( |f| ( * f, Self :: diff_fingerprint ( f) ) )
281+ . map ( |f| ( * f, Self :: diff_fingerprint ( f) , Self :: tokenize_diff ( & f . diff ) ) )
243282 . collect ( ) ;
244283
245- // Greedy clustering: assign each file to the first similar group, or create new
246- let mut clusters: Vec < ( DiffFingerprint , Vec < & ' a FileChange > ) > = Vec :: new ( ) ;
284+ // Greedy clustering with hybrid similarity
285+ let mut clusters: Vec < ( DiffFingerprint , HashSet < String > , Vec < & ' a FileChange > ) > = Vec :: new ( ) ;
247286
248- for ( file, fp) in & fingerprints {
287+ for ( file, fp, tokens ) in & features {
249288 let mut assigned = false ;
250289
251- for ( centroid, members) in & mut clusters {
252- if centroid. is_similar ( fp) {
253- members. push ( file) ;
254- assigned = true ;
255- break ;
290+ for ( centroid_fp, centroid_tokens, members) in & mut clusters {
291+ // Hybrid similarity:
292+ // 1. Must be statistically similar (shape/size)
293+ // 2. Must share content vocabulary (Jaccard index)
294+
295+ if centroid_fp. is_similar ( fp) {
296+ let content_sim = Self :: jaccard_index ( centroid_tokens, tokens) ;
297+
298+ // Threshold: 0.4 implies significant vocabulary overlap
299+ // e.g., sharing variable names, types, or specific syntax patterns
300+ if content_sim > 0.4 {
301+ members. push ( file) ;
302+ // Update centroid tokens? Union them to represent the group better?
303+ // For simple greedy, keeping the first file's tokens as centroid is simpler/stable.
304+ assigned = true ;
305+ break ;
306+ }
256307 }
257308 }
258309
259310 if !assigned {
260- clusters. push ( ( fp. clone ( ) , vec ! [ file] ) ) ;
311+ clusters. push ( ( fp. clone ( ) , tokens . clone ( ) , vec ! [ file] ) ) ;
261312 }
262313 }
263314
264315 // If clustering produced only 1 group, check if it's genuinely uniform
265316 // or if we should fall back to module-based splitting.
266317 if clusters. len ( ) == 1 {
267- let ( centroid, cluster_files) = & clusters[ 0 ] ;
318+ let ( centroid, _ , cluster_files) = & clusters[ 0 ] ;
268319
269320 // If all files have non-empty, highly balanced diffs (adds ≈ removes) and are
270321 // small, they likely received the same mechanical transformation → keep grouped.
@@ -290,7 +341,7 @@ impl CommitSplitter {
290341
291342 let mut result: Vec < Vec < & ' a FileChange > > = Vec :: new ( ) ;
292343
293- for ( _, cluster_files) in clusters {
344+ for ( _, _ , cluster_files) in clusters {
294345 // E2: Sub-split large clusters that span multiple modules.
295346 // If a group has >6 files across multiple modules, the shape clustering
296347 // wasn't discriminating enough — split by module to avoid mega-groups.
0 commit comments