refactor(services): add Jaccard clustering, NUL-delimited git, conflict fix

Sephyi · Sephyi · commit 8db162de3d17 · 2026-03-08T03:51:27.000+01:00
Splitter uses hybrid similarity (diff-shape + Jaccard vocabulary overlap)
for more accurate file grouping. Git service uses -z flag for safe path
parsing. Conflict detector uses component-based path matching and scans
only added lines; its marker strings are built with concat! so the detector's own source is not flagged.
diff --git a/src/services/git.rs b/src/services/git.rs
@@ -58,9 +58,9 @@ impl GitService {
     pub async fn get_staged_changes(&self, max_file_lines: usize) -> Result<StagedChanges> {
         self.check_state()?;
 
-        // Two calls total (down from N+1): name-status + unified diff
+        // Two calls total: name-status (NUL delimited) + unified diff
         let (status_output, diff_output) = tokio::try_join!(
-            self.run_git(&["diff", "--cached", "--name-status", "--no-renames"]),
+            self.run_git(&["diff", "-z", "--cached", "--name-status", "--no-renames"]),
             self.run_git(&[
                 "diff",
                 "--cached",
@@ -75,33 +75,36 @@ impl GitService {
         let mut files = Vec::new();
         let mut stats = DiffStats::default();
 
-        for line in status_output.lines() {
-            if line.is_empty() {
-                continue;
-            }
+        let mut parts = status_output.split('\0').filter(|s| !s.is_empty());
 
-            let parts: Vec<&str> = line.splitn(2, '\t').collect();
-            if parts.len() != 2 {
-                continue;
-            }
+        while let Some(status_code) = parts.next() {
+            let path_str = match parts.next() {
+                Some(p) => p,
+                None => break, // Should not happen with well-formed git output
+            };
 
-            let status = match parts[0] {
+            let status = match status_code {
                 "A" => ChangeStatus::Added,
                 "M" => ChangeStatus::Modified,
                 "D" => ChangeStatus::Deleted,
                 _ => continue,
             };
 
-            let file_path = Path::new(parts[1]).to_path_buf();
+            let file_path = PathBuf::from(path_str);
             let category = FileCategory::from_path(&file_path);
             let is_binary = Self::is_binary_path(&file_path);
 
             if is_binary {
                 continue;
             }
 
+            // file_diffs is looked up by string key, so convert the path back to a string.
+            // Caveat: -z yields raw, unquoted paths here, but git may C-quote unusual paths (core.quotePath) in the
+            // "diff --git a/... b/..." headers that split_unified_diff reads; a key mismatch then yields an empty diff.
+            let diff_key = file_path.to_string_lossy();
+
             let diff = file_diffs
-                .get(parts[1])
+                .get(diff_key.as_ref())
                 .map(|d| Self::truncate_diff(d, max_file_lines))
                 .unwrap_or_default();
 
diff --git a/src/services/safety.rs b/src/services/safety.rs
@@ -79,15 +79,25 @@ pub fn scan_for_secrets(changes: &StagedChanges) -> Vec<SecretMatch> {
 pub fn check_for_conflicts(changes: &StagedChanges) -> bool {
     for file in &changes.files {
         // Skip docs/test files where conflict markers might be intentional examples
-        if file.path.to_string_lossy().contains("test")
-            || file.path.to_string_lossy().contains("doc")
-            || file.path.to_string_lossy().contains("example")
-        {
+        // Match whole path components ("tests", "docs", "examples") so "documentation" is not skipped; NOTE: the "test" substring check still skips e.g. "testing_utils"
+        if file.path.components().any(|c| {
+            let s = c.as_os_str().to_string_lossy();
+            s == "tests" || s == "docs" || s == "examples" || s.contains("test")
+        }) {
             continue;
         }
 
-        if file.diff.contains("<<<<<<<") || file.diff.contains(">>>>>>>") {
-            return true;
+        // Only check added lines for conflict markers
+        for line in file.diff.lines() {
+            if line.starts_with('+') && !line.starts_with("+++") {
+                // Split strings to prevent self-detection in this file's own diff
+                const CONFLICT_START: &str = concat!("<", "<<<<<<");
+                const CONFLICT_END: &str = concat!(">", ">>>>>>");
+
+                if line.contains(CONFLICT_START) || line.contains(CONFLICT_END) {
+                    return true;
+                }
+            }
         }
     }
     false
diff --git a/src/services/splitter.rs b/src/services/splitter.rs
@@ -2,7 +2,7 @@
 //
 // SPDX-License-Identifier: PolyForm-Noncommercial-1.0.0
 
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::path::{Path, PathBuf};
 
 use crate::domain::{CodeSymbol, CommitType, FileCategory, FileChange, StagedChanges};
@@ -228,6 +228,44 @@ impl CommitSplitter {
         }
     }
 
+    /// Compute the Jaccard similarity |A ∩ B| / |A ∪ B| of two token sets; two empty sets score 1.0.
+    fn jaccard_index(a: &HashSet<String>, b: &HashSet<String>) -> f64 {
+        if a.is_empty() && b.is_empty() {
+            return 1.0;
+        }
+        let intersection = a.intersection(b).count();
+        let union = a.len() + b.len() - intersection;
+        if union == 0 {
+            0.0
+        } else {
+            intersection as f64 / union as f64
+        }
+    }
+
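+    /// Extract the set of significant tokens from the added/removed lines of a diff.
+    /// Captures the vocabulary of the change (names, keywords, types) as alphanumeric runs longer than two bytes; identifiers therefore split at `_`.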
+    fn tokenize_diff(diff: &str) -> HashSet<String> {
+        let mut tokens = HashSet::new();
+        for line in diff.lines() {
+            // Only process add/remove lines, skip headers
+            if (!line.starts_with('+') && !line.starts_with('-'))
+                || line.starts_with("+++")
+                || line.starts_with("---")
+            {
+                continue;
+            }
+
+            // Split by non-alphanumeric characters to get words
+            for word in line[1..].split(|c: char| !c.is_alphanumeric()) {
+                if word.len() > 2 {
+                    // Tiny tokens (two bytes or fewer) are skipped
+                    tokens.insert(word.to_string());
+                }
+            }
+        }
+        tokens
+    }
+
     /// Group source files by diff-shape similarity.
     ///
     /// Files with very similar fingerprints (same kind of transformation)
@@ -237,34 +275,47 @@ impl CommitSplitter {
             return vec![files.to_vec()];
         }
 
-        let fingerprints: Vec<(&FileChange, DiffFingerprint)> = files
+        // Pre-calculate features for all files
+        let features: Vec<(&FileChange, DiffFingerprint, HashSet<String>)> = files
             .iter()
-            .map(|f| (*f, Self::diff_fingerprint(f)))
+            .map(|f| (*f, Self::diff_fingerprint(f), Self::tokenize_diff(&f.diff)))
             .collect();
 
-        // Greedy clustering: assign each file to the first similar group, or create new
-        let mut clusters: Vec<(DiffFingerprint, Vec<&'a FileChange>)> = Vec::new();
+        // Greedy clustering with hybrid similarity
+        let mut clusters: Vec<(DiffFingerprint, HashSet<String>, Vec<&'a FileChange>)> = Vec::new();
 
-        for (file, fp) in &fingerprints {
+        for (file, fp, tokens) in &features {
             let mut assigned = false;
 
-            for (centroid, members) in &mut clusters {
-                if centroid.is_similar(fp) {
-                    members.push(file);
-                    assigned = true;
-                    break;
+            for (centroid_fp, centroid_tokens, members) in &mut clusters {
+                // Hybrid similarity:
+                // 1. Must be statistically similar (shape/size)
+                // 2. Must share content vocabulary (Jaccard index)
+
+                if centroid_fp.is_similar(fp) {
+                    let content_sim = Self::jaccard_index(centroid_tokens, tokens);
+
+                    // Threshold: 0.4 implies significant vocabulary overlap
+                    // e.g., sharing variable names, types, or specific syntax patterns
+                    if content_sim > 0.4 {
+                        members.push(file);
+                        // The centroid tokens are intentionally left unchanged: for simple greedy clustering,
+                        // keeping the first file's tokens is simpler and stable. Unioning in each member's tokens is a possible refinement.
+                        assigned = true;
+                        break;
+                    }
                 }
             }
 
             if !assigned {
-                clusters.push((fp.clone(), vec![file]));
+                clusters.push((fp.clone(), tokens.clone(), vec![file]));
             }
         }
 
         // If clustering produced only 1 group, check if it's genuinely uniform
         // or if we should fall back to module-based splitting.
         if clusters.len() == 1 {
-            let (centroid, cluster_files) = &clusters[0];
+            let (centroid, _, cluster_files) = &clusters[0];
 
             // If all files have non-empty, highly balanced diffs (adds ≈ removes) and are
             // small, they likely received the same mechanical transformation → keep grouped.
@@ -290,7 +341,7 @@ impl CommitSplitter {
 
         let mut result: Vec<Vec<&'a FileChange>> = Vec::new();
 
-        for (_, cluster_files) in clusters {
+        for (_, _, cluster_files) in clusters {
             // E2: Sub-split large clusters that span multiple modules.
             // If a group has >6 files across multiple modules, the shape clustering
             // wasn't discriminating enough — split by module to avoid mega-groups.