From 3461a39c7adf20577321d16f228591d3efdcc2aa Mon Sep 17 00:00:00 2001 From: dipto Date: Fri, 27 Mar 2026 15:26:34 -0400 Subject: [PATCH 1/2] stabilize Azure detector hash_v2 with deterministic iteration ProcessData in the Azure Entra v2 detector iterated Go maps non-deterministically, causing the same credential to produce different (clientId, tenantId) pairings across scanner runs. This yielded different RawV2 values, different hash_v2 hashes, and duplicate secret rows in the database (CSM-1857 secondary issue). Iterate map keys in sorted order via slices.Sorted(maps.Keys(...)) and clone caller maps with maps.Clone before verification-driven mutations. Same inputs now always produce the same RawV2/hash_v2. Made-with: Cursor --- .../azure_entra/serviceprincipal/v2/spv2.go | 39 ++-- .../v2/spv2_determinism_test.go | 173 ++++++++++++++++++ 2 files changed, 195 insertions(+), 17 deletions(-) create mode 100644 pkg/detectors/azure_entra/serviceprincipal/v2/spv2_determinism_test.go diff --git a/pkg/detectors/azure_entra/serviceprincipal/v2/spv2.go b/pkg/detectors/azure_entra/serviceprincipal/v2/spv2.go index 3b09b0fca0f4..affb4a8934e6 100644 --- a/pkg/detectors/azure_entra/serviceprincipal/v2/spv2.go +++ b/pkg/detectors/azure_entra/serviceprincipal/v2/spv2.go @@ -3,8 +3,10 @@ package v2 import ( "context" "errors" + "maps" "net/http" "regexp" + "slices" "strings" "github.com/trufflesecurity/trufflehog/v3/pkg/common" @@ -74,8 +76,13 @@ func ProcessData(ctx context.Context, clientSecrets, clientIds, tenantIds map[st logCtx := logContext.AddLogger(ctx) invalidClientsForTenant := make(map[string]map[string]struct{}) + // Clone maps so verification-driven deletions don't mutate the caller's + // data or produce non-deterministic results across scanner runs. + activeClients := maps.Clone(clientIds) + activeTenants := maps.Clone(tenantIds) + SecretLoop: - for clientSecret := range clientSecrets { + for _, clientSecret := range slices.Sorted(maps.Keys(clientSecrets)) { var ( r *detectors.Result clientId string @@ -83,12 +90,17 @@ SecretLoop: ) ClientLoop: - for cId := range clientIds { + for _, cId := range slices.Sorted(maps.Keys(activeClients)) { + if _, ok := activeClients[cId]; !ok { + continue + } clientId = cId - for tId := range tenantIds { + for _, tId := range slices.Sorted(maps.Keys(activeTenants)) { + if _, ok := activeTenants[tId]; !ok { + continue + } tenantId = tId - // Skip known invalid tenants. invalidClients := invalidClientsForTenant[tenantId] if invalidClients == nil { invalidClients = map[string]struct{}{} @@ -100,15 +112,12 @@ SecretLoop: if verify { if !azure_entra.TenantExists(logCtx, client, tenantId) { - // Tenant doesn't exist - delete(tenantIds, tenantId) + delete(activeTenants, tenantId) continue } - // Tenant exists, ensure this isn't attempted as a clientId. - delete(clientIds, tenantId) + delete(activeClients, tenantId) isVerified, extraData, verificationErr := serviceprincipal.VerifyCredentials(ctx, client, tenantId, clientId, clientSecret) - // Handle errors. if verificationErr != nil { switch { case errors.Is(verificationErr, serviceprincipal.ErrConditionalAccessPolicy): @@ -118,18 +127,15 @@ SecretLoop: case errors.Is(verificationErr, serviceprincipal.ErrSecretExpired): continue SecretLoop case errors.Is(verificationErr, serviceprincipal.ErrTenantNotFound): - // Tenant doesn't exist. This shouldn't happen with the check above. - delete(tenantIds, tenantId) + delete(activeTenants, tenantId) continue case errors.Is(verificationErr, serviceprincipal.ErrClientNotFoundInTenant): - // Tenant is valid but the ClientID doesn't exist. invalidClients[clientId] = struct{}{} continue } } - // The result is verified or there's only one associated client and tenant. - if isVerified || (len(clientIds) == 1 && len(tenantIds) == 1) { + if isVerified || (len(activeClients) == 1 && len(activeTenants) == 1) { r = createResult(tenantId, clientId, clientSecret, isVerified, extraData, verificationErr) break ClientLoop } @@ -138,11 +144,10 @@ SecretLoop: } if r == nil { - // Only include the clientId and tenantId if we're confident which one it is. - if len(clientIds) != 1 { + if len(activeClients) != 1 { clientId = "" } - if len(tenantIds) != 1 { + if len(activeTenants) != 1 { tenantId = "" } r = createResult(tenantId, clientId, clientSecret, false, nil, nil) diff --git a/pkg/detectors/azure_entra/serviceprincipal/v2/spv2_determinism_test.go b/pkg/detectors/azure_entra/serviceprincipal/v2/spv2_determinism_test.go new file mode 100644 index 000000000000..e46f4bfb80b1 --- /dev/null +++ b/pkg/detectors/azure_entra/serviceprincipal/v2/spv2_determinism_test.go @@ -0,0 +1,173 @@ +package v2 + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestProcessData_RawV2DependsOnIDCount shows that the same Azure client +// secret produces different RawV2 values depending on how many candidate +// client/tenant IDs appear in the chunk. +// +// With verify=false, ProcessData populates RawV2 only when there is exactly +// one client ID and exactly one tenant ID. If either set is ambiguous (>1), +// the IDs are cleared and RawV2 is nil. +// +// This is the root cause of CSM-1857's secondary issue: the same logical +// secret can get different hash_v2 values across scans if the surrounding +// chunk context changes, producing duplicate secret rows in the database. +func TestProcessData_RawV2DependsOnIDCount(t *testing.T) { + const ( + clientSecret = "abc4Q~fake-secret-that-is-long-enough1234567" + clientID1 = "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + clientID2 = "f9e8d7c6-b5a4-3210-fedc-ba9876543210" + tenantID1 = "11111111-2222-3333-4444-555566667777" + tenantID2 = "aaaaaaaa-bbbb-cccc-dddd-eeeeffff0000" + ) + + tests := []struct { + name string + clientIDs map[string]struct{} + tenantIDs map[string]struct{} + wantRawV2 bool + }{ + { + name: "single client and tenant", + clientIDs: map[string]struct{}{clientID1: {}}, + tenantIDs: map[string]struct{}{tenantID1: {}}, + wantRawV2: true, + }, + { + name: "multiple clients clears IDs", + clientIDs: map[string]struct{}{clientID1: {}, clientID2: {}}, + tenantIDs: map[string]struct{}{tenantID1: {}}, + wantRawV2: false, + }, + { + name: "multiple tenants clears IDs", + clientIDs: map[string]struct{}{clientID1: {}}, + tenantIDs: map[string]struct{}{tenantID1: {}, tenantID2: {}}, + wantRawV2: false, + }, + { + name: "multiple clients and tenants clears IDs", + clientIDs: map[string]struct{}{clientID1: {}, clientID2: {}}, + tenantIDs: map[string]struct{}{tenantID1: {}, tenantID2: {}}, + wantRawV2: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + secrets := map[string]struct{}{clientSecret: {}} + results := ProcessData(context.Background(), secrets, tc.clientIDs, tc.tenantIDs, false, nil) + + require.Len(t, results, 1) + assert.Equal(t, []byte(clientSecret), results[0].Raw, "Raw should always be the client secret") + + if tc.wantRawV2 { + assert.NotNil(t, results[0].RawV2, "RawV2 should be populated with unambiguous IDs") + } else { + assert.Nil(t, results[0].RawV2, "RawV2 should be nil when IDs are ambiguous") + } + }) + } +} + +// TestProcessData_DeterministicRawV2 verifies that ProcessData produces +// identical RawV2 on repeated calls with the same inputs. Before the sorted +// iteration fix, Go map randomization could cause different (clientId, tenantId) +// pairings across runs. +func TestProcessData_DeterministicRawV2(t *testing.T) { + const ( + clientSecret = "abc4Q~fake-secret-that-is-long-enough1234567" + clientID = "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + tenantID = "11111111-2222-3333-4444-555566667777" + ) + + var firstRawV2 []byte + for i := 0; i < 50; i++ { + results := ProcessData( + context.Background(), + map[string]struct{}{clientSecret: {}}, + map[string]struct{}{clientID: {}}, + map[string]struct{}{tenantID: {}}, + false, nil, + ) + require.Len(t, results, 1) + if i == 0 { + firstRawV2 = results[0].RawV2 + require.NotNil(t, firstRawV2) + } else { + assert.Equal(t, firstRawV2, results[0].RawV2, + "RawV2 must be identical across repeated calls (iteration %d)", i) + } + } +} + +// TestProcessData_DoesNotMutateCallerMaps verifies that ProcessData does not +// modify the maps passed by the caller. +func TestProcessData_DoesNotMutateCallerMaps(t *testing.T) { + const clientSecret = "abc4Q~fake-secret-that-is-long-enough1234567" + + secrets := map[string]struct{}{clientSecret: {}} + clientIDs := map[string]struct{}{ + "a1b2c3d4-e5f6-7890-abcd-ef1234567890": {}, + "f9e8d7c6-b5a4-3210-fedc-ba9876543210": {}, + } + tenantIDs := map[string]struct{}{ + "11111111-2222-3333-4444-555566667777": {}, + "aaaaaaaa-bbbb-cccc-dddd-eeeeffff0000": {}, + } + + origSecretLen := len(secrets) + origClientLen := len(clientIDs) + origTenantLen := len(tenantIDs) + + _ = ProcessData(context.Background(), secrets, clientIDs, tenantIDs, false, nil) + + assert.Len(t, secrets, origSecretLen, "caller's secrets map must not be mutated") + assert.Len(t, clientIDs, origClientLen, "caller's clientIDs map must not be mutated") + assert.Len(t, tenantIDs, origTenantLen, "caller's tenantIDs map must not be mutated") +} + +// TestProcessData_SameSecretDifferentRawV2 demonstrates the chain: +// the same client secret scanned with different chunk contexts produces +// different RawV2 bytes depending on whether IDs are ambiguous. +func TestProcessData_SameSecretDifferentRawV2(t *testing.T) { + const ( + clientSecret = "abc4Q~fake-secret-that-is-long-enough1234567" + clientID = "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + tenantID = "11111111-2222-3333-4444-555566667777" + ) + + // Scan 1: chunk has exactly one client ID and one tenant ID. + results1 := ProcessData( + context.Background(), + map[string]struct{}{clientSecret: {}}, + map[string]struct{}{clientID: {}}, + map[string]struct{}{tenantID: {}}, + false, nil, + ) + require.Len(t, results1, 1) + rawV2Populated := results1[0].RawV2 + require.NotNil(t, rawV2Populated, "scan 1: RawV2 should be populated") + + // Scan 2: same secret, but chunk now contains an extra client-like UUID. + results2 := ProcessData( + context.Background(), + map[string]struct{}{clientSecret: {}}, + map[string]struct{}{clientID: {}, "f9e8d7c6-b5a4-3210-fedc-ba9876543210": {}}, + map[string]struct{}{tenantID: {}}, + false, nil, + ) + require.Len(t, results2, 1) + rawV2Nil := results2[0].RawV2 + assert.Nil(t, rawV2Nil, "scan 2: RawV2 should be nil due to ambiguous client IDs") + + assert.NotEqual(t, rawV2Populated, rawV2Nil, + "same logical secret produces different RawV2 depending on chunk context") +} From 6fc2ccbfd79e275d52a15a8234a7d9cb08e574ae Mon Sep 17 00:00:00 2001 From: dipto Date: Tue, 31 Mar 2026 19:27:48 -0400 Subject: [PATCH 2/2] fix: exercise verify=true in DoesNotMutateCallerMaps test The test previously passed verify=false, but all delete calls that mutate maps are inside the `if verify` block, making the test a no-op. Use verify=true with a mock HTTP client that triggers tenant-not-found deletions so the test actually validates the maps.Clone protection. Made-with: Cursor --- .../v2/spv2_determinism_test.go | 46 +++++++++++++++---- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/pkg/detectors/azure_entra/serviceprincipal/v2/spv2_determinism_test.go b/pkg/detectors/azure_entra/serviceprincipal/v2/spv2_determinism_test.go index e46f4bfb80b1..d08019de528c 100644 --- a/pkg/detectors/azure_entra/serviceprincipal/v2/spv2_determinism_test.go +++ b/pkg/detectors/azure_entra/serviceprincipal/v2/spv2_determinism_test.go @@ -2,6 +2,10 @@ package v2 import ( "context" + "io" + "maps" + "net/http" + "strings" "testing" "github.com/stretchr/testify/assert" @@ -109,7 +113,10 @@ func TestProcessData_DeterministicRawV2(t *testing.T) { } // TestProcessData_DoesNotMutateCallerMaps verifies that ProcessData does not -// modify the maps passed by the caller. +// modify the maps passed by the caller. It uses verify=true with a mock HTTP +// client so that the verification code path (which contains delete calls) is +// actually exercised. With verify=false, no deletes occur and the test would +// pass trivially even without maps.Clone. func TestProcessData_DoesNotMutateCallerMaps(t *testing.T) { const clientSecret = "abc4Q~fake-secret-that-is-long-enough1234567" @@ -118,20 +125,39 @@ func TestProcessData_DoesNotMutateCallerMaps(t *testing.T) { "a1b2c3d4-e5f6-7890-abcd-ef1234567890": {}, "f9e8d7c6-b5a4-3210-fedc-ba9876543210": {}, } + // Use tenant IDs unique to this test to avoid polluting the package-level + // tenantCache shared with other tests. tenantIDs := map[string]struct{}{ - "11111111-2222-3333-4444-555566667777": {}, - "aaaaaaaa-bbbb-cccc-dddd-eeeeffff0000": {}, + "cccccccc-dddd-eeee-ffff-000011112222": {}, + "dddddddd-eeee-ffff-0000-111122223333": {}, } - origSecretLen := len(secrets) - origClientLen := len(clientIDs) - origTenantLen := len(tenantIDs) + origSecrets := maps.Clone(secrets) + origClientIDs := maps.Clone(clientIDs) + origTenantIDs := maps.Clone(tenantIDs) + + // Return 400 for every request so TenantExists returns false, triggering + // delete(activeTenants, tenantId) inside the verify block. + mockClient := &http.Client{ + Transport: roundTripFunc(func(*http.Request) (*http.Response, error) { + return &http.Response{ + StatusCode: http.StatusBadRequest, + Body: io.NopCloser(strings.NewReader("")), + }, nil + }), + } + + _ = ProcessData(context.Background(), secrets, clientIDs, tenantIDs, true, mockClient) + + assert.Equal(t, origSecrets, secrets, "caller's secrets map must not be mutated") + assert.Equal(t, origClientIDs, clientIDs, "caller's clientIDs map must not be mutated") + assert.Equal(t, origTenantIDs, tenantIDs, "caller's tenantIDs map must not be mutated") +} - _ = ProcessData(context.Background(), secrets, clientIDs, tenantIDs, false, nil) +type roundTripFunc func(*http.Request) (*http.Response, error) - assert.Len(t, secrets, origSecretLen, "caller's secrets map must not be mutated") - assert.Len(t, clientIDs, origClientLen, "caller's clientIDs map must not be mutated") - assert.Len(t, tenantIDs, origTenantLen, "caller's tenantIDs map must not be mutated") +func (f roundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) { + return f(req) } // TestProcessData_SameSecretDifferentRawV2 demonstrates the chain: