Skip to content

Commit fd89102

Browse files
fix: redacting user retirement data in lms (#37886)
Redact PII fields first (name, email, username), then delete. In case an ETL tool is syncing data to a downstream data warehouse, and treats the deletes as soft-deletes, the data will have first been redacted, protecting the sensitive PII.
1 parent b878549 commit fd89102

5 files changed

Lines changed: 214 additions & 20 deletions

File tree

openedx/core/djangoapps/user_api/accounts/tests/test_retirement_views.py

Lines changed: 104 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@
1212
from django.contrib.sites.models import Site
1313
from django.core import mail
1414
from django.core.cache import cache
15+
from django.db import connection
1516
from django.test import TestCase
17+
from django.test.utils import CaptureQueriesContext
1618
from django.urls import reverse
1719
from opaque_keys.edx.keys import CourseKey
1820
from rest_framework import status
@@ -1069,9 +1071,109 @@ def cleanup_and_assert_status(self, data=None, expected_status=status.HTTP_204_N
10691071
assert response.status_code == expected_status
10701072
return response
10711073

1072-
def test_simple_success(self):
1074+
def _assert_redacted_update_delete_queries(self, queries, redacted_username, redacted_email, redacted_name):
1075+
"""
1076+
Helper method to verify UPDATE and DELETE queries use ID-based filtering and correct field-value assignments.
1077+
Args:
1078+
queries: List of captured query dicts from CaptureQueriesContext
1079+
redacted_username: Expected redacted username value
1080+
redacted_email: Expected redacted email value
1081+
redacted_name: Expected redacted name value
1082+
"""
1083+
update_queries = [q for q in queries if 'UPDATE' in q['sql'] and 'user_api_userretirementstatus' in q['sql']]
1084+
delete_queries = [q for q in queries if 'DELETE' in q['sql'] and 'user_api_userretirementstatus' in q['sql']]
1085+
# Should have exactly 1 bulk UPDATE and 1 bulk DELETE query (not individual per-record queries)
1086+
assert len(update_queries) == 1, f"Expected 1 UPDATE query, found {len(update_queries)}"
1087+
assert len(delete_queries) == 1, f"Expected 1 DELETE query, found {len(delete_queries)}"
1088+
# Verify UPDATE query redacts records with the correct field-value assignments and uses ID-based filtering
1089+
update_query = update_queries[0]
1090+
sql_lower = update_query['sql']
1091+
# Ensure original_username, original_email, and original_name are set to redacted values
1092+
assert f'"original_username" = \'{redacted_username}\'' in sql_lower, (
1093+
f"UPDATE query missing '\"original_username\" = {redacted_username}': {sql_lower}"
1094+
)
1095+
assert f'"original_email" = \'{redacted_email}\'' in sql_lower, (
1096+
f"UPDATE query missing '\"original_email\" = {redacted_email}': {sql_lower}"
1097+
)
1098+
assert f'"original_name" = \'{redacted_name}\'' in sql_lower, (
1099+
f"UPDATE query missing '\"original_name\" = {redacted_name}': {sql_lower}"
1100+
)
1101+
# Ensure UPDATE uses ID-based filtering
1102+
assert '"id" IN' in sql_lower or 'WHERE "id"' in sql_lower, (
1103+
f"UPDATE query should use ID filtering to prevent over-update, but got: {sql_lower}"
1104+
)
1105+
# Verify DELETE is from the correct table and uses ID-based filtering
1106+
delete_query = delete_queries[0]
1107+
sql_lower = delete_query['sql']
1108+
assert 'user_api_userretirementstatus' in sql_lower, (
1109+
f"DELETE query against unexpected table: {sql_lower}"
1110+
)
1111+
assert '"id" IN' in sql_lower or 'WHERE "id"' in sql_lower, (
1112+
f"DELETE query should use ID filtering to prevent over-deletion, but got: {sql_lower}"
1113+
)
1114+
1115+
def test_default_redacted_values(self):
1116+
"""
1117+
Test basic cleanup with default redacted values.
1118+
Verify that redaction (UPDATE) happens before deletion (DELETE).
1119+
Captures actual SQL queries to ensure UPDATE queries contain correct field-value assignments.
1120+
"""
1121+
with CaptureQueriesContext(connection) as context:
1122+
self.cleanup_and_assert_status()
1123+
1124+
# Verify records are deleted after redaction
1125+
retirements = UserRetirementStatus.objects.all()
1126+
assert retirements.count() == 0
1127+
1128+
# Verify UPDATE and DELETE queries with default 'redacted' value
1129+
self._assert_redacted_update_delete_queries(context.captured_queries, 'redacted', 'redacted', 'redacted')
1130+
1131+
def test_custom_redacted_values(self):
1132+
"""Test that custom redacted values are applied before deletion."""
1133+
custom_username = 'username-redacted-12345'
1134+
custom_email = 'email-redacted-67890'
1135+
custom_name = 'name-redacted-abcde'
1136+
1137+
data = {
1138+
'usernames': self.usernames,
1139+
'redacted_username': custom_username,
1140+
'redacted_email': custom_email,
1141+
'redacted_name': custom_name
1142+
}
1143+
1144+
with CaptureQueriesContext(connection) as context:
1145+
self.cleanup_and_assert_status(data=data)
1146+
1147+
# Verify records are deleted after redaction
1148+
retirements = UserRetirementStatus.objects.all()
1149+
assert retirements.count() == 0
1150+
1151+
# Verify UPDATE and DELETE queries with custom redacted values
1152+
self._assert_redacted_update_delete_queries(
1153+
context.captured_queries, custom_username, custom_email, custom_name
1154+
)
1155+
1156+
def test_does_not_delete_unrelated_redacted_records(self):
1157+
"""
1158+
Verify cleanup doesn't delete unrelated records with coincidental redacted values.
1159+
Regression test for over-deletion bug where deletion was filtered by field values
1160+
(original_username='redacted') instead of by primary key.
1161+
"""
1162+
# Create an unrelated record that already has redacted field values
1163+
other_user = UserFactory()
1164+
other_retirement = create_retirement_status(other_user, state=self.complete_state)
1165+
other_retirement.original_username = 'redacted'
1166+
other_retirement.original_email = 'redacted'
1167+
other_retirement.original_name = 'redacted'
1168+
other_retirement.save()
1169+
other_id = other_retirement.id
1170+
# Clean up only self.usernames records
10731171
self.cleanup_and_assert_status()
1074-
assert not UserRetirementStatus.objects.all()
1172+
# Verify target records were deleted
1173+
target_count = UserRetirementStatus.objects.filter(user__username__in=self.usernames).count()
1174+
assert target_count == 0, f"Expected 0 target records, found {target_count}"
1175+
# Verify unrelated record was NOT deleted (not a target of cleanup)
1176+
assert UserRetirementStatus.objects.filter(id=other_id).exists()
10751177

10761178
def test_leaves_other_users(self):
10771179
remaining_usernames = []

openedx/core/djangoapps/user_api/accounts/views.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1019,14 +1019,20 @@ def cleanup(self, request):
10191019
10201020
```
10211021
{
1022-
'usernames': ['user1', 'user2', ...]
1022+
'usernames': ['user1', 'user2', ...],
1023+
'redacted_username': 'Value to store in username field',
1024+
'redacted_email': 'Value to store in email field',
1025+
'redacted_name': 'Value to store in name field'
10231026
}
10241027
```
10251028
1026-
Deletes a batch of retirement requests by username.
1029+
Redacts and then deletes a batch of retirement requests by username.
10271030
"""
10281031
try:
10291032
usernames = request.data["usernames"]
1033+
redacted_username = request.data.get("redacted_username", "redacted")
1034+
redacted_email = request.data.get("redacted_email", "redacted")
1035+
redacted_name = request.data.get("redacted_name", "redacted")
10301036

10311037
if not isinstance(usernames, list):
10321038
raise TypeError("Usernames should be an array.")
@@ -1040,7 +1046,20 @@ def cleanup(self, request):
10401046
if len(usernames) != len(retirements):
10411047
raise UserRetirementStatus.DoesNotExist("Not all usernames exist in the COMPLETE state.")
10421048

1043-
retirements.delete()
1049+
# Redact PII fields first, then delete. In case an ETL tool is syncing data
1050+
# to a downstream data warehouse, and treats the deletes as soft-deletes,
1051+
# the data will have first been redacted, protecting the sensitive PII.
1052+
# Get the IDs of the retirements to update/delete
1053+
retirement_ids = list(retirements.values_list('id', flat=True))
1054+
# Update by IDs
1055+
UserRetirementStatus.objects.filter(id__in=retirement_ids).update(
1056+
original_username=redacted_username,
1057+
original_email=redacted_email,
1058+
original_name=redacted_name
1059+
)
1060+
# Delete by IDs
1061+
UserRetirementStatus.objects.filter(id__in=retirement_ids, current_state=complete_state).delete()
1062+
10441063
return Response(status=status.HTTP_204_NO_CONTENT)
10451064
except (RetirementStateError, UserRetirementStatus.DoesNotExist, TypeError) as exc:
10461065
return Response(str(exc), status=status.HTTP_400_BAD_REQUEST)

scripts/user_retirement/retirement_archive_and_cleanup.py

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -196,16 +196,19 @@ def _archive_retirements_or_exit(config, learners, dry_run=False):
196196
FAIL_EXCEPTION(ERR_ARCHIVING, 'Unexpected error occurred archiving retirements!', exc)
197197

198198

199-
def _cleanup_retirements_or_exit(config, learners):
199+
def _cleanup_retirements_or_exit(config, learners, redacted_username='redacted',
200+
redacted_email='redacted', redacted_name='redacted'):
200201
"""
201-
Bulk deletes the retirements for this run
202+
Bulk deletes the retirements for this run after redacting PII fields
202203
"""
203204
LOG('Cleaning up retirements for {} learners'.format(len(learners))) # noqa: UP032
204205
try:
205206
usernames = [l['original_username'] for l in learners]
206-
config['LMS'].bulk_cleanup_retirements(usernames)
207+
config['LMS'].bulk_cleanup_retirements(
208+
usernames, redacted_username, redacted_email, redacted_name
209+
)
207210
except Exception as exc: # pylint: disable=broad-except
208-
FAIL_EXCEPTION(ERR_DELETING, 'Unexpected error occurred deleting retirements!', exc)
211+
FAIL_EXCEPTION(ERR_DELETING, 'Unexpected error occurred redacting/deleting retirements!', exc)
209212

210213

211214
def _get_utc_now():
@@ -259,7 +262,26 @@ def _get_utc_now():
259262
help='Number of user retirements to process',
260263
type=int
261264
)
262-
def archive_and_cleanup(config_file, cool_off_days, dry_run, start_date, end_date, batch_size):
265+
@click.option(
266+
'--redacted_username',
267+
help='Value to use for redacted username field',
268+
type=str,
269+
default='redacted'
270+
)
271+
@click.option(
272+
'--redacted_email',
273+
help='Value to use for redacted email field',
274+
type=str,
275+
default='redacted'
276+
)
277+
@click.option(
278+
'--redacted_name',
279+
help='Value to use for redacted name field',
280+
type=str,
281+
default='redacted'
282+
)
283+
def archive_and_cleanup(config_file, cool_off_days, dry_run, start_date, end_date, batch_size,
284+
redacted_username, redacted_email, redacted_name):
263285
"""
264286
Cleans up UserRetirementStatus rows in LMS by:
265287
1- Getting all rows currently in COMPLETE that were created --cool_off_days ago or more,
@@ -314,7 +336,9 @@ def archive_and_cleanup(config_file, cool_off_days, dry_run, start_date, end_dat
314336
if dry_run:
315337
LOG('This is a dry-run. Exiting before any retirements are cleaned up')
316338
else:
317-
_cleanup_retirements_or_exit(config, batch)
339+
_cleanup_retirements_or_exit(
340+
config, batch, redacted_username, redacted_email, redacted_name
341+
)
318342
LOG('Archive and cleanup complete for batch #{}'.format(str(index + 1))) # noqa: UP032
319343
time.sleep(DELAY)
320344
else:

scripts/user_retirement/tests/test_retirement_archive_and_cleanup.py

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,8 @@
2828
FAKE_BUCKET_NAME = "fake_test_bucket"
2929

3030

31-
def _call_script(cool_off_days=37, batch_size=None, dry_run=None, start_date=None, end_date=None):
31+
def _call_script(cool_off_days=37, batch_size=None, dry_run=None, start_date=None, end_date=None,
32+
redacted_username=None, redacted_email=None, redacted_name=None):
3233
"""
3334
Call the archive script with the given params and a generic config file.
3435
Returns the CliRunner.invoke results
@@ -50,6 +51,12 @@ def _call_script(cool_off_days=37, batch_size=None, dry_run=None, start_date=Non
5051
base_args += ['--start_date', start_date]
5152
if end_date:
5253
base_args += ['--end_date', end_date]
54+
if redacted_username:
55+
base_args += ['--redacted_username', redacted_username]
56+
if redacted_email:
57+
base_args += ['--redacted_email', redacted_email]
58+
if redacted_name:
59+
base_args += ['--redacted_name', redacted_name]
5360

5461
result = runner.invoke(archive_and_cleanup, args=base_args)
5562
print(result)
@@ -106,7 +113,7 @@ def test_successful(*args, **kwargs):
106113
assert mock_get_access_token.call_count == 1
107114
mock_get_learners.assert_called_once()
108115
mock_bulk_cleanup_retirements.assert_called_once_with(
109-
['test1', 'test2', 'test3'])
116+
['test1', 'test2', 'test3'], 'redacted', 'redacted', 'redacted')
110117

111118
assert result.exit_code == 0
112119
assert 'Archive and cleanup complete' in result.output
@@ -134,14 +141,48 @@ def test_successful_with_batching(*args, **kwargs):
134141
# Called once to get the LMS token
135142
assert mock_get_access_token.call_count == 1
136143
mock_get_learners.assert_called_once()
137-
get_learner_calls = [call(['test1', 'test2']), call(['test3'])]
144+
get_learner_calls = [call(['test1', 'test2'], 'redacted', 'redacted', 'redacted'),
145+
call(['test3'], 'redacted', 'redacted', 'redacted')]
138146
mock_bulk_cleanup_retirements.assert_has_calls(get_learner_calls)
139147

140148
assert result.exit_code == 0
141149
assert 'Archive and cleanup complete for batch #1' in result.output
142150
assert 'Archive and cleanup complete for batch #2' in result.output
143151

144152

153+
@patch('scripts.user_retirement.utils.edx_api.BaseApiClient.get_access_token', return_value=('THIS_IS_A_JWT', None))
154+
@patch.multiple(
155+
'scripts.user_retirement.utils.edx_api.LmsApi',
156+
get_learners_by_date_and_status=DEFAULT,
157+
bulk_cleanup_retirements=DEFAULT
158+
)
159+
@mock_aws
160+
def test_successful_with_custom_redaction_values(*args, **kwargs):
161+
conn = boto3.resource('s3')
162+
conn.create_bucket(Bucket=FAKE_BUCKET_NAME)
163+
164+
mock_get_access_token = args[0]
165+
mock_get_learners = kwargs['get_learners_by_date_and_status']
166+
mock_bulk_cleanup_retirements = kwargs['bulk_cleanup_retirements']
167+
168+
mock_get_learners.return_value = fake_learners_to_retire()
169+
170+
result = _call_script(
171+
redacted_username='custom_user',
172+
redacted_email='custom@example.com',
173+
redacted_name='Custom Name'
174+
)
175+
176+
# Called once to get the LMS token
177+
assert mock_get_access_token.call_count == 1
178+
mock_get_learners.assert_called_once()
179+
mock_bulk_cleanup_retirements.assert_called_once_with(
180+
['test1', 'test2', 'test3'], 'custom_user', 'custom@example.com', 'Custom Name')
181+
182+
assert result.exit_code == 0
183+
assert 'Archive and cleanup complete' in result.output
184+
185+
145186
@patch('scripts.user_retirement.utils.edx_api.BaseApiClient.get_access_token', return_value=('THIS_IS_A_JWT', None))
146187
@patch.multiple(
147188
'scripts.user_retirement.utils.edx_api.LmsApi',
@@ -216,7 +257,7 @@ def test_bad_fetch(*_):
216257
def test_bad_lms_deletion(*_):
217258
result = _call_script()
218259
assert result.exit_code == ERR_DELETING
219-
assert 'Unexpected error occurred deleting retirements!' in result.output
260+
assert 'Unexpected error occurred redacting/deleting retirements!' in result.output
220261

221262

222263
@patch('scripts.user_retirement.utils.edx_api.BaseApiClient.get_access_token', return_value=('THIS_IS_A_JWT', None))

scripts/user_retirement/utils/edx_api.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -352,11 +352,19 @@ def retirement_retire_proctoring_backend_data(self, learner):
352352
return self._request("POST", api_url)
353353

354354
@_retry_lms_api()
355-
def bulk_cleanup_retirements(self, usernames):
356-
"""
357-
Deletes the retirements for all given usernames
358-
"""
359-
data = {"usernames": usernames}
355+
def bulk_cleanup_retirements(self, usernames, redacted_username=None,
356+
redacted_email=None, redacted_name=None):
357+
"""
358+
Redacts and then deletes the retirements for all given usernames.
359+
Optionally pass caller-defined redacted values for each PII field before deletion.
360+
"""
361+
data = {'usernames': usernames}
362+
if redacted_username is not None:
363+
data['redacted_username'] = redacted_username
364+
if redacted_email is not None:
365+
data['redacted_email'] = redacted_email
366+
if redacted_name is not None:
367+
data['redacted_name'] = redacted_name
360368
api_url = self.get_api_url("api/user/v1/accounts/retirement_cleanup")
361369
return self._request("POST", api_url, json=data)
362370

0 commit comments

Comments
 (0)