5 changes: 4 additions & 1 deletion backend/btrixcloud/crawls.py
@@ -388,7 +388,10 @@ async def list_crawls(

    async def get_config_crawl_ids(self, cids: list[UUID]) -> list[str]:
        """get list of crawl ids belonging to given crawlconfigs"""
-        res = self.crawls.find({"cid": {"$in": cids}}, {"_id": 1})
+        res = self.crawls.find(
+            {"cid": {"$in": cids}, "state": {"$in": SUCCESSFUL_STATES}},
+            {"_id": 1},
+        )
        res_list = await res.to_list()
        return [res["_id"] for res in res_list]

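With the new `state` filter, get_config_crawl_ids only returns crawls that finished successfully, so adding a workflow to a collection no longer pulls in canceled or failed crawls. A minimal sketch of the query semantics, using fake crawl documents and an assumed subset of SUCCESSFUL_STATES (the real list is defined elsewhere in the backend; this is not PR code):

# Illustrative only: assumed subset of SUCCESSFUL_STATES and fake crawl docs.
SUCCESSFUL_STATES = ["complete", "stopped_by_user"]

crawls = [
    {"_id": "crawl-1", "cid": "workflow-a", "state": "complete"},
    {"_id": "crawl-2", "cid": "workflow-a", "state": "canceled"},
    {"_id": "crawl-3", "cid": "workflow-b", "state": "complete"},
]

def config_crawl_ids(crawls, cids):
    # Mirrors the Mongo filter: cid must match AND state must be successful.
    return [
        c["_id"]
        for c in crawls
        if c["cid"] in cids and c["state"] in SUCCESSFUL_STATES
    ]

print(config_crawl_ids(crawls, ["workflow-a"]))  # ['crawl-1'] -- the canceled crawl is excluded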
9 changes: 9 additions & 0 deletions backend/test/conftest.py
@@ -29,6 +29,7 @@
_auto_add_config_id = None
_all_crawls_config_id = None
_all_crawls_delete_config_id = None
+_canceled_crawl_config_id = None

NON_DEFAULT_ORG_NAME = "Non-default org"
NON_DEFAULT_ORG_SLUG = "non-default-org"
@@ -586,6 +587,9 @@ def canceled_crawl_id(admin_auth_headers, default_org_id):
    )
    data = r.json()

+    global _canceled_crawl_config_id
+    _canceled_crawl_config_id = data["id"]
+
    crawl_id = data["run_now_job"]

    # Cancel crawl after it's started
@@ -620,6 +624,11 @@ def canceled_crawl_id(admin_auth_headers, default_org_id):
    return crawl_id


+@pytest.fixture(scope="session")
+def canceled_crawl_config_id(canceled_crawl_id):
+    return _canceled_crawl_config_id
+
+
@pytest.fixture(scope="session")
def url_list_config_id(crawler_auth_headers, default_org_id):
    # Start crawl.
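Because canceled_crawl_config_id depends on canceled_crawl_id, pytest runs the crawl-creating fixture first, so the module-level global is populated before it is returned. A hypothetical usage sketch (not part of this PR):

# Hypothetical test: requesting the config-id fixture transitively starts
# and cancels the crawl, so the workflow id is guaranteed to be set here.
def test_canceled_workflow_config_id(canceled_crawl_config_id):
    assert canceled_crawl_config_id is not None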
27 changes: 26 additions & 1 deletion backend/test/test_collections.py
@@ -362,11 +362,16 @@ def test_add_remove_config_crawls_from_collection(
    crawler_config_id,
    admin_crawl_id,
    admin_config_id,
+    canceled_crawl_id,
+    canceled_crawl_config_id,
):
    # Add crawls by config and crawl id
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
-        json={"crawlIds": [admin_crawl_id], "crawlconfigIds": [crawler_config_id]},
+        json={
+            "crawlIds": [admin_crawl_id],
+            "crawlconfigIds": [crawler_config_id, canceled_crawl_config_id],
+        },
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
@@ -382,6 +387,26 @@
    assert data["dateLatest"]
    assert data["topPageHosts"]

+    # Verify crawls were added to collection
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
+        headers=crawler_auth_headers,
+    )
+    assert _coll_id in r.json()["collectionIds"]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
+        headers=crawler_auth_headers,
+    )
+    assert _coll_id in r.json()["collectionIds"]
+
+    # Verify non-successful crawl from workflow wasn't added to collection
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{canceled_crawl_id}/replay.json",
+        headers=crawler_auth_headers,
+    )
+    assert _coll_id not in r.json()["collectionIds"]
+
    # Remove crawls by crawl and config id, and test that specifying a
    # config and also a crawl in that config separately is handled
    # gracefully
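The removal step referenced by the comment above is truncated in this view. As a rough sketch only, with the endpoint name and payload assumed to mirror the /add call rather than confirmed by this diff, such a request would look roughly like:

# Rough sketch, not the PR's actual code: removal assumed to mirror /add.
r = requests.post(
    f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
    json={
        "crawlIds": [admin_crawl_id],
        "crawlconfigIds": [crawler_config_id, canceled_crawl_config_id],
    },
    headers=crawler_auth_headers,
)
assert r.status_code == 200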