diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index a856e6fc20..4d038f8028 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -388,7 +388,10 @@ async def list_crawls(
 
     async def get_config_crawl_ids(self, cids: list[UUID]) -> list[str]:
         """get list of crawl ids belonging to given crawlconfigs"""
-        res = self.crawls.find({"cid": {"$in": cids}}, {"_id": 1})
+        res = self.crawls.find(
+            {"cid": {"$in": cids}, "state": {"$in": SUCCESSFUL_STATES}},
+            {"_id": 1},
+        )
         res_list = await res.to_list()
         return [res["_id"] for res in res_list]
 
diff --git a/backend/test/conftest.py b/backend/test/conftest.py
index 2b2c013ba7..c9613c692f 100644
--- a/backend/test/conftest.py
+++ b/backend/test/conftest.py
@@ -29,6 +29,7 @@
 _auto_add_config_id = None
 _all_crawls_config_id = None
 _all_crawls_delete_config_id = None
+_canceled_crawl_config_id = None
 
 NON_DEFAULT_ORG_NAME = "Non-default org"
 NON_DEFAULT_ORG_SLUG = "non-default-org"
@@ -586,6 +587,9 @@ def canceled_crawl_id(admin_auth_headers, default_org_id):
     )
     data = r.json()
 
+    global _canceled_crawl_config_id
+    _canceled_crawl_config_id = data["id"]
+
    crawl_id = data["run_now_job"]
 
     # Cancel crawl after it's started
@@ -620,6 +624,11 @@ def canceled_crawl_id(admin_auth_headers, default_org_id):
     return crawl_id
 
 
+@pytest.fixture(scope="session")
+def canceled_crawl_config_id(canceled_crawl_id):
+    return _canceled_crawl_config_id
+
+
 @pytest.fixture(scope="session")
 def url_list_config_id(crawler_auth_headers, default_org_id):
     # Start crawl.
diff --git a/backend/test/test_collections.py b/backend/test/test_collections.py
index cd8061f098..37a7a05fa4 100644
--- a/backend/test/test_collections.py
+++ b/backend/test/test_collections.py
@@ -362,11 +362,16 @@ def test_add_remove_config_crawls_from_collection(
     crawler_config_id,
     admin_crawl_id,
     admin_config_id,
+    canceled_crawl_id,
+    canceled_crawl_config_id,
 ):
     # Add crawls by config and crawl id
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
-        json={"crawlIds": [admin_crawl_id], "crawlconfigIds": [crawler_config_id]},
+        json={
+            "crawlIds": [admin_crawl_id],
+            "crawlconfigIds": [crawler_config_id, canceled_crawl_config_id],
+        },
         headers=crawler_auth_headers,
     )
     assert r.status_code == 200
@@ -382,6 +387,26 @@
     assert data["dateLatest"]
     assert data["topPageHosts"]
 
+    # Verify crawls were added to collection
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
+        headers=crawler_auth_headers,
+    )
+    assert _coll_id in r.json()["collectionIds"]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
+        headers=crawler_auth_headers,
+    )
+    assert _coll_id in r.json()["collectionIds"]
+
+    # Verify non-successful crawl from workflow wasn't added to collection
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{canceled_crawl_id}/replay.json",
+        headers=crawler_auth_headers,
+    )
+    assert _coll_id not in r.json()["collectionIds"]
+
     # Remove crawls by crawl and config id, and test that specifying a
     # config and also a crawl in that config separately is handled
     # gracefully
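
Note: the behavioral change here is that `get_config_crawl_ids` now returns only crawls in `SUCCESSFUL_STATES`, so canceled (or otherwise non-successful) crawls are skipped when an entire workflow is added to a collection. Below is a minimal sketch of that filter semantics run against plain dicts rather than MongoDB; the exact members of `SUCCESSFUL_STATES` shown are assumptions for illustration, and the real constant is the one imported in `crawls.py`.

```python
# Sketch only: mimics the Mongo query
#   {"cid": {"$in": cids}, "state": {"$in": SUCCESSFUL_STATES}}
# against in-memory dicts. State values below are assumed examples.
SUCCESSFUL_STATES = ["complete", "stopped_by_user"]

crawls = [
    {"_id": "crawl-1", "cid": "cfg-a", "state": "complete"},
    {"_id": "crawl-2", "cid": "cfg-a", "state": "canceled"},
    {"_id": "crawl-3", "cid": "cfg-b", "state": "complete"},
]

def get_config_crawl_ids(cids):
    """Return ids of successful crawls belonging to the given configs."""
    return [
        crawl["_id"]
        for crawl in crawls
        if crawl["cid"] in cids and crawl["state"] in SUCCESSFUL_STATES
    ]

# The canceled crawl-2 is excluded, which is exactly what the new
# test assertions on canceled_crawl_id check via replay.json:
assert get_config_crawl_ids(["cfg-a"]) == ["crawl-1"]
```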