5 changes: 4 additions & 1 deletion backend/btrixcloud/crawls.py
@@ -388,7 +388,10 @@ async def list_crawls(

    async def get_config_crawl_ids(self, cids: list[UUID]) -> list[str]:
        """get list of crawl ids belonging to given crawlconfigs"""
-        res = self.crawls.find({"cid": {"$in": cids}}, {"_id": 1})
+        res = self.crawls.find(
+            {"cid": {"$in": cids}, "state": {"$in": SUCCESSFUL_STATES}},
+            {"_id": 1},
+        )
        res_list = await res.to_list()
        return [res["_id"] for res in res_list]

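With the new `state` filter, get_config_crawl_ids only returns crawls that finished successfully, so adding a workflow to a collection no longer pulls in canceled or failed crawls. A minimal sketch of the query semantics, using fake crawl documents and an assumed subset of SUCCESSFUL_STATES (the real list is defined elsewhere in the backend; this is not PR code):

# Illustrative only: assumed subset of SUCCESSFUL_STATES and fake crawl docs.
SUCCESSFUL_STATES = ["complete", "stopped_by_user"]

crawls = [
    {"_id": "crawl-1", "cid": "workflow-a", "state": "complete"},
    {"_id": "crawl-2", "cid": "workflow-a", "state": "canceled"},
    {"_id": "crawl-3", "cid": "workflow-b", "state": "complete"},
]

def config_crawl_ids(crawls, cids):
    # Mirrors the Mongo filter: cid must match AND state must be successful.
    return [
        c["_id"]
        for c in crawls
        if c["cid"] in cids and c["state"] in SUCCESSFUL_STATES
    ]

print(config_crawl_ids(crawls, ["workflow-a"]))  # ['crawl-1'] -- the canceled crawl is excluded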
9 changes: 9 additions & 0 deletions backend/test/conftest.py
@@ -29,6 +29,7 @@
_auto_add_config_id = None
_all_crawls_config_id = None
_all_crawls_delete_config_id = None
+_canceled_crawl_config_id = None

NON_DEFAULT_ORG_NAME = "Non-default org"
NON_DEFAULT_ORG_SLUG = "non-default-org"
@@ -586,6 +587,9 @@ def canceled_crawl_id(admin_auth_headers, default_org_id):
    )
    data = r.json()

+    global _canceled_crawl_config_id
+    _canceled_crawl_config_id = data["id"]
+
    crawl_id = data["run_now_job"]

    # Cancel crawl after it's started
@@ -620,6 +624,11 @@ def canceled_crawl_id(admin_auth_headers, default_org_id):
    return crawl_id


+@pytest.fixture(scope="session")
+def canceled_crawl_config_id(canceled_crawl_id):
+    return _canceled_crawl_config_id
+
+
@pytest.fixture(scope="session")
def url_list_config_id(crawler_auth_headers, default_org_id):
    # Start crawl.
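Because canceled_crawl_config_id depends on canceled_crawl_id, pytest runs the crawl-creating fixture first, so the module-level global is populated before it is returned. A hypothetical usage sketch (not part of this PR):

# Hypothetical test: requesting the config-id fixture transitively starts
# and cancels the crawl, so the workflow id is guaranteed to be set here.
def test_canceled_workflow_config_id(canceled_crawl_config_id):
    assert canceled_crawl_config_id is not None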
27 changes: 26 additions & 1 deletion backend/test/test_collections.py
@@ -362,11 +362,16 @@ def test_add_remove_config_crawls_from_collection(
    crawler_config_id,
    admin_crawl_id,
    admin_config_id,
+    canceled_crawl_id,
+    canceled_crawl_config_id,
):
    # Add crawls by config and crawl id
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/add",
-        json={"crawlIds": [admin_crawl_id], "crawlconfigIds": [crawler_config_id]},
+        json={
+            "crawlIds": [admin_crawl_id],
+            "crawlconfigIds": [crawler_config_id, canceled_crawl_config_id],
+        },
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
@@ -382,6 +387,26 @@
    assert data["dateLatest"]
    assert data["topPageHosts"]

+    # Verify crawls were added to collection
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
+        headers=crawler_auth_headers,
+    )
+    assert _coll_id in r.json()["collectionIds"]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
+        headers=crawler_auth_headers,
+    )
+    assert _coll_id in r.json()["collectionIds"]
+
+    # Verify non-successful crawl from workflow wasn't added to collection
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{canceled_crawl_id}/replay.json",
+        headers=crawler_auth_headers,
+    )
+    assert _coll_id not in r.json()["collectionIds"]
+
    # Remove crawls by crawl and config id, and test that specifying a
    # config and also a crawl in that config separately is handled
    # gracefully
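The removal step referenced by the comment above is truncated in this view. As a rough sketch only, with the endpoint name and payload assumed to mirror the /add call rather than confirmed by this diff, such a request would look roughly like:

# Rough sketch, not the PR's actual code: removal assumed to mirror /add.
r = requests.post(
    f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/remove",
    json={
        "crawlIds": [admin_crawl_id],
        "crawlconfigIds": [crawler_config_id, canceled_crawl_config_id],
    },
    headers=crawler_auth_headers,
)
assert r.status_code == 200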