-
Notifications
You must be signed in to change notification settings - Fork 0
feat: webapp file ingestion changes #206
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
39e8715
b80c003
d70ed0e
0403afb
fbec688
bb3447b
3f06762
572a928
2654d85
0822f64
674ddcb
0af9c60
145082b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -12,6 +12,7 @@ | |
| import logging | ||
| from sqlalchemy.exc import IntegrityError | ||
| import re | ||
| import requests | ||
| from ..validation import HardValidationError | ||
| from ..validation_error_formatter import format_validation_error | ||
| import pandas as pd | ||
|
|
@@ -180,6 +181,33 @@ class ValidationResult(BaseModel): | |
| source: str | ||
|
|
||
|
|
||
class BronzeImportRequest(BaseModel):
    """Request to import a dataset from the institution's bronze volume into GCS."""

    # Bare file name of the bronze dataset to import. The endpoint rejects
    # empty/whitespace-only names and names containing '/'.
    name: str
|
|
||
|
|
||
class BronzeImportResponse(BaseModel):
    """Response for bronze import request."""

    # Canonical file name as listed in the bronze volume (may differ from the
    # requested name in casing — matching is case-insensitive).
    file_name: str
    # Human-readable status, e.g. "Upload successful.".
    message: str
|
|
||
|
|
||
def _upload_file_bytes_to_signed_url(file_bytes: bytes, upload_signed_url: str) -> None:
    """Upload file bytes to a signed GCS URL using the same request shape as the worker path.

    Raises:
        requests.RequestException: if GCS responds with anything other than 200.
    """
    response = requests.put(
        upload_signed_url,
        data=file_bytes,
        headers={"Content-Type": "text/csv"},
        timeout=600,
    )
    # Signed-URL PUTs return 200 on success; anything else is a failure.
    if response.status_code == 200:
        return
    raise requests.RequestException(f"{response.status_code} {response.text}")
|
|
||
|
|
||
| class DataOverview(BaseModel): | ||
| """All data for a given institution (batches and files).""" | ||
|
|
||
|
|
@@ -1812,6 +1840,127 @@ def get_upload_url( | |
| raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) | ||
|
|
||
|
|
||
@router.get("/{inst_id}/input/bronze-datasets", response_model=list[str])
def list_bronze_datasets(
    inst_id: str,
    current_user: Annotated[BaseUser, Depends(get_current_active_user)],
    sql_session: Annotated[Session, Depends(get_session)],
    databricks_control: Annotated[DatabricksControl, Depends(DatabricksControl)],
) -> Any:
    """List `.csv` files directly under the institution's Databricks bronze volume root.

    Raises:
        HTTPException: 404 if the institution does not exist, 501 if the
            Databricks bronze volume is not configured, 500 for any other
            Databricks-side failure.
    """
    has_access_to_inst_or_err(inst_id, current_user)
    local_session.set(sql_session)

    inst = (
        local_session.get()
        .execute(select(InstTable).where(InstTable.id == str_to_uuid(inst_id)))
        .scalar_one_or_none()
    )
    if inst is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Institution not found.",
        )

    try:
        return databricks_control.list_bronze_volume_csvs(inst.name)
    except ValueError as ve:
        msg = str(ve)
        # Distinguish "feature not configured" (501) from genuine failures (500).
        # Chain with `from ve` so the original traceback isn't lost (B904).
        if "not configured" in msg.lower():
            raise HTTPException(
                status_code=status.HTTP_501_NOT_IMPLEMENTED, detail=msg
            ) from ve
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=msg
        ) from ve
|
|
||
|
|
||
@router.post(
    "/{inst_id}/input/upload-from-volume-to-gcs-bucket",
    response_model=BronzeImportResponse,
)
def upload_from_volume_to_gcs_bucket(
    inst_id: str,
    req: BronzeImportRequest,
    current_user: Annotated[BaseUser, Depends(get_current_active_user)],
    sql_session: Annotated[Session, Depends(get_session)],
    storage_control: Annotated[StorageControl, Depends(StorageControl)],
    databricks_control: Annotated[DatabricksControl, Depends(DatabricksControl)],
) -> Any:
    """Import a selected dataset from the institution's bronze volume into GCS unvalidated/.

    Imports exactly one dataset per call; a frontend importing several files
    (e.g. a course file plus a cohort file) calls this endpoint once per file.

    Raises:
        HTTPException: 404 if the institution or dataset is not found, 422 for
            an invalid dataset name, 501 if Databricks is not configured,
            400 for a rejected download, 500 for upload or unexpected failures.
    """
    has_access_to_inst_or_err(inst_id, current_user)
    local_session.set(sql_session)

    inst = (
        local_session.get()
        .execute(select(InstTable).where(InstTable.id == str_to_uuid(inst_id)))
        .scalar_one_or_none()
    )
    if inst is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Institution not found.",
        )

    requested_name = (req.name or "").strip()
    if not requested_name:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail="Dataset name is required.",
        )
    # Reject path separators so the name can't escape the bronze volume root.
    if "/" in requested_name:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail="Dataset name can't contain '/'.",
        )

    # Ensure this is actually present in the bronze root (and matches naming rules).
    try:
        available = databricks_control.list_bronze_volume_csvs(inst.name)
    except ValueError as ve:
        msg = str(ve)
        # Chain with `from ve` so the original traceback isn't lost (B904).
        if "not configured" in msg.lower():
            raise HTTPException(
                status_code=status.HTTP_501_NOT_IMPLEMENTED, detail=msg
            ) from ve
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=msg
        ) from ve

    # Case-insensitive match; respond with the volume's canonical file name.
    available_map = {x.lower(): x for x in available}
    file_name = available_map.get(requested_name.lower())
    if not file_name:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Bronze dataset not found.",
        )

    stream = None
    try:
        stream = databricks_control.download_bronze_volume_file(inst.name, file_name)
        file_bytes = stream.read()
        upload_url = storage_control.generate_upload_signed_url(
            get_external_bucket_name(inst_id), file_name
        )
        _upload_file_bytes_to_signed_url(file_bytes, upload_url)
    except ValueError as ve:
        # Download rejected by the Databricks layer (e.g. bad file name).
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)
        ) from ve
    except requests.RequestException as rexc:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to upload dataset to GCS: {rexc}",
        ) from rexc
    except Exception as e:
        # Boundary catch-all so unexpected failures surface as a clean 500.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Unexpected error importing dataset: {e}",
        ) from e
    finally:
        # Best-effort close of the Databricks download stream; never mask the
        # real error with a failure during cleanup.
        if stream is not None and hasattr(stream, "close"):
            try:
                stream.close()
            except Exception:
                pass

    return {"file_name": file_name, "message": "Upload successful."}
|
|
||
|
|
||
| @router.post("/{inst_id}/add-custom-school-job/{job_run_id}") | ||
| def add_custom_school_job( | ||
| inst_id: str, | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So the frontend flow will be: FE first lists the available datasets through
"/{inst_id}/input/bronze-datasets", then the user selects a CSV and clicks upload or something (which then makes a call to "/{inst_id}/input/upload-from-volume-to-gcs-bucket"), and this creates an unvalidated batch? Then we proceed with validation to create a batch, correct?
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Exactly. Frontend would implement something like a dropdown to select the course file and another to select a cohort file. Then, when you click upload, it calls the endpoint
/{inst_id}/input/upload-from-volume-to-gcs-bucket to pull both files into the GCS bucket as unvalidated.