Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
24efa26
Add S3 credentials and download helpers (#783)
ftgoktas May 15, 2026
61f21e0
Merge branch 'develop' into feature/fgoktas/s3-obs-ingest
ftgoktas May 15, 2026
9e459de
Fix S3 session cookies (#783)
ftgoktas May 15, 2026
d611e0c
Update yaml
ftgoktas May 15, 2026
f9697cd
Use authenticated session in HTTPS path
ftgoktas May 15, 2026
1610032
Try cmr method
ftgoktas May 15, 2026
8ecbe42
Update config for tempo
ftgoktas May 15, 2026
b8a95e1
Make NO2 uppercase
ftgoktas May 15, 2026
e033bf7
Fix abort when not in dry run
ftgoktas May 15, 2026
4fc7c10
Change default dates
ftgoktas May 15, 2026
cf43c31
Handle corrupt files
ftgoktas May 15, 2026
81fb55b
add s3 public download and yamls for tropomi no2 and co
jeromebarre May 20, 2026
f2fd0fa
remove tempo
jeromebarre May 20, 2026
edd2190
limit times
jeromebarre May 20, 2026
4f2c05c
Fix IndentationError in _download_obs_s3_secure introduced in previou…
jeromebarre May 20, 2026
943a1cb
fix name
ftgoktas May 21, 2026
76f2a22
Remove dead code
ftgoktas May 21, 2026
f219411
Remove boto3
ftgoktas May 21, 2026
71a31fb
Add documentation (#783)
ftgoktas May 22, 2026
0511f9b
Fix pycode (#783)
ftgoktas May 22, 2026
fc0eaaa
Merge branch 'develop' into feature/tempo-no2-ingest
ftgoktas Jun 1, 2026
922d547
Merge branch 'feature/tempo-no2-ingest' into feature/ingest_tropomi_n…
jeromebarre Jun 3, 2026
22a90f6
reintroduce changes removed with merge
jeromebarre Jun 3, 2026
7c94dc4
remove useless method
jeromebarre Jun 3, 2026
12b0fff
Merge branch 'develop' into feature/ingest_tropomi_no2_co
jeromebarre Jun 3, 2026
3205542
tropomi tropo column doesnt exists
jeromebarre Jun 3, 2026
7f2a90b
revert original date in config
jeromebarre Jun 3, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# TROPOMI S5P CO Total Column L2 — IODA converter configuration
# Converter: tropomi_no2_co_nc2ioda.py (must be available in `converter_path`
# or in jedi_bundle/build/bin/)
#
# Invocation (one call per cycle, all granules passed together):
# python3 <jedi_bundle>/build/bin/tropomi_no2_co_nc2ioda.py
# -i <granule1.nc> <granule2.nc> ...
# -o <cycle_dir>/ioda/tropomi_s5p_co_total/tropomi_s5p_co_total_YYYYMMDDHH.nc
# -v co
# -c total
# -n 0.0
# -q 0.5

# Name of the ioda-converters Python script.
converter_script: tropomi_no2_co_nc2ioda.py

# Output filename template (must match the source pattern in
# ingest_observations/tropomi_s5p_co_total.yaml).
output_filename_template: "tropomi_s5p_co_total_%Y%m%d%H.nc"

# Additional flags passed verbatim to the converter after -i and -o.
extra_flags:
-v: co # variable name
-c: total # column type
-n: 0.0 # no thinning
-q: 0.5 # quality value threshold
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# TROPOMI S5P NO2 Tropospheric Column L2 — IODA converter configuration
# Converter: tropomi_no2_co_nc2ioda.py (must be available in `converter_path`
# or in jedi_bundle/build/bin/)
#
# Invocation (one call per cycle, all granules passed together):
# python3 <jedi_bundle>/build/bin/tropomi_no2_co_nc2ioda.py
# -i <granule1.nc> <granule2.nc> ...
# -o <cycle_dir>/ioda/tropomi_s5p_no2_tropo/tropomi_s5p_no2_tropo_YYYYMMDDHH.nc
# -v no2
# -c troposphere
# -n 0.0
# -q 0.75

# Name of the ioda-converters Python script.
converter_script: tropomi_no2_co_nc2ioda.py

# Output filename template (must match the source pattern in
# ingest_observations/tropomi_s5p_no2_tropo.yaml).
output_filename_template: "tropomi_s5p_no2_tropo_%Y%m%d%H.nc"

# Additional flags passed verbatim to the converter after -i and -o.
extra_flags:
-v: no2 # variable name
-c: troposphere # column type
-n: 0.0 # no thinning
-q: 0.75 # quality value threshold
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# TROPOMI S5P CO Total Column L2 — download configuration
# Instrument: TROPOMI on Sentinel-5P
# Product: NRTI L2__CO____ (Near Real-Time total column CO)
# Source: MEEO S3 public bucket (no credentials required)
#
# Files are named:
# S5P_NRTI_L2__CO_____YYYYMMDDTHHmmss_*.nc

retrieval_method: s3_public
s3_source: 's3://meeo-s5p/NRTI/L2__CO____/YYYY/MM/DD'

# Maximum duration of a single TROPOMI orbit granule. Used to extend the
# file search window backwards so that orbits starting before window_begin
# but containing data within the DA window are not missed.
# TROPOMI orbits are approximately 101 minutes; PT2H provides a safe margin.
max_orbit_duration: PT2H
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# TROPOMI S5P NO2 Tropospheric Column L2 — download configuration
# Instrument: TROPOMI on Sentinel-5P
# Product: NRTI L2__NO2___ (Near Real-Time tropospheric NO2)
# Source: MEEO S3 public bucket (no credentials required)
#
# Files are named:
# S5P_NRTI_L2__NO2____YYYYMMDDTHHmmss_*.nc

retrieval_method: s3_public
s3_source: 's3://meeo-s5p/NRTI/L2__NO2___/YYYY/MM/DD'

# Maximum duration of a single TROPOMI orbit granule. Used to extend the
# file search window backwards so that orbits starting before window_begin
# but containing data within the DA window are not missed.
# TROPOMI orbits are approximately 101 minutes; PT2H provides a safe margin.
max_orbit_duration: PT2H
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# TROPOMI S5P CO Total Column L2 — R2D2 ingestion configuration
# Source: output of ConvertObsToIoda, relative to the cycle work directory.

acquisition_method: local
source: ioda/tropomi_s5p_co_total/tropomi_s5p_co_total_%Y%m%d%H.nc # in CYCLE_DIR
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# TROPOMI S5P NO2 Tropospheric Column L2 — R2D2 ingestion configuration
# Source: output of ConvertObsToIoda, relative to the cycle work directory.

acquisition_method: local
source: ioda/tropomi_s5p_no2_tropo/tropomi_s5p_no2_tropo_%Y%m%d%H.nc # in CYCLE_DIR
4 changes: 2 additions & 2 deletions src/swell/configuration/jedi/observation_ioda_names.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -354,8 +354,8 @@ ioda instrument names:
full name: TROPOMI tropospheric column NO2
inst type: retrieval
provider : esa
- ioda name: tropomi_s5p_co_tropo
full name: TROPOMI tropospheric column CO
- ioda name: tropomi_s5p_co_total
full name: TROPOMI S5P total column CO
inst type: retrieval
provider : esa
- ioda name: omps_o3_nm_total
Expand Down
6 changes: 4 additions & 2 deletions src/swell/suites/ingest_obs/suite_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,10 @@ class SuiteConfig(QuestionContainer, Enum):
],
geos_cf=[
qd.window_length("PT6H"),
qd.obs_to_download(['tempo_no2_tropo']),
qd.obs_to_ingest(['tempo_no2_tropo']),
qd.obs_to_download(['omps_o3_nm_total', 'tropomi_s5p_no2_tropo',
'tropomi_s5p_co_total', 'tempo_no2_tropo']),
qd.obs_to_ingest(['omps_o3_nm_total', 'tropomi_s5p_no2_tropo',
'tropomi_s5p_co_total', 'tempo_no2_tropo']),
qd.converter_path(
"/discover/nobackup/projects/jcsda/s2127/maryamao/"
"jedi-bundle/build-intel-1.9/bin/"
Expand Down
135 changes: 133 additions & 2 deletions src/swell/tasks/download_obs.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,18 @@
"""
Task for downloading raw observation files from remote servers.

Three retrieval methods are supported, selected by ``retrieval_method`` in
the per-obs download YAML:
Downloads native observation files (e.g. HDF5, NetCDF) from HTTPS
servers such as NASA GES DISC, from public AWS S3 buckets, or from
authenticated HTTPS endpoints for NASA ASDC datasets.

``https`` (default)
Scrapes an HTML directory listing and streams files via HTTPS.
Authentication uses ``~/.netrc``.

``s3_public``
Downloads from a publicly readable S3 bucket using anonymous boto3
access (e.g. ``s3://meeo-s5p``). Requires ``boto3``.

``cmr``
Queries the NASA CMR API for granule URLs, then downloads via
authenticated HTTPS (Earthdata ``~/.netrc``). Use this for NASA ASDC
Expand Down Expand Up @@ -136,6 +141,10 @@ def _download_obs(
"""
retrieval_method = obs_config.get('retrieval_method', 'https')

if retrieval_method == 's3_public':
return self._download_obs_s3_public(
obs_config, obs_name, window_begin_dto, window_end_dto, dry_run)

if retrieval_method == 'cmr':
return self._download_obs_cmr(
obs_config, obs_name, window_begin_dto, window_end_dto, dry_run)
Expand Down Expand Up @@ -220,6 +229,114 @@ def _download_obs(

return downloaded, failed

def _download_obs_s3_public(
    self,
    obs_config: dict,
    obs_name: str,
    window_begin_dto: datetime.datetime,
    window_end_dto: datetime.datetime,
    dry_run: bool,
) -> tuple[int, int]:
    """Download files for one observation type from a publicly readable
    S3 bucket using anonymous (unsigned) boto3 access.

    Every object found under the resolved prefix of each calendar day
    between ``window_begin_dto - max_orbit_duration`` and
    ``window_end_dto`` is downloaded into
    ``<cycle_dir>/download/<obs_name>/``. Files that already exist at
    the destination are not downloaded again.

    The ``obs_config`` dict must contain:

    - ``s3_source``: S3 URI template with ``YYYY``, ``MM``, ``DD``
      placeholders, e.g.
      ``s3://meeo-s5p/NRTI/L2__NO2___/YYYY/MM/DD``. The task aborts if
      the value does not start with ``s3://`` or names no bucket.

    Optional keys:

    - ``max_orbit_duration``: ISO-8601 duration; extends the search
      window backwards (default ``PT0H``).

    In dry-run mode the prefixes that would be listed are logged and
    nothing is created, listed, or downloaded.

    Returns ``(n_downloaded, n_failed)``. ``n_downloaded`` also counts
    files that were already present locally; ``n_failed`` counts each
    day whose listing failed plus each object whose download failed.
    """
    try:
        import boto3
        from botocore import UNSIGNED
        from botocore.config import Config as BotocoreConfig
        from botocore.exceptions import BotoCoreError, ClientError
    except ImportError:
        # NOTE(review): this relies on logger.abort() not returning;
        # otherwise the boto names used below would be undefined.
        self.logger.abort(
            "boto3 is required for 's3_public' retrieval but is not installed. "
            'Install it with: pip install boto3')

    # Validate the URI up front: blindly slicing off len('s3://')
    # characters would turn a malformed value into a wrong bucket name.
    s3_source_template = obs_config['s3_source']
    scheme = 's3://'
    if not s3_source_template.startswith(scheme):
        self.logger.abort(
            f"Invalid 's3_source' for {obs_name}: expected an 's3://' URI, "
            f"got '{s3_source_template}'")

    without_scheme = s3_source_template[len(scheme):]
    bucket, _, prefix_template = without_scheme.partition('/')
    if not bucket:
        self.logger.abort(
            f"Invalid 's3_source' for {obs_name}: no bucket name in "
            f"'{s3_source_template}'")

    max_orbit_dur = isodate.parse_duration(
        obs_config.get('max_orbit_duration', 'PT0H'))

    search_start = window_begin_dto - max_orbit_dur
    search_end = window_end_dto
    day_dates = self._day_slots(search_start, search_end)

    if dry_run:
        for day_date in day_dates:
            prefix = self._resolve_path(prefix_template, day_date)
            self.logger.info(
                f' [DRY RUN] Would list s3://{bucket}/{prefix}')
        return 0, 0

    dest_dir = os.path.join(self.cycle_dir(), 'download', obs_name)
    os.makedirs(dest_dir, exist_ok=True)

    # Unsigned requests: the bucket is public, so no credentials are used.
    s3_client = boto3.client(
        's3',
        config=BotocoreConfig(signature_version=UNSIGNED))
    # The paginator does not depend on the day, so build it once.
    paginator = s3_client.get_paginator('list_objects_v2')

    downloaded = 0
    failed = 0

    for day_date in day_dates:
        prefix = self._resolve_path(prefix_template, day_date)
        self.logger.info(f' Listing s3://{bucket}/{prefix}')

        try:
            pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
            # Pages are fetched lazily, so the iteration itself must
            # stay inside the try block. Keys ending in '/' are
            # "folder" placeholder objects with no file name; they
            # would otherwise map onto dest_dir itself and be
            # miscounted as an already-downloaded file.
            keys = [
                obj['Key']
                for page in pages
                for obj in page.get('Contents', [])
                if not obj['Key'].endswith('/')
            ]
        except (BotoCoreError, ClientError) as exc:
            self.logger.error(
                f' Failed to list s3://{bucket}/{prefix}: {exc}')
            failed += 1
            continue

        if not keys:
            self.logger.info(
                f' No objects found under s3://{bucket}/{prefix}')
            continue

        self.logger.info(
            f' {day_date}: {len(keys)} object(s) found')

        for key in keys:
            filename = os.path.basename(key)
            dest_path = os.path.join(dest_dir, filename)

            if os.path.exists(dest_path):
                self.logger.info(f' Already exists, skipping: {filename}')
                downloaded += 1
                continue

            try:
                s3_client.download_file(bucket, key, dest_path)
            except (BotoCoreError, ClientError) as exc:
                self.logger.error(
                    f' Failed to download {filename}: {exc}')
                failed += 1
            else:
                self.logger.info(f' Downloaded: {filename}')
                downloaded += 1

    return downloaded, failed

def _download_obs_cmr(
self,
obs_config: dict,
Expand Down Expand Up @@ -414,6 +531,20 @@ def _create_earthdata_session(self) -> requests.Session:
# Slot/date helpers
# ------------------------------------------------------------------

def _day_slots(
self,
search_start: datetime.datetime,
search_end: datetime.datetime,
) -> list[datetime.date]:
"""Return a list of unique date objects from search_start to search_end."""
days = []
current = search_start.date()
end_date = search_end.date()
while current <= end_date:
days.append(current)
current += datetime.timedelta(days=1)
return days

def _hour_slots(
self,
search_start: datetime.datetime,
Expand Down
Loading