From 3060712bc33b3fed6878913846d6581a1c6c1e7a Mon Sep 17 00:00:00 2001 From: Thomas Chamberlin Date: Thu, 12 Mar 2026 18:51:20 +0000 Subject: [PATCH 1/9] Fix SDFITS index file writing to match sparrow3/GBTIDL format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The write_index() output was incompatible with GBTIDL because column names, header padding, format widths, and derived column computations all differed from the sparrow3 reference implementation. Changes: - Generate column header dynamically using sparrow3's exact format specs (EXTENSION→EXT, POLARIZATION→POL, etc. via %N.Ns truncation) - Pad header lines to 256 chars (was 200) matching sparrow3 - Use sparrow3's exact row format strings (%6d, %16.9e, %22.22s, etc.) - Implement skip-spacing for #INDEX# and ROW columns at 1M boundary - Compute CENTFREQ from WCS params (CRVAL1/CRPIX1/CDELT1/NUMCHN) instead of using CRVAL1 directly - Derive POLARIZATION from CRVAL4, PROCEDURE from OBSMODE - Map dysh/FITS column names (HDU→EXTENSION, ELEVATIO→ELEVATION, etc.) 
- Port 3 sparrow3 IndexWriterTests with character-for-character comparison against .index.expected reference files - Un-skip the roundtrip test Co-Authored-By: Claude Opus 4.6 --- src/dysh/fits/index_file.py | 461 +++++++++++++++---------- src/dysh/fits/tests/test_index_file.py | 385 ++++++++++++++++++++- 2 files changed, 648 insertions(+), 198 deletions(-) diff --git a/src/dysh/fits/index_file.py b/src/dysh/fits/index_file.py index bdc9581bb..ae85bca4b 100644 --- a/src/dysh/fits/index_file.py +++ b/src/dysh/fits/index_file.py @@ -103,6 +103,81 @@ class IndexMetadata: DYSH_TO_SDFITS_INDEX_MAP = {v: k for k, v in SDFITS_INDEX_TO_DYSH_MAP.items()} +# --- Writer format specification matching sparrow3/GBTIDL IndexWriter exactly --- + +_HEADER_LINE_LEN = 256 # Header lines padded to 256 chars (sparrow3's headerLineLen) + +# Writer column spec: (canonical_name, col_header_fmt, row_value_fmt, skip_spacing) +# Matches sparrow3 IndexWriter.cells exactly. +# - col_header_fmt: formats the column name for the header row (may truncate, e.g. 
EXTENSION → EXT) +# - row_value_fmt: formats the data value for each row +# - skip_spacing: if True, no trailing space in header; in data rows, space added only when int val < 1e6 +_WRITER_SPEC = [ + ("#INDEX#", "%7.7s", "%6d", True), + ("PROJECT", "%16.16s", "%16.16s", False), + ("FILE", "%64.64s", "%64.64s", False), + ("EXTENSION", "%3.3s", "%3d", False), + ("ROW", "%7.7s", "%6d", True), + ("SOURCE", "%32.32s", "%32.32s", False), + ("PROCEDURE", "%9.9s", "%9.9s", False), + ("OBSID", "%32.32s", "%32.32s", False), + ("E2ESCAN", "%5.5s", "%5d", False), + ("PROCSEQN", "%5.5s", "%5d", False), + ("SCAN", "%10.10s", "%10d", False), + ("POLARIZATION", "%3.3s", "%3.3s", False), + ("PLNUM", "%5.5s", "%5d", False), + ("IFNUM", "%5.5s", "%5d", False), + ("FEED", "%5.5s", "%5d", False), + ("FDNUM", "%5.5s", "%5d", False), + ("INT", "%10.10s", "%10d", False), + ("NUMCHN", "%6.6s", "%6d", False), + ("SIG", "%3.3s", "%3.3s", False), + ("CAL", "%3.3s", "%3.3s", False), + ("SAMPLER", "%12.12s", "%12.12s", False), + ("AZIMUTH", "%16.16s", "%16.9e", False), + ("ELEVATION", "%16.16s", "%16.9e", False), + ("LONGITUDE", "%16.16s", "%16.9e", False), + ("LATITUDE", "%16.16s", "%16.9e", False), + ("TRGTLONG", "%16.16s", "%16.9e", False), + ("TRGTLAT", "%16.16s", "%16.9e", False), + ("SUBREF", "%3.3s", "%3d", False), + ("LST", "%16.16s", "%16.9e", False), + ("CENTFREQ", "%16.16s", "%16.9e", False), + ("RESTFREQ", "%16.16s", "%16.9e", False), + ("VELOCITY", "%16.16s", "%16.9e", False), + ("FREQINT", "%16.16s", "%16.9e", False), + ("FREQRES", "%16.16s", "%16.9e", False), + ("DATEOBS", "%22.22s", "%22.22s", False), + ("TIMESTAMP", "%22.22s", "%22.22s", False), + ("BANDWIDTH", "%16.16s", "%16.9e", False), + ("EXPOSURE", "%16.16s", "%16.9e", False), + ("TSYS", "%16.16s", "%16.9e", False), + ("NSAVE", "%10.10s", "%10d", False), + ("PROCSCAN", "%16.16s", "%16.16s", False), + ("PROCTYPE", "%16.16s", "%16.16s", False), + ("WCALPOS", "%16.16s", "%16.16s", False), +] + +# Column names from the writer 
spec (#INDEX# → INDEX for DataFrame compatibility) +_WRITER_COLUMNS = [name if name != "#INDEX#" else "INDEX" for name, _, _, _ in _WRITER_SPEC] + +# Stokes/polarization code mapping (FITS CRVAL4 integer → string) +_POLARIZATION_MAP = { + 1: "I", + 2: "Q", + 3: "U", + 4: "V", + -1: "RR", + -2: "LL", + -3: "RL", + -4: "LR", + -5: "XX", + -6: "YY", + -7: "XY", + -8: "YX", +} + + def get_index_path(fits_path: str | Path) -> Path: """ Generate .index filename from FITS path. @@ -531,6 +606,140 @@ def read_index_incremental(index_path: Path, start_row: int = 0, end_row: int | return df +def _generate_rows_header() -> str: + """Generate the column header row matching sparrow3's IndexWriter format. + + Column names are formatted using the col_header_fmt from _WRITER_SPEC, + which truncates long names (e.g., EXTENSION → EXT, POLARIZATION → POL). + Skip-spacing columns (#INDEX#, ROW) get no trailing space in the header. + """ + header = "" + last_name = _WRITER_SPEC[-1][0] + for name, col_fmt, _row_fmt, skip_spacing in _WRITER_SPEC: + header += col_fmt % name + if name != last_name and not skip_spacing: + header += " " + return header + + +def _get_polarization(crval4) -> str: + """Translate CRVAL4 Stokes parameter integer to polarization string. + + Matches sparrow3's IndexWriter.getPolarization(). + """ + try: + return _POLARIZATION_MAP.get(int(crval4), "??") + except (ValueError, TypeError): + return "??" + + +def _get_center_frequency(crval1, crpix1, cdelt1, numchn) -> float: + """Compute center frequency from WCS parameters. + + Matches sparrow3's IndexWriter.getCenterFrequency(). + """ + center_chan = (float(numchn) / 2.0) - 0.5 + return ((center_chan - float(crpix1)) * float(cdelt1)) + float(crval1) + + +def _get_procedure(obsmode) -> str: + """Extract procedure name from OBSMODE string. + + Matches sparrow3's IndexWriter.getProcedure(). 
+ """ + if pd.isna(obsmode) or not obsmode: + return "" + return str(obsmode).split(":")[0] + + +def _translate_boolean(val) -> str: + """Convert a boolean-like value to T/F string. + + Matches sparrow3's IndexWriter.translateBoolean(). + """ + if isinstance(val, str): + upper = val.strip().upper() + if upper in ("T", "TRUE"): + return "T" + return "F" + if pd.isna(val): + return "F" + return "T" if val else "F" + + +def _prepare_for_writing(df: pd.DataFrame) -> pd.DataFrame: + """Convert a dysh/FITS DataFrame to canonical SDFITS index column names for writing. + + Computes derived columns (CENTFREQ, POLARIZATION, PROCEDURE, etc.) and renames + columns from dysh/FITS names to canonical SDFITS index names matching sparrow3. + """ + df = df.copy() + + # --- Compute derived columns before renaming --- + + # CENTFREQ: compute from WCS parameters if all are available + if "CRVAL1" in df.columns and "CRPIX1" in df.columns and "CDELT1" in df.columns and "NUMCHN" in df.columns: + df["CENTFREQ"] = [ + _get_center_frequency(r["CRVAL1"], r["CRPIX1"], r["CDELT1"], r["NUMCHN"]) for _, r in df.iterrows() + ] + elif "CRVAL1" in df.columns and "CENTFREQ" not in df.columns: + # Fallback: use CRVAL1 as approximate CENTFREQ + df["CENTFREQ"] = df["CRVAL1"] + + # POLARIZATION: derive from CRVAL4 + if "CRVAL4" in df.columns and "POLARIZATION" not in df.columns and "POL" not in df.columns: + df["POLARIZATION"] = df["CRVAL4"].apply(_get_polarization) + + # PROCEDURE: extract from OBSMODE + if "OBSMODE" in df.columns and "PROCEDURE" not in df.columns and "PROC" not in df.columns: + df["PROCEDURE"] = df["OBSMODE"].apply(_get_procedure) + + # FREQINT: from CDELT1 + if "CDELT1" in df.columns and "FREQINT" not in df.columns: + df["FREQINT"] = df["CDELT1"] + + # BANDWIDTH: from BANDWID + if "BANDWID" in df.columns and "BANDWIDTH" not in df.columns: + df["BANDWIDTH"] = df["BANDWID"] + + # SUBREF: from SUBREF_STATE + if "SUBREF_STATE" in df.columns and "SUBREF" not in df.columns and "SUB" not in 
df.columns: + df["SUBREF"] = df["SUBREF_STATE"] + + # E2ESCAN: default to 0 (sparrow3 always sets to 0) + if "E2ESCAN" not in df.columns and "E2ESC" not in df.columns: + df["E2ESCAN"] = 0 + + # --- Rename columns from dysh/FITS names to canonical SDFITS index names --- + rename_map = { + "INTNUM": "INT", + "PROJID": "PROJECT", + "DATE-OBS": "DATEOBS", + "HDU": "EXTENSION", + "OBJECT": "SOURCE", + "PROC": "PROCEDURE", + "ELEVATIO": "ELEVATION", + "CRVAL2": "LONGITUDE", + "CRVAL3": "LATITUDE", + # Abbreviated → canonical (for DataFrames from old dysh-written index files) + "EXT": "EXTENSION", + "POL": "POLARIZATION", + "E2ESC": "E2ESCAN", + "PROCS": "PROCSEQN", + "SUB": "SUBREF", + } + actual_renames = {src: dst for src, dst in rename_map.items() if src in df.columns and dst not in df.columns} + if actual_renames: + df = df.rename(columns=actual_renames) + + # --- Convert SIG/CAL to T/F strings --- + for col in ("SIG", "CAL"): + if col in df.columns: + df[col] = df[col].apply(_translate_boolean) + + return df + + def write_index(index_path: str | Path, metadata: IndexMetadata, df: pd.DataFrame): """ Write an SDFITS-compatible .index file. 
@@ -546,238 +755,108 @@ def write_index(index_path: str | Path, metadata: IndexMetadata, df: pd.DataFram Notes ----- - Creates an SDFITS-compatible ASCII format with: - - Header section with 200-character padded lines - - Data section with fixed-width columns - - Scientific notation for floats + Creates an SDFITS-compatible ASCII format matching sparrow3/GBTIDL: + - Header section with 256-character padded lines + - Data section with fixed-width columns matching sparrow3's IndexWriter + - Scientific notation for floats (%16.9e) - T/F for booleans + - Column header generated from format specs (matches sparrow3 exactly) """ index_path = Path(index_path) - # Convert dysh column names to SDFITS index names - df_sdfits = convert_dysh_to_sdfits_index(df.copy()) + # Convert to canonical SDFITS index format + df_sdfits = _prepare_for_writing(df) - # Ensure all 43 standard SDFITS index columns exist (add missing ones with default values) - standard_cols = [ - "INDEX", - "PROJECT", - "FILE", - "EXT", - "ROW", - "SOURCE", - "PROCEDURE", - "OBSID", - "E2ESC", - "PROCS", - "SCAN", - "POL", - "PLNUM", - "IFNUM", - "FEED", - "FDNUM", - "INT", - "NUMCHN", - "SIG", - "CAL", - "SAMPLER", - "AZIMUTH", - "ELEVATION", - "LONGITUDE", - "LATITUDE", - "TRGTLONG", - "TRGTLAT", - "SUB", - "LST", - "CENTFREQ", - "RESTFREQ", - "VELOCITY", - "FREQINT", - "FREQRES", - "DATEOBS", - "TIMESTAMP", - "BANDWIDTH", - "EXPOSURE", - "TSYS", - "NSAVE", - "PROCSCAN", - "PROCTYPE", - "WCALPOS", - ] - - for col in standard_cols: + # Ensure all standard columns exist with appropriate defaults + for col in _WRITER_COLUMNS: if col not in df_sdfits.columns: - # Add missing column with appropriate default value - if col in ["SIG", "CAL"]: - df_sdfits[col] = False - elif col in [ - "AZIMUTH", - "ELEVATION", - "LONGITUDE", - "LATITUDE", - "TRGTLONG", - "TRGTLAT", - "LST", - "CENTFREQ", - "RESTFREQ", - "VELOCITY", - "FREQINT", - "FREQRES", - "BANDWIDTH", - "EXPOSURE", - "TSYS", - ]: - df_sdfits[col] = 0.0 - elif 
col in [ - "INDEX", - "EXT", - "ROW", - "E2ESC", - "PROCS", - "SCAN", - "PLNUM", - "IFNUM", - "FEED", - "FDNUM", - "INT", - "NUMCHN", - "SUB", - "NSAVE", - ]: - df_sdfits[col] = 0 - else: - df_sdfits[col] = "" - - # Reorder columns to match standard order - df_sdfits = df_sdfits[standard_cols] + # Find the spec entry for this column + spec_name = "#INDEX#" if col == "INDEX" else col + spec = next((s for s in _WRITER_SPEC if s[0] == spec_name), None) + if spec: + row_fmt = spec[2] + if col == "NSAVE": + df_sdfits[col] = -1 # sparrow3 defaults NSAVE to -1 + elif "d" in row_fmt: + df_sdfits[col] = 0 + elif "e" in row_fmt: + df_sdfits[col] = 0.0 + else: + df_sdfits[col] = "" with open(index_path, "w") as f: # Write header section f.write("[header]\n") - # Write each metadata field with 200-char padding + # Write each metadata field with 256-char padding (matching sparrow3) for field in fields(IndexMetadata): key = field.name value = getattr(metadata, key) line = f"{key} = {value}" - # Pad to 200 characters - line = line.ljust(200) - f.write(line + "\n") + f.write(line.ljust(_HEADER_LINE_LEN) + "\n") # Write [rows] marker f.write("[rows]\n") - # Write column header - f.write( - "#INDEX# PROJECT FILE EXT ROW SOURCE PROCEDURE OBSID E2ESC PROCS SCAN POL PLNUM IFNUM FEED FDNUM INT NUMCHN SIG CAL SAMPLER AZIMUTH ELEVATION LONGITUDE LATITUDE TRGTLONG TRGTLAT SUB LST CENTFREQ RESTFREQ VELOCITY FREQINT FREQRES DATEOBS TIMESTAMP BANDWIDTH EXPOSURE TSYS NSAVE PROCSCAN PROCTYPE WCALPOS\n" - ) + # Write column header (generated dynamically to match sparrow3 format) + f.write(_generate_rows_header() + "\n") # Write data rows for _, row in df_sdfits.iterrows(): - line = _format_sdfits_row(row) - f.write(line + "\n") + f.write(_format_sdfits_row(row) + "\n") def _format_sdfits_row(row: pd.Series) -> str: - """Format a single row in SDFITS index format. + """Format a single row matching sparrow3's IndexWriter format. 
Parameters ---------- row : pd.Series - Row with SDFITS index column names + Row with canonical SDFITS index column names Returns ------- str Formatted row string """ - # SDFITS column format specification - # Format: (column_name, width, format_type) - # format_type: 'i'=integer (right-aligned), 'f'=float (scientific), 's'=string (left-aligned), 'b'=boolean (T/F) - sdfits_spec = [ - ("INDEX", 7, "i"), - ("PROJECT", 16, "s"), - ("FILE", 64, "s"), - ("EXT", 3, "i"), - ("ROW", 7, "i"), - ("SOURCE", 32, "s"), - ("PROCEDURE", 9, "s"), - ("OBSID", 32, "s"), - ("E2ESC", 5, "i"), - ("PROCS", 5, "i"), - ("SCAN", 10, "i"), - ("POL", 3, "s"), - ("PLNUM", 5, "i"), - ("IFNUM", 5, "i"), - ("FEED", 5, "i"), - ("FDNUM", 5, "i"), - ("INT", 10, "i"), - ("NUMCHN", 6, "i"), - ("SIG", 3, "b"), - ("CAL", 3, "b"), - ("SAMPLER", 12, "s"), - ("AZIMUTH", 16, "f"), - ("ELEVATION", 16, "f"), - ("LONGITUDE", 16, "f"), - ("LATITUDE", 16, "f"), - ("TRGTLONG", 16, "f"), - ("TRGTLAT", 16, "f"), - ("SUB", 3, "i"), - ("LST", 16, "f"), - ("CENTFREQ", 16, "f"), - ("RESTFREQ", 16, "f"), - ("VELOCITY", 16, "f"), - ("FREQINT", 16, "f"), - ("FREQRES", 16, "f"), - ("DATEOBS", 21, "s"), - ("TIMESTAMP", 21, "s"), - ("BANDWIDTH", 16, "f"), - ("EXPOSURE", 16, "f"), - ("TSYS", 16, "f"), - ("NSAVE", 10, "i"), - ("PROCSCAN", 16, "s"), - ("PROCTYPE", 16, "s"), - ("WCALPOS", 16, "s"), - ] - - parts = [] - for col_name, width, fmt in sdfits_spec: - val = row.get(col_name, "") - - # Format based on type - if fmt == "i": - # Integer: right-aligned - if pd.isna(val) or val == "": - formatted = str(0).rjust(width) + result = "" + last_name = _WRITER_SPEC[-1][0] + + for name, _col_fmt, row_fmt, skip_spacing in _WRITER_SPEC: + # Handle #INDEX# which is stored as INDEX in DataFrame + lookup_name = "INDEX" if name == "#INDEX#" else name + val = row.get(lookup_name) + + # Determine value type from format string and apply defaults + if "d" in row_fmt: + # Integer format + if pd.isna(val) or val is None or val == "": + val = 
0 else: - formatted = str(int(val)).rjust(width) - elif fmt == "f": + val = int(val) + elif "e" in row_fmt: # Float scientific notation - if pd.isna(val) or val == "": - formatted = "0.000000000e+00".rjust(width) - else: - formatted = f"{float(val):.9e}".rjust(width) - elif fmt == "b": - # Boolean: T/F, left-aligned - if pd.isna(val) or val == "": - formatted = "F".ljust(width) + if pd.isna(val) or val is None or val == "": + val = 0.0 else: - val_str = "T" if val else "F" - formatted = val_str.ljust(width) - elif fmt == "s": - # String: left-aligned - if pd.isna(val) or val == "": - formatted = "".ljust(width) - else: - formatted = str(val).ljust(width) + val = float(val) + elif pd.isna(val) or val is None: + # String format, missing value + val = "" else: - formatted = str(val).ljust(width) + # String format + val = str(val) + + result += row_fmt % val - # Truncate if too long - formatted = formatted[:width] - parts.append(formatted) - parts.append(" ") # Space between columns + # Spacing logic matching sparrow3's initRowString: + # - skip_spacing columns (#INDEX#, ROW): add space only when int value < 1e6 + # - other columns: always add space (except the last column) + if name != last_name: + if not skip_spacing or (isinstance(val, int) and val < 1000000): + result += " " - return "".join(parts).rstrip() # Remove trailing space + return result def validate_index(fits_path: str | Path, index_path: str | Path) -> bool: diff --git a/src/dysh/fits/tests/test_index_file.py b/src/dysh/fits/tests/test_index_file.py index 92a54eedc..6971a9124 100644 --- a/src/dysh/fits/tests/test_index_file.py +++ b/src/dysh/fits/tests/test_index_file.py @@ -10,6 +10,11 @@ from dysh.fits.index_file import ( IndexMetadata, + _generate_rows_header, + _get_center_frequency, + _get_polarization, + _get_procedure, + _translate_boolean, convert_dysh_to_sdfits_index, convert_sdfits_index_to_dysh, create_index_metadata, @@ -162,7 +167,6 @@ def test_read_nonexistent_index(self, tmp_path): class 
TestWriteIndex: """Tests for write_index()""" - @pytest.mark.skip(reason="Write functionality not currently needed") def test_write_and_read_roundtrip(self, tmp_path, sample_metadata, sample_dataframe): """Test writing and reading back produces same data.""" index_path = tmp_path / "test.index" @@ -188,10 +192,16 @@ def test_write_and_read_roundtrip(self, tmp_path, sample_metadata, sample_datafr for col in sample_dataframe.columns: assert col in df_read.columns, f"Column {col} not found in read dataframe" - # Check a few specific values - if "SCAN" in df_read.columns: + # Check integer values survive roundtrip + pd.testing.assert_series_equal(df_read["SCAN"], sample_dataframe["SCAN"], check_names=False, check_dtype=False) + pd.testing.assert_series_equal( + df_read["IFNUM"], sample_dataframe["IFNUM"], check_names=False, check_dtype=False + ) + + # Check float values survive roundtrip (within scientific notation precision) + for col in ("AZIMUTH", "ELEVATION", "CENTFREQ", "EXPOSURE"): pd.testing.assert_series_equal( - df_read["SCAN"], sample_dataframe["SCAN"], check_names=False, check_dtype=False + df_read[col], sample_dataframe[col], check_names=False, check_dtype=False, rtol=1e-8 ) def test_write_creates_proper_format(self, tmp_path, sample_metadata, sample_dataframe): @@ -208,11 +218,11 @@ def test_write_creates_proper_format(self, tmp_path, sample_metadata, sample_dat assert any("[rows]" in line for line in lines) assert any("#INDEX#" in line for line in lines) - # Check header lines are padded to 200 chars + # Check header lines are padded to 256 chars (matching sparrow3) for i, line in enumerate(lines[1:9]): # Header field lines if "=" in line: - # Should be padded (newline adds 1 char, so 201 total) - assert len(line) == 201, f"Line {i + 2} not properly padded: {len(line)} chars" + # Should be padded (newline adds 1 char, so 257 total) + assert len(line) == 257, f"Line {i + 2} not properly padded: {len(line)} chars" def test_write_scientific_notation(self, 
tmp_path, sample_metadata, sample_dataframe): """Test that floats are written in scientific notation.""" @@ -246,6 +256,188 @@ def test_write_boolean_as_TF(self, tmp_path, sample_metadata, sample_dataframe): break +class TestDerivedColumns: + """Tests for derived column computation functions""" + + def test_get_polarization(self): + """Test CRVAL4 → polarization string mapping.""" + assert _get_polarization(1) == "I" + assert _get_polarization(2) == "Q" + assert _get_polarization(3) == "U" + assert _get_polarization(4) == "V" + assert _get_polarization(-1) == "RR" + assert _get_polarization(-2) == "LL" + assert _get_polarization(-5) == "XX" + assert _get_polarization(-6) == "YY" + assert _get_polarization(99) == "??" + + def test_get_center_frequency(self): + """Test CENTFREQ computation matches sparrow3.""" + # centerChan = (numchn / 2.0) - 0.5 + # centerFreq = ((centerChan - crpix1) * cdelt1) + crval1 + crval1 = 1.42e9 + crpix1 = 512.0 + cdelt1 = 1000.0 + numchn = 1024 + + expected_center_chan = (1024 / 2.0) - 0.5 # = 511.5 + expected = ((expected_center_chan - 512.0) * 1000.0) + 1.42e9 # = 1.42e9 - 500.0 + result = _get_center_frequency(crval1, crpix1, cdelt1, numchn) + assert result == pytest.approx(expected, rel=1e-12) + + def test_get_center_frequency_not_crval1(self): + """Test that CENTFREQ differs from CRVAL1 when CRPIX1 is not at center.""" + crval1 = 1.0e9 + crpix1 = 0.0 # Reference pixel at edge, not center + cdelt1 = 1000.0 + numchn = 1024 + + result = _get_center_frequency(crval1, crpix1, cdelt1, numchn) + # centerChan = 511.5, so centerFreq = (511.5 - 0) * 1000 + 1e9 + assert result != crval1 # Should NOT be the same as CRVAL1 + assert result == pytest.approx(1.0e9 + 511500.0, rel=1e-12) + + def test_get_procedure(self): + """Test OBSMODE → PROCEDURE extraction.""" + assert _get_procedure("OnOff:Nod") == "OnOff" + assert _get_procedure("Track") == "Track" + assert _get_procedure("") == "" + assert _get_procedure("OffOn:PSWITCHOFF:1:2") == "OffOn" + + 
def test_translate_boolean(self): + """Test boolean T/F conversion.""" + assert _translate_boolean(True) == "T" + assert _translate_boolean(False) == "F" + assert _translate_boolean("T") == "T" + assert _translate_boolean("F") == "F" + assert _translate_boolean("True") == "T" + assert _translate_boolean(1) == "T" + assert _translate_boolean(0) == "F" + + +class TestSparrow3FormatCompat: + """Tests for sparrow3/GBTIDL format compatibility""" + + def test_header_line_padding_256(self, tmp_path, sample_metadata, sample_dataframe): + """Test that header lines are padded to 256 chars (matching sparrow3).""" + index_path = tmp_path / "test.index" + write_index(index_path, sample_metadata, sample_dataframe) + + with open(index_path) as f: + lines = f.readlines() + + for line in lines[1:9]: # Header key=value lines + if "=" in line: + # Content should be 256 chars + newline = 257 + assert len(line) == 257, f"Header line not padded to 256: {len(line) - 1} chars" + + def test_generated_header_matches_sparrow3(self): + """Test that generated column header row matches sparrow3's format.""" + header = _generate_rows_header() + + # Should start with #INDEX# (7 chars, no trailing space) + assert header.startswith("#INDEX#") + + # Verify truncated column names appear correctly + # EXTENSION → EXT (truncated by %3.3s) + assert " EXT" in header + # POLARIZATION → POL (truncated by %3.3s) + assert " POL " in header + # E2ESCAN → E2ESC (truncated by %5.5s) + assert " E2ESC " in header + # PROCSEQN → PROCS (truncated by %5.5s) + assert " PROCS " in header + # SUBREF → SUB (truncated by %3.3s) + assert " SUB " in header + + # Verify non-truncated names + assert "ELEVATION" in header + assert "CENTFREQ" in header + assert "DATEOBS" in header + + def test_write_with_dysh_column_names(self, tmp_path, sample_metadata): + """Test writing a DataFrame with dysh/FITS column names.""" + df = pd.DataFrame( + { + "INDEX": [0], + "PROJID": ["TestProject"], + "HDU": [1], + "OBJECT": ["W3OH"], + 
"SCAN": [100], + "CRVAL4": [-5], # XX polarization + "CRVAL1": [1.42e9], + "CRPIX1": [512.0], + "CDELT1": [1000.0], + "NUMCHN": [1024], + "CRVAL2": [45.72], + "CRVAL3": [-10.3], + "ELEVATIO": [30.75], + "OBSMODE": ["OnOff:Nod"], + "SIG": [True], + "CAL": [False], + } + ) + index_path = tmp_path / "test.index" + write_index(index_path, sample_metadata, df) + + # Read back and verify derived columns + _metadata, df_read = read_index(index_path) + + # POLARIZATION should be derived from CRVAL4=-5 → XX + assert "POL" in df_read.columns + assert df_read["POL"].iloc[0] == "XX" + + # CENTFREQ should be computed (not just CRVAL1) + # centerChan = (1024/2) - 0.5 = 511.5 + # centerFreq = ((511.5 - 512.0) * 1000.0) + 1.42e9 + expected_centfreq = _get_center_frequency(1.42e9, 512.0, 1000.0, 1024) + assert df_read["CENTFREQ"].iloc[0] == pytest.approx(expected_centfreq, rel=1e-8) + + # PROCEDURE should be extracted from OBSMODE + assert df_read["PROCEDURE"].iloc[0] == "OnOff" + + # SOURCE should be mapped from OBJECT + assert df_read["SOURCE"].iloc[0] == "W3OH" + + # EXT should be mapped from HDU + assert df_read["EXT"].iloc[0] == 1 + + # ELEVATION should be mapped from ELEVATIO + assert df_read["ELEVATION"].iloc[0] == pytest.approx(30.75) + + # SIG/CAL should be T/F booleans + assert df_read["SIG"].iloc[0] is True # read_index converts T→True + assert df_read["CAL"].iloc[0] is False # read_index converts F→False + + def test_nsave_defaults_to_minus_one(self, tmp_path, sample_metadata, sample_dataframe): + """Test that NSAVE defaults to -1 (matching sparrow3).""" + index_path = tmp_path / "test.index" + write_index(index_path, sample_metadata, sample_dataframe) + + _, df_read = read_index(index_path) + assert all(df_read["NSAVE"] == -1) + + def test_dateobs_timestamp_width_22(self, tmp_path, sample_metadata): + """Test DATEOBS and TIMESTAMP columns use 22-char width (not 21).""" + df = pd.DataFrame( + { + "INDEX": [0], + "SCAN": [1], + "DATEOBS": ["2013_11_05_16:15:21.35"], # 22 
chars + "TIMESTAMP": ["2013_11_05_16:15:21"], # shorter, right-justified in 22 + } + ) + index_path = tmp_path / "test.index" + write_index(index_path, sample_metadata, df) + + with open(index_path) as f: + content = f.read() + + # The date should be present (right-justified in 22-char field) + assert "2013_11_05_16:15:21.35" in content + assert "2013_11_05_16:15:21" in content + + class TestValidateIndex: """Tests for validate_index()""" @@ -545,3 +737,182 @@ def test_write_compatible_with_sdfits_index_format(self, tmp_path, sample_index_ # Should have same structure assert len(df_new) == len(df_orig) assert set(df_new.columns) == set(df_orig.columns) + + +def _get_rows_section(filepath): + """Extract [rows] section (column header + data rows) from an index file. + + Returns list of lines with trailing newlines/spaces stripped. + Skips the [header] section entirely, returning only the column header + and data rows for format comparison. + """ + with open(filepath) as f: + lines = f.readlines() + rows_start = None + for i, line in enumerate(lines): + if line.strip() == "[rows]": + rows_start = i + 1 + break + if rows_start is None: + raise ValueError("No [rows] section found") + return [line.rstrip("\n") for line in lines[rows_start:] if line.strip()] + + +def _sparrow3_base_row(): + """Return a dict of canonical SDFITS index column values matching + the sparrow3 IndexWriterTests test data. 
+ + Values are pre-computed to match sparrow3's translateInfo() output: + - CENTFREQ = ((1024/2 - 0.5) - 512) * 1.0 + 0.0 = -0.5 + - POLARIZATION = XX (from CRVAL4=-5) + - PROCEDURE = obsmode (from OBSMODE="obsmode", no colon) + - SIG = T, CAL = T (sparrow3's translateBoolean treats string "F" as truthy; + we use the same expected output values for format comparison) + """ + return { + "PROJECT": "projectA", + "FILE": "filepath", + "EXTENSION": 1, + "SOURCE": "object name", + "PROCEDURE": "obsmode", + "OBSID": "obsid", + "E2ESCAN": 0, + "PROCSEQN": 5, + "SCAN": 100, + "POLARIZATION": "XX", + "PLNUM": 1, + "IFNUM": 3, + "FEED": 4, + "FDNUM": 1, + "INT": 10, + "NUMCHN": 1024, + "SIG": "T", + "CAL": "T", + "SAMPLER": "A1_0", + "AZIMUTH": 120.5, + "ELEVATION": 30.75, + "LONGITUDE": 45.72, + "LATITUDE": -10.3, + "TRGTLONG": 45.8, + "TRGTLAT": -10.5, + "SUBREF": 1, + "LST": 1234.56, + "CENTFREQ": -0.5, # computed from CRVAL1=0, CRPIX1=512, CDELT1=1, NUMCHN=1024 + "RESTFREQ": 1420405800.0, + "VELOCITY": 0.0, + "FREQINT": 1.0, + "FREQRES": 1.0, + "DATEOBS": "2013_11_05_16:15:21.35", + "TIMESTAMP": "2013_11_05_16:15:21", + "BANDWIDTH": 1024.0, + "EXPOSURE": 2.0, + "TSYS": 20.0, + "NSAVE": -1, + "PROCSCAN": 10, + "PROCTYPE": "unknown", + "WCALPOS": "Unknown", + } + + +# Path to sparrow3 expected test data +_SPARROW3_DATA_DIR = ( + Path(__file__).parent.parent.parent.parent.parent / ".context" / "sparrow3" / "gbt" / "api" / "sdfits" / "data" +) + + +class TestSparrow3Ported: + """Tests ported from sparrow3 IndexWriterTests. + + These compare dysh-written data rows character-for-character against + sparrow3's .index.expected reference files to verify exact format + compatibility with GBTIDL. + """ + + def _write_test_index(self, tmp_path, num_rows, start_index=0, start_row=0): + """Helper: write an index file with the sparrow3 test data. + + Returns the path to the written file. 
+ """ + base = _sparrow3_base_row() + rows = [] + for i in range(num_rows): + row = dict(base) + row["INDEX"] = start_index + i + row["ROW"] = start_row + i + rows.append(row) + + df = pd.DataFrame(rows) + metadata = IndexMetadata( + created="Mon Mar 23 14:42:48 2015", + last_modified="Mon Mar 23 14:42:48 2015", + ) + index_path = tmp_path / "test.index" + write_index(index_path, metadata, df) + return index_path + + def test_basics_matches_sparrow3(self, tmp_path): + """Port of sparrow3 IndexWriterTests.testBasics. + + Write 3 rows with small INDEX/ROW values and verify data rows + match Basics.index.expected character-for-character. + """ + expected_path = _SPARROW3_DATA_DIR / "Basics.index.expected" + if not expected_path.exists(): + pytest.skip(f"sparrow3 expected file not found: {expected_path}") + + index_path = self._write_test_index(tmp_path, num_rows=3, start_index=0, start_row=0) + + actual = _get_rows_section(index_path) + expected = _get_rows_section(expected_path) + + # Column header should match exactly + assert actual[0] == expected[0], "Column header row does not match sparrow3" + + # Data rows should match exactly + assert len(actual) == len(expected), f"Row count mismatch: {len(actual) - 1} vs {len(expected) - 1} data rows" + for i, (act, exp) in enumerate(zip(actual[1:], expected[1:], strict=True)): + assert act == exp, f"Data row {i} does not match sparrow3 expected output" + + def test_long_row_matches_sparrow3(self, tmp_path): + """Port of sparrow3 IndexWriterTests.testLongRow. + + Write 3 rows where ROW crosses the 1e6 boundary (999999, 1000000, 1000001). + Tests the skip-spacing behavior: ROW < 1e6 gets a trailing space, + ROW >= 1e6 does not (the extra digit fills the space). 
+ """ + expected_path = _SPARROW3_DATA_DIR / "LongRow.index.expected" + if not expected_path.exists(): + pytest.skip(f"sparrow3 expected file not found: {expected_path}") + + index_path = self._write_test_index(tmp_path, num_rows=3, start_index=0, start_row=999999) + + actual = _get_rows_section(index_path) + expected = _get_rows_section(expected_path) + + assert actual[0] == expected[0], "Column header row does not match sparrow3" + assert len(actual) == len(expected) + for i, (act, exp) in enumerate(zip(actual[1:], expected[1:], strict=True)): + assert act == exp, f"Data row {i} (ROW={999999 + i}) does not match sparrow3 expected output" + + def test_long_index_and_row_matches_sparrow3(self, tmp_path): + """Port of sparrow3 IndexWriterTests.testLongIndexAndRow. + + Write 5 rows where both INDEX and ROW cross the 1e6 boundary independently. + INDEX starts at 999999 (crosses at row 2), ROW starts at 999997 (crosses at row 4). + Tests skip-spacing for both columns simultaneously. + """ + expected_path = _SPARROW3_DATA_DIR / "LongIndexAndRow.index.expected" + if not expected_path.exists(): + pytest.skip(f"sparrow3 expected file not found: {expected_path}") + + index_path = self._write_test_index(tmp_path, num_rows=5, start_index=999999, start_row=999997) + + actual = _get_rows_section(index_path) + expected = _get_rows_section(expected_path) + + assert actual[0] == expected[0], "Column header row does not match sparrow3" + assert len(actual) == len(expected) + for i, (act, exp) in enumerate(zip(actual[1:], expected[1:], strict=True)): + assert act == exp, ( + f"Data row {i} (INDEX={999999 + i}, ROW={999997 + i}) does not match sparrow3 expected output" + ) From 6deff40bdf749cd0f687ed7b0cb1e7f08d0b806b Mon Sep 17 00:00:00 2001 From: Thomas Chamberlin Date: Thu, 12 Mar 2026 19:27:10 +0000 Subject: [PATCH 2/9] Add multi-file VEGAS index write tests Test writing index files with heterogeneous multi-bank VEGAS data: - Synthetic 16-row test with 4 banks, 2 scans, 2 
polarizations, varying FILE/EXTENSION/CRVAL4/OBSMODE/CENTFREQ across rows - Real VEGAS data roundtrip using AGBT18B_354_03 (4 banks, 128 rows) Co-Authored-By: Claude Opus 4.6 --- src/dysh/fits/tests/test_index_file.py | 159 +++++++++++++++++++++++++ 1 file changed, 159 insertions(+) diff --git a/src/dysh/fits/tests/test_index_file.py b/src/dysh/fits/tests/test_index_file.py index 6971a9124..ccbf43efb 100644 --- a/src/dysh/fits/tests/test_index_file.py +++ b/src/dysh/fits/tests/test_index_file.py @@ -916,3 +916,162 @@ def test_long_index_and_row_matches_sparrow3(self, tmp_path): assert act == exp, ( f"Data row {i} (INDEX={999999 + i}, ROW={999997 + i}) does not match sparrow3 expected output" ) + + +class TestMultiFileVegas: + """Tests for writing index data from multi-file VEGAS observations. + + Real VEGAS data has multiple FITS files (one per bank: A, B, C, D), + each with different IFNUMs, polarizations, and frequencies. The index + must handle varying FILE, EXTENSION, POLARIZATION, PROCEDURE, CENTFREQ, + and other columns across rows. 
+ """ + + def test_write_multifile_roundtrip(self, tmp_path): + """Test writing and reading an index with multi-file VEGAS-like data.""" + # Simulate 4 VEGAS banks (A, B, C, D) with 2 scans, 2 pols each + rows = [] + files = [ + "AGBT18B_354_03.raw.vegas.A.fits", + "AGBT18B_354_03.raw.vegas.B.fits", + "AGBT18B_354_03.raw.vegas.C.fits", + "AGBT18B_354_03.raw.vegas.D.fits", + ] + # Each bank has different CRVAL1/CRPIX1/CDELT1 (different tunings) + bank_freqs = [ + (1.40e9, 512.0, 1464.84375), # Bank A + (1.42e9, 512.0, 1464.84375), # Bank B + (1.44e9, 512.0, 1464.84375), # Bank C + (1.46e9, 512.0, 1464.84375), # Bank D + ] + idx = 0 + for bank_i, (filename, (crval1, crpix1, cdelt1)) in enumerate(zip(files, bank_freqs, strict=True)): + for scan in (6, 7): + for crval4 in (-1, -2): # RR, LL polarizations + rows.append( + { + "INDEX": idx, + "FILE": filename, + "EXTENSION": 1, + "ROW": idx, + "OBJECT": "W49N", + "OBSMODE": "OffOn:PSWITCHON:TPWCAL" if scan == 6 else "OffOn:PSWITCHOFF:TPWCAL", + "OBSID": "Observation:1", + "PROJID": "AGBT18B_354_03", + "SCAN": scan, + "CRVAL4": crval4, + "CRVAL1": crval1, + "CRPIX1": crpix1, + "CDELT1": cdelt1, + "NUMCHN": 1024, + "PLNUM": 0 if crval4 == -1 else 1, + "IFNUM": bank_i, + "FEED": 1, + "FDNUM": 0, + "INTNUM": 0, + "PROCSEQN": 1, + "SIG": True, + "CAL": False, + "SAMPLER": f"A{bank_i}_0", + "AZIMUTH": 202.26, + "ELEVATIO": 66.45, + "CRVAL2": 287.755, + "CRVAL3": 14.136, + "TRGTLONG": 287.755, + "TRGTLAT": 14.136, + "SUBREF_STATE": 1, + "LST": 45123.0, + "RESTFREQ": 1.42040575e9, + "VELOCITY": 0.0, + "FREQRES": 1464.84375, + "DATE-OBS": "2018-12-15T06:12:00.00", + "TIMESTAMP": "2018_12_15_06:12:00", + "BANDWID": 1.5e6, + "EXPOSURE": 30.0, + "TSYS": 25.0, + } + ) + idx += 1 + + df = pd.DataFrame(rows) + metadata = create_index_metadata(observer="Test", backend="VEGAS") + + # Write + index_path = tmp_path / "test.index" + write_index(index_path, metadata, df) + + # Read back + _read_metadata, df_read = read_index(index_path) + + 
# Verify all 16 rows survived + assert len(df_read) == 16 + + # Verify FILE column has 4 distinct values + assert df_read["FILE"].nunique() == 4 + + # Verify POLARIZATION derived correctly from CRVAL4 + pol_values = set(df_read["POL"].unique()) + assert pol_values == {"RR", "LL"} + + # Verify PROCEDURE extracted from OBSMODE + proc_values = set(df_read["PROCEDURE"].unique()) + assert proc_values == {"OffOn"} + + # Verify CENTFREQ computed (not just CRVAL1) and varies across banks + assert df_read["CENTFREQ"].nunique() == 4 + + # Verify PROJECT mapped from PROJID + assert all(df_read["PROJECT"] == "AGBT18B_354_03") + + # Verify SOURCE mapped from OBJECT + assert all(df_read["SOURCE"] == "W49N") + + # Verify SIG/CAL booleans roundtripped + assert df_read["SIG"].all() + assert not df_read["CAL"].any() + + def test_write_real_vegas_data(self, testdata_dir, tmp_path): + """Test writing an index from real multi-bank VEGAS FITS data.""" + vegas_dir = testdata_dir / "AGBT18B_354_03" / "AGBT18B_354_03.raw.vegas" + if not vegas_dir.exists(): + pytest.skip(f"VEGAS testdata not found: {vegas_dir}") + + from dysh.fits.gbtfitsload import GBTFITSLoad + + loader = GBTFITSLoad(str(vegas_dir)) + df = loader._selection + + metadata = create_index_metadata( + observer=df["OBSERVER"].iloc[0] if "OBSERVER" in df.columns else "Unknown", + backend=df["BACKEND"].iloc[0] if "BACKEND" in df.columns else "VEGAS", + ) + + # Write index + index_path = tmp_path / "AGBT18B_354_03.raw.vegas.index" + write_index(index_path, metadata, df) + + # Read it back + _meta, df_read = read_index(index_path) + + # Should have all rows + assert len(df_read) == len(df) + + # Should have all 43 standard columns + assert len(df_read.columns) == 43 + + # Key columns should survive roundtrip + assert df_read["SCAN"].nunique() == df["SCAN"].nunique() + assert df_read["IFNUM"].nunique() == df["IFNUM"].nunique() + + # POLARIZATION should be derived from CRVAL4 + if "CRVAL4" in df.columns: + assert "POL" in 
df_read.columns + expected_pols = set(df["CRVAL4"].apply(_get_polarization).unique()) + actual_pols = set(df_read["POL"].unique()) + assert actual_pols == expected_pols + + # Written file should be parseable by parse_sdfits_index_file too + from dysh.fits.index_file import parse_sdfits_index_file + + df_parsed = parse_sdfits_index_file(index_path) + assert len(df_parsed) == len(df) From 789a04b43bdf5bcad5fb2036ede20af9ccec6b5e Mon Sep 17 00:00:00 2001 From: Thomas Chamberlin Date: Thu, 12 Mar 2026 20:04:48 +0000 Subject: [PATCH 3/9] Port sparrow3 VEGAS multi-bank index test (8 banks, 32 rows) Adds character-for-character verification against sparrow3's test.banks.vegas.raw.ints.index.expected for 8 VEGAS bank files (A-H) with varying FILE, POL, SAMPLER, FEED, IFNUM, coordinates, CENTFREQ, CAL, and EXPOSURE across 32 rows. Co-Authored-By: Claude Opus 4.6 --- src/dysh/fits/tests/test_index_file.py | 199 +++++++++++++++++++++++++ 1 file changed, 199 insertions(+) diff --git a/src/dysh/fits/tests/test_index_file.py b/src/dysh/fits/tests/test_index_file.py index ccbf43efb..7d345a1ef 100644 --- a/src/dysh/fits/tests/test_index_file.py +++ b/src/dysh/fits/tests/test_index_file.py @@ -917,6 +917,205 @@ def test_long_index_and_row_matches_sparrow3(self, tmp_path): f"Data row {i} (INDEX={999999 + i}, ROW={999997 + i}) does not match sparrow3 expected output" ) + def test_vegas_multibank_matches_sparrow3(self, tmp_path): + """Port of sparrow3 MPSDFITSWriterTests.testVegasIntegrationsAndIndex. + + Write 32 rows across 8 VEGAS bank files (A-H) with varying FILE, POL, + SAMPLER, FEED, FDNUM, IFNUM, coordinates, CENTFREQ, CAL, and EXPOSURE. + Verify data rows match test.banks.vegas.raw.ints.index.expected + character-for-character. 
+ """ + expected_path = _SPARROW3_DATA_DIR / "test.banks.vegas.raw.ints.index.expected" + if not expected_path.exists(): + pytest.skip(f"sparrow3 expected file not found: {expected_path}") + + # Bank configs: (file, ifnum, feed, fdnum, centfreq, sampler_ll, sampler_rr, az, el, lon, lat) + banks = [ + ( + "test.banks.vegas.raw.ints.A.fits", + 0, + 4, + 3, + 2.370934760e10, + "A9_0", + "A13_0", + 1.448764788e02, + 1.379075952e01, + 2.668584627e02, + -2.835169168e01, + ), + ( + "test.banks.vegas.raw.ints.B.fits", + 1, + 5, + 4, + 2.370937201e10, + "B17_0", + "B21_0", + 1.448529791e02, + 1.380393730e01, + 2.668730975e02, + -2.832869620e01, + ), + ( + "test.banks.vegas.raw.ints.C.fits", + 0, + 3, + 2, + 2.370934760e10, + "C25_0", + "C29_0", + 1.448764761e02, + 1.376440397e01, + 2.668737744e02, + -2.837434610e01, + ), + ( + "test.banks.vegas.raw.ints.D.fits", + 0, + 1, + 0, + 2.370934760e10, + "D33_0", + "D37_0", + 1.448529791e02, + 1.377758174e01, + 2.668884105e02, + -2.835134734e01, + ), + ( + "test.banks.vegas.raw.ints.E.fits", + 0, + 2, + 1, + 2.370934760e10, + "E10_0", + "E14_0", + 1.448529791e02, + 1.375122619e01, + 2.669037308e02, + -2.837399619e01, + ), + ( + "test.banks.vegas.raw.ints.F.fits", + 2, + 6, + 5, + 2.370937201e10, + "F18_0", + "F22_0", + 1.448294795e02, + 1.379075952e01, + 2.669030393e02, + -2.832834597e01, + ), + ( + "test.banks.vegas.raw.ints.G.fits", + 2, + 7, + 6, + 2.370937201e10, + "G26_0", + "G30_0", + 1.448294821e02, + 1.376440397e01, + 2.669183618e02, + -2.835099279e01, + ), + ( + "test.banks.vegas.raw.ints.H.fits", + 2, + 1, + 0, + 2.414234760e10, + "H34_0", + "H38_0", + 1.448529791e02, + 1.377758174e01, + 2.668884105e02, + -2.835134734e01, + ), + ] + + trgtlong = 2.668918438e02 + trgtlat = -2.835127778e01 + + rows = [] + idx = 0 + for filename, ifnum, feed, fdnum, centfreq, sampler_ll, sampler_rr, az, el, lon, lat in banks: + row_in_file = 0 + for pol, plnum, sampler in [("LL", 0, sampler_ll), ("RR", 1, sampler_rr)]: + for cal, exposure in 
[("T", 4.949683249e-01), ("F", 3.954508901e-01)]: + rows.append( + { + "INDEX": idx, + "PROJECT": "KFPA", + "FILE": filename, + "EXTENSION": 1, + "ROW": row_in_file, + "SOURCE": "SGRB2", + "PROCEDURE": "RALongMap", + "OBSID": "unknown", + "E2ESCAN": 0, + "PROCSEQN": 22, + "SCAN": 34, + "POLARIZATION": pol, + "PLNUM": plnum, + "IFNUM": ifnum, + "FEED": feed, + "FDNUM": fdnum, + "INT": 1, + "NUMCHN": 4096, + "SIG": "T", + "CAL": cal, + "SAMPLER": sampler, + "AZIMUTH": az, + "ELEVATION": el, + "LONGITUDE": lon, + "LATITUDE": lat, + "TRGTLONG": trgtlong, + "TRGTLAT": trgtlat, + "SUBREF": 1, + "LST": 5.462375752e04, + "CENTFREQ": centfreq, + "RESTFREQ": 2.370629500e10, + "VELOCITY": 0.0, + "FREQINT": -1.220703125e04, + "FREQRES": 1.477050781e04, + "DATEOBS": "2010-04-08T07:23:56.00", + "TIMESTAMP": "2010_04_08_07:23:55", + "BANDWIDTH": 5.000000000e07, + "EXPOSURE": exposure, + "TSYS": 1.0, + "NSAVE": -1, + "PROCSCAN": "Unknown", + "PROCTYPE": "MAP", + "WCALPOS": "Unknown", + } + ) + idx += 1 + row_in_file += 1 + + df = pd.DataFrame(rows) + metadata = IndexMetadata( + created="Tue Jul 26 12:43:16 2016", + last_modified="Tue Jul 26 12:43:16 2016", + created_by="index_writer", + ) + index_path = tmp_path / "test.index" + write_index(index_path, metadata, df) + + actual = _get_rows_section(index_path) + expected = _get_rows_section(expected_path) + + # Column header should match exactly + assert actual[0] == expected[0], "Column header row does not match sparrow3" + + # All 32 data rows should match exactly + assert len(actual) == len(expected), f"Row count mismatch: {len(actual) - 1} vs {len(expected) - 1} data rows" + for i, (act, exp) in enumerate(zip(actual[1:], expected[1:], strict=True)): + assert act == exp, f"VEGAS multi-bank data row {i} does not match sparrow3 expected output" + class TestMultiFileVegas: """Tests for writing index data from multi-file VEGAS observations. 
From 8e130b2f536aba340233f9bb4690a7671912de60 Mon Sep 17 00:00:00 2001 From: Thomas Chamberlin Date: Wed, 18 Mar 2026 17:47:52 +0000 Subject: [PATCH 4/9] feat: add write_index param to GBTFITSLoad.write() and use alphabetic multi-file naming Addresses feedback on PR #1047: integrate index file writing into the SDFITS write path and change multi-file naming to match GBTIDL/sparrow3 conventions. - Add write_index=False parameter to write(). When True, writes sparrow3-compatible .index files alongside each output FITS file. For multifile writes with >1 file, also writes a parent directory index that aggregates all per-file indices. - Change multi-file naming from numeric (0, 1, 2) to alphabetic (A, B, C) with dot separator: output.A.fits, output.B.fits, etc. This matches the GBTIDL/VEGAS convention and produces correct index filenames via get_index_path() (output.A.index, etc.). - Add helper methods: _build_index_metadata(), _build_and_write_index(), _build_and_write_parent_index(), _multifile_name() - Add 8 new tests in TestWriteIndex covering alphabetic naming, single file index, multifile indices, parent index, selection filtering, non-multifile mode, roundtrip, and row count verification. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/dysh/fits/gbtfitsload.py | 151 ++++++++++++++++++++-- src/dysh/fits/tests/test_gbtfitsload.py | 159 ++++++++++++++++++++++++ 2 files changed, 302 insertions(+), 8 deletions(-) diff --git a/src/dysh/fits/gbtfitsload.py b/src/dysh/fits/gbtfitsload.py index b70990ca2..83b6d5ab2 100644 --- a/src/dysh/fits/gbtfitsload.py +++ b/src/dysh/fits/gbtfitsload.py @@ -60,6 +60,7 @@ from ..util.selection import Flag, Selection # noqa: F811 from ..util.weatherforecast import GBTWeatherForecast from . 
import conf, core +from .index_file import create_index_metadata, get_index_path, write_index from .sdfitsload import FITSBackend, SDFITSLoad, _log_mem try: @@ -3849,6 +3850,7 @@ def write( fileobj, multifile=True, flags=True, + write_index=False, verbose=False, output_verify="exception", overwrite=False, @@ -3871,6 +3873,9 @@ def write( Otherwise, write to a single SDFITS file. flags: bool, optional If True, write the applied flags to a `FLAGS` column in the binary table. + write_index: bool, optional + If True, write a sparrow3/GBTIDL-compatible .index file alongside each output FITS file. + When multifile=True and multiple files are written, also writes a parent directory index. verbose: bool, optional If True, print out some information about number of rows written per file output_verify : str @@ -3892,11 +3897,121 @@ def write( e.g., `ifnum=1, plnum=[2,3]` etc. """ if HAS_FITSIO: - self._write_fitsio(fileobj, multifile, flags, verbose, overwrite, **kwargs) + self._write_fitsio(fileobj, multifile, flags, write_index, verbose, overwrite, **kwargs) else: - self._write_astropy(fileobj, multifile, flags, verbose, output_verify, overwrite, checksum, **kwargs) + self._write_astropy( + fileobj, multifile, flags, write_index, verbose, output_verify, overwrite, checksum, **kwargs + ) + + def _build_index_metadata(self, df): + """Extract observer/backend from index DataFrame and create IndexMetadata.""" + observer = "Unknown" + if "OBSERVER" in df.columns: + obs_vals = df["OBSERVER"].dropna().unique() + if len(obs_vals) > 0: + observer = str(obs_vals[0]) + backend = "Unknown" + if "BACKEND" in df.columns: + be_vals = df["BACKEND"].dropna().unique() + if len(be_vals) > 0: + backend = str(be_vals[0]) + return create_index_metadata(observer=observer, backend=backend) + + def _build_and_write_index(self, outfile, per_file_df, metadata, overwrite): + """Build and write a .index file for a single output FITS file. 
+ + Parameters + ---------- + outfile : str or Path + Path to the output FITS file + per_file_df : pd.DataFrame + Index rows for this file (already filtered by FITSINDEX if applicable) + metadata : IndexMetadata + Pre-constructed metadata + overwrite : bool + Whether to overwrite existing index files + + Returns + ------- + pd.DataFrame + The prepared index DataFrame (for use in parent index aggregation) + """ + outfile = Path(outfile) + index_path = get_index_path(outfile) + if index_path.exists() and not overwrite: + raise OSError(f"Index file already exists: {index_path}") + + # Renumber ROW and HDU/BINTABLE for the output file + index_rows = [] + bintables = sorted(per_file_df.BINTABLE.unique()) + new_extension = 1 + for b in bintables: + bt_rows = per_file_df[per_file_df.BINTABLE == b].sort_values("ROW") + for new_row_idx, (_, row) in enumerate(bt_rows.iterrows()): + row_copy = row.copy() + row_copy["ROW"] = new_row_idx + row_copy["HDU"] = new_extension + row_copy["BINTABLE"] = new_extension - 1 + index_rows.append(row_copy) + new_extension += 1 + + index_df = pd.DataFrame(index_rows).reset_index(drop=True) + index_df["INDEX"] = range(len(index_df)) + index_df["FILE"] = outfile.name + + # Remove internal columns not part of the standard index format + for col in ["FITSINDEX", "CHAN"]: + if col in index_df.columns: + index_df = index_df.drop(columns=col) + + write_index(index_path, metadata, index_df) + return index_df + + def _build_and_write_parent_index(self, fileobj, per_file_dfs, metadata, overwrite): + """Write a parent directory index aggregating all per-file indices. 
- def _write_fitsio(self, fileobj, multifile, flags, verbose, overwrite, **kwargs): + Parameters + ---------- + fileobj : str or Path + The base output file path (e.g., 'output.fits'); used to derive parent index name + per_file_dfs : list of pd.DataFrame + List of prepared per-file index DataFrames (from _build_and_write_index) + metadata : IndexMetadata + Pre-constructed metadata + overwrite : bool + Whether to overwrite existing index files + """ + fileobj = Path(fileobj) + parent_index_path = get_index_path(fileobj) + if parent_index_path.exists() and not overwrite: + raise OSError(f"Parent index file already exists: {parent_index_path}") + + combined_df = pd.concat(per_file_dfs, ignore_index=True) + combined_df["INDEX"] = range(len(combined_df)) + write_index(parent_index_path, metadata, combined_df) + + @staticmethod + def _multifile_name(fileobj, count): + """Generate output filename for multi-file writes using alphabetic suffixes. + + Parameters + ---------- + fileobj : str or Path + Base output file path + count : int + File index (0-based, maps to A, B, C, ...) + + Returns + ------- + Path + Output path with alphabetic suffix (e.g., 'output.A.fits') + """ + if count > 25: + raise ValueError("Cannot write more than 26 multi-file outputs with alphabetic suffixes") + p = Path(fileobj) + return p.parent / (p.stem + "." 
+ chr(ord("A") + count) + p.suffix) + + def _write_fitsio(self, fileobj, multifile, flags, write_index_file, verbose, overwrite, **kwargs): """Write using the fitsio chunked path (memory-efficient for large files).""" chunk_size = kwargs.pop("chunk_size", 5000) logger.debug(kwargs) @@ -3912,8 +4027,11 @@ def _write_fitsio(self, fileobj, multifile, flags, verbose, overwrite, **kwargs) fi = _final["FITSINDEX"].unique() logger.debug(f"fitsindex {fi} ") total_rows_written = 0 + if write_index_file: + metadata = self._build_index_metadata(_final) if multifile: count = 0 + all_index_dfs = [] for k in fi: # Build bintable groups for this FITSINDEX df = select_from("FITSINDEX", k, _final) @@ -3927,8 +4045,7 @@ def _write_fitsio(self, fileobj, multifile, flags, verbose, overwrite, **kwargs) continue if len(fi) > 1: - p = Path(fileobj) - outfile = p.parent / (p.stem + str(count) + p.suffix) + outfile = self._multifile_name(fileobj, count) count += 1 else: outfile = fileobj @@ -3939,6 +4056,11 @@ def _write_fitsio(self, fileobj, multifile, flags, verbose, overwrite, **kwargs) total_rows_written += rows_written if verbose: logger.info(f"Writing {rows_written} rows to {outfile}.") + if write_index_file: + prepared_df = self._build_and_write_index(outfile, df, metadata, overwrite) + all_index_dfs.append(prepared_df) + if write_index_file and len(fi) > 1: + self._build_and_write_parent_index(fileobj, all_index_dfs, metadata, overwrite) if verbose: logger.info(f"Total of {total_rows_written} rows written to files.") else: @@ -3963,8 +4085,12 @@ def _write_fitsio(self, fileobj, multifile, flags, verbose, overwrite, **kwargs) ) if verbose: logger.info(f"Writing {total_rows_written} to {fileobj}") + if write_index_file: + self._build_and_write_index(fileobj, _final, metadata, overwrite) - def _write_astropy(self, fileobj, multifile, flags, verbose, output_verify, overwrite, checksum, **kwargs): + def _write_astropy( + self, fileobj, multifile, flags, write_index_file, verbose, 
output_verify, overwrite, checksum, **kwargs + ): """Write using the astropy path (fallback when fitsio is unavailable).""" logger.debug(kwargs) selection = Selection(self._index) @@ -3979,8 +4105,11 @@ def _write_astropy(self, fileobj, multifile, flags, verbose, output_verify, over fi = _final["FITSINDEX"].unique() logger.debug(f"fitsindex {fi} ") total_rows_written = 0 + if write_index_file: + metadata = self._build_index_metadata(_final) if multifile: count = 0 + all_index_dfs = [] for k in fi: this_rows_written = 0 hdu = self._sdf[k]._hdu[0].copy() @@ -4004,8 +4133,7 @@ def _write_astropy(self, fileobj, multifile, flags, verbose, output_verify, over total_rows_written += lr this_rows_written += lr if len(fi) > 1: - p = Path(fileobj) - outfile = p.parent / (p.stem + str(count) + p.suffix) + outfile = self._multifile_name(fileobj, count) count += 1 else: outfile = fileobj @@ -4016,6 +4144,11 @@ def _write_astropy(self, fileobj, multifile, flags, verbose, output_verify, over if verbose: logger.info(f"Writing {this_rows_written} rows to {outfile}.") outhdu.writeto(outfile, output_verify=output_verify, overwrite=overwrite, checksum=checksum) + if write_index_file: + prepared_df = self._build_and_write_index(outfile, df, metadata, overwrite) + all_index_dfs.append(prepared_df) + if write_index_file and len(fi) > 1: + self._build_and_write_parent_index(fileobj, all_index_dfs, metadata, overwrite) if verbose: logger.info(f"Total of {total_rows_written} rows written to files.") else: @@ -4048,6 +4181,8 @@ def _write_astropy(self, fileobj, multifile, flags, verbose, output_verify, over if verbose: logger.info(f"Writing {total_rows_written} to {fileobj}") outhdu.writeto(fileobj, output_verify=output_verify, overwrite=overwrite, checksum=checksum) + if write_index_file: + self._build_and_write_index(fileobj, _final, metadata, overwrite) def _update_radesys(self): """ diff --git a/src/dysh/fits/tests/test_gbtfitsload.py b/src/dysh/fits/tests/test_gbtfitsload.py index 
838da423d..889995c51 100644 --- a/src/dysh/fits/tests/test_gbtfitsload.py +++ b/src/dysh/fits/tests/test_gbtfitsload.py @@ -2061,6 +2061,165 @@ def compare_tsys_dicts(result, expected): assert "Missing system temperature for scan(s): 2,3" in str(excinfo.value) +class TestWriteIndex: + """Tests for write_index integration in GBTFITSLoad.write().""" + + def test_write_alphabetic_naming(self, tmp_path): + """Multi-file write should use A, B, C... suffixes instead of 0, 1, 2...""" + f = util.get_project_testdata() / "AGBT18B_354_03/AGBT18B_354_03.raw.vegas/" + g = gbtfitsload.GBTFITSLoad(f) + o = tmp_path / "sub" + o.mkdir() + output = o / "testmulti.fits" + g.write(output, multifile=True, scan=6, overwrite=True) + written_files = sorted(o.glob("*.fits")) + assert len(written_files) == 4 + for i, f in enumerate(written_files): + expected_letter = chr(ord("A") + i) + assert f.name == f"testmulti.{expected_letter}.fits", ( + f"Expected testmulti.{expected_letter}.fits, got {f.name}" + ) + + def test_write_index_default_false(self, tmp_path): + """write_index=False (default) should not create any .index files.""" + f = util.get_project_testdata() / "AGBT18B_354_03/AGBT18B_354_03.raw.vegas/" + g = gbtfitsload.GBTFITSLoad(f) + o = tmp_path / "sub" + o.mkdir() + output = o / "testmulti.fits" + g.write(output, multifile=True, scan=6, overwrite=True) + index_files = list(o.glob("*.index")) + assert len(index_files) == 0 + + def test_write_index_single_file(self, tmp_path): + """Writing a single file with write_index=True should create one .index file.""" + p = util.get_project_testdata() / "AGBT20B_014_03.raw.vegas" + data_file = p / "AGBT20B_014_03.raw.vegas.A6.fits" + sdf = gbtfitsload.GBTFITSLoad(data_file, index_file_threshold=1000000000) + d = tmp_path / "sub" + d.mkdir() + output = d / "test_single.fits" + sdf.write(output, write_index=True, overwrite=True, flags=False) + + # Verify index file was created + index_path = d / "test_single.index" + assert 
index_path.exists() + + # Verify it's parseable and has correct structure + from dysh.fits.index_file import parse_sdfits_index_file + + df = parse_sdfits_index_file(index_path) + assert len(df) == len(sdf._index) + # ROW should start from 0 + assert df["ROW"].min() == 0 + # FILE column should reference the output filename + assert (df["FILE"] == "test_single.fits").all() + + def test_write_index_multifile(self, tmp_path): + """Multi-file write with write_index=True should create per-file + parent indices.""" + f = util.get_project_testdata() / "AGBT18B_354_03/AGBT18B_354_03.raw.vegas/" + g = gbtfitsload.GBTFITSLoad(f) + o = tmp_path / "sub" + o.mkdir() + output = o / "testmulti.fits" + g.write(output, multifile=True, scan=6, write_index=True, overwrite=True) + + from dysh.fits.index_file import parse_sdfits_index_file + + # Should have 4 per-file index files + 1 parent index + per_file_indices = sorted(o.glob("testmulti.?.index")) + assert len(per_file_indices) == 4 + parent_index = o / "testmulti.index" + assert parent_index.exists() + + # Each per-file index should reference only its own FITS file + total_per_file_rows = 0 + for idx_file in per_file_indices: + df = parse_sdfits_index_file(idx_file) + assert len(df) > 0 + expected_fits = idx_file.stem + ".fits" + assert (df["FILE"] == expected_fits).all() + total_per_file_rows += len(df) + + # Parent index should contain all rows + parent_df = parse_sdfits_index_file(parent_index) + assert len(parent_df) == total_per_file_rows + # Parent index should reference multiple FITS files + assert parent_df["FILE"].nunique() == 4 + + def test_write_index_with_selection(self, tmp_path): + """write_index with subselection should only index the selected rows.""" + f = util.get_project_testdata() / "AGBT18B_354_03/AGBT18B_354_03.raw.vegas/" + g = gbtfitsload.GBTFITSLoad(f) + o = tmp_path / "sub" + o.mkdir() + output = o / "selected.fits" + g.write(output, multifile=True, scan=6, write_index=True, overwrite=True) + + from 
dysh.fits.index_file import parse_sdfits_index_file + + # Load all per-file indices and check SCAN values + for idx_file in sorted(o.glob("selected.?.index")): + df = parse_sdfits_index_file(idx_file) + assert set(df["SCAN"].unique()) == {6} + # ROW numbering should be sequential starting from 0 + for hdu_val in df["HDU"].unique(): + hdu_rows = df[df["HDU"] == hdu_val]["ROW"] + assert hdu_rows.min() == 0 + assert list(hdu_rows) == list(range(len(hdu_rows))) + + def test_write_index_non_multifile(self, tmp_path): + """Non-multifile write with write_index=True should create one index.""" + f = util.get_project_testdata() / "AGBT18B_354_03/AGBT18B_354_03.raw.vegas/" + g = gbtfitsload.GBTFITSLoad(f) + o = tmp_path / "sub" + o.mkdir() + output = o / "merged.fits" + g.write(output, multifile=False, scan=6, write_index=True, overwrite=True) + + from dysh.fits.index_file import parse_sdfits_index_file + + index_path = o / "merged.index" + assert index_path.exists() + df = parse_sdfits_index_file(index_path) + assert len(df) > 0 + assert (df["FILE"] == "merged.fits").all() + assert set(df["SCAN"].unique()) == {6} + + def test_write_index_roundtrip(self, tmp_path): + """Write FITS+index, reload, and verify data matches.""" + f = util.get_project_testdata() / "AGBT18B_354_03/AGBT18B_354_03.raw.vegas/" + g = gbtfitsload.GBTFITSLoad(f) + o = tmp_path / "sub" + o.mkdir() + output = o / "roundtrip.fits" + g.write(output, multifile=True, scan=6, write_index=True, overwrite=True) + + # Reload the written files + sdf = gbtfitsload.GBTFITSLoad(o) + assert set(sdf["SCAN"]) == {6} + + def test_write_parent_index_row_count(self, tmp_path): + """Parent index total rows should equal sum of per-file index rows.""" + f = util.get_project_testdata() / "AGBT18B_354_03/AGBT18B_354_03.raw.vegas/" + g = gbtfitsload.GBTFITSLoad(f) + o = tmp_path / "sub" + o.mkdir() + output = o / "testmulti.fits" + g.write(output, multifile=True, write_index=True, overwrite=True) + + from dysh.fits.index_file 
import parse_sdfits_index_file + + per_file_total = 0 + for idx_file in sorted(o.glob("testmulti.?.index")): + per_file_total += len(parse_sdfits_index_file(idx_file)) + + parent_df = parse_sdfits_index_file(o / "testmulti.index") + assert len(parent_df) == per_file_total + # INDEX should be sequential 0..N-1 + assert list(parent_df["INDEX"]) == list(range(len(parent_df))) + + class TestOnlineGBTFITSLoad: """Tests for OnlineGBTFITSLoad (GBTOnline) functionality.""" From e3ee4f8d209dabd5ea40657d951ed02ebdc5fc47 Mon Sep 17 00:00:00 2001 From: Thomas Chamberlin Date: Wed, 18 Mar 2026 18:11:58 +0000 Subject: [PATCH 5/9] test: add GBTIDL and sparrow3 regression tests for index file writing Add regression tests comparing dysh's index output against reference implementations: GBTIDL regression (TestGBTIDLRegression): - test_cal_vegas_matches_gbtidl: TGBT22A_503_02 calibration data - test_raw_vegas_a6_matches_gbtidl: AGBT20B_014_03 raw VEGAS data - test_multibank_vegas_matches_gbtidl: AGBT22A_325_15 banks A and B - test_gbtidl_getps_output_matches: column header format comparison Compares integer columns (exact), string columns (exact), float columns (rtol=1e-6), and DATEOBS date prefix. Known exclusions documented: FILE (output vs original name), DATEOBS (GBTIDL truncates), WCALPOS (CALPOSITION not always in dysh index). Sparrow3 extended (TestSparrow3Extended) - character-for-character: - test_vegas_raw_ints_matches_sparrow3 (4 rows) - test_acs_raw_ints_matches_sparrow3 (4 rows) - test_vegas_raw_only_matches_sparrow3 (224 rows) - test_vegas_raw_badif_matches_sparrow3 (384 rows) - test_banks_vegas_raw_badif_matches_sparrow3 (384 rows) Also fix: derive NUMCHN from TDIM7 in _prepare_for_writing() when NUMCHN column is missing (common when index is built from FITS data). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/dysh/fits/index_file.py | 4 + src/dysh/fits/tests/test_index_file.py | 317 +++++++++++++++++++++++++ 2 files changed, 321 insertions(+) diff --git a/src/dysh/fits/index_file.py b/src/dysh/fits/index_file.py index ae85bca4b..3561aa471 100644 --- a/src/dysh/fits/index_file.py +++ b/src/dysh/fits/index_file.py @@ -677,6 +677,10 @@ def _prepare_for_writing(df: pd.DataFrame) -> pd.DataFrame: # --- Compute derived columns before renaming --- + # NUMCHN: derive from TDIM7 if not present (TDIM7 is e.g. "(32768,1,1,1)") + if "NUMCHN" not in df.columns and "TDIM7" in df.columns: + df["NUMCHN"] = df["TDIM7"].apply(lambda t: int(str(t).strip("()").split(",")[0]) if pd.notna(t) else 0) + # CENTFREQ: compute from WCS parameters if all are available if "CRVAL1" in df.columns and "CRPIX1" in df.columns and "CDELT1" in df.columns and "NUMCHN" in df.columns: df["CENTFREQ"] = [ diff --git a/src/dysh/fits/tests/test_index_file.py b/src/dysh/fits/tests/test_index_file.py index 7d345a1ef..80841726f 100644 --- a/src/dysh/fits/tests/test_index_file.py +++ b/src/dysh/fits/tests/test_index_file.py @@ -5,6 +5,7 @@ from datetime import datetime from pathlib import Path +import numpy as np import pandas as pd import pytest @@ -1274,3 +1275,319 @@ def test_write_real_vegas_data(self, testdata_dir, tmp_path): df_parsed = parse_sdfits_index_file(index_path) assert len(df_parsed) == len(df) + + +_GBTIDL_DATA_DIR = get_project_testdata() + + +class TestGBTIDLRegression: + """Compare dysh-written index files against GBTIDL-generated reference indices. + + These tests load a real FITS file, build an index from the FITS data, + write it with dysh's write_index(), then compare the data rows against + the GBTIDL-generated .index file that ships with the testdata. + """ + + @staticmethod + def _compare_index_rows(dysh_path, gbtidl_path, atol_float=1e-3): + """Compare data rows between dysh-written and GBTIDL-generated index files. 
+ + Compares column-by-column with appropriate tolerance for floats. + Skips header comparison (timestamps/created_by will differ). + + Parameters + ---------- + dysh_path : Path + Path to dysh-written .index file + gbtidl_path : Path + Path to GBTIDL reference .index file + atol_float : float + Absolute tolerance for float comparisons + """ + _, dysh_df = read_index(dysh_path) + _, gbtidl_df = read_index(gbtidl_path) + + assert len(dysh_df) == len(gbtidl_df), f"Row count mismatch: dysh={len(dysh_df)}, gbtidl={len(gbtidl_df)}" + + # Integer columns: must match exactly + int_cols = [ + "INDEX", + "EXTENSION", + "ROW", + "E2ESCAN", + "PROCSEQN", + "SCAN", + "PLNUM", + "IFNUM", + "FEED", + "FDNUM", + "INT", + "NUMCHN", + "SUBREF", + "NSAVE", + "PROCSCAN", + ] + for col in int_cols: + if col in dysh_df.columns and col in gbtidl_df.columns: + np.testing.assert_array_equal( + dysh_df[col].to_numpy(), gbtidl_df[col].to_numpy(), err_msg=f"Integer column {col} mismatch" + ) + + # String columns: must match exactly (after stripping whitespace) + # FILE is excluded — dysh uses the output filename, GBTIDL uses the original + # DATEOBS is excluded — GBTIDL truncates to date-only in some cases + # WCALPOS is excluded — GBTIDL derives from CALPOSITION which dysh doesn't always have + str_cols = [ + "PROJECT", + "SOURCE", + "PROCEDURE", + "OBSID", + "POLARIZATION", + "SIG", + "CAL", + "SAMPLER", + "TIMESTAMP", + "PROCTYPE", + ] + for col in str_cols: + if col in dysh_df.columns and col in gbtidl_df.columns: + dysh_vals = dysh_df[col].astype(str).str.strip().to_numpy() + gbtidl_vals = gbtidl_df[col].astype(str).str.strip().to_numpy() + np.testing.assert_array_equal(dysh_vals, gbtidl_vals, err_msg=f"String column {col} mismatch") + + # DATEOBS: dysh writes full ISO timestamp, GBTIDL may truncate to date-only. + # Compare the date prefix (first 10 chars). 
+ if "DATEOBS" in dysh_df.columns and "DATEOBS" in gbtidl_df.columns: + dysh_dates = dysh_df["DATEOBS"].astype(str).str[:10].to_numpy() + gbtidl_dates = gbtidl_df["DATEOBS"].astype(str).str[:10].to_numpy() + np.testing.assert_array_equal(dysh_dates, gbtidl_dates, err_msg="DATEOBS date prefix mismatch") + + # Float columns: compare with tolerance + float_cols = [ + "AZIMUTH", + "ELEVATION", + "LONGITUDE", + "LATITUDE", + "TRGTLONG", + "TRGTLAT", + "LST", + "CENTFREQ", + "RESTFREQ", + "VELOCITY", + "FREQINT", + "FREQRES", + "BANDWIDTH", + "EXPOSURE", + ] + for col in float_cols: + if col in dysh_df.columns and col in gbtidl_df.columns: + dysh_vals = pd.to_numeric(dysh_df[col], errors="coerce").to_numpy() + gbtidl_vals = pd.to_numeric(gbtidl_df[col], errors="coerce").to_numpy() + # Use relative tolerance for large values, absolute for small + np.testing.assert_allclose( + dysh_vals, gbtidl_vals, rtol=1e-6, atol=atol_float, err_msg=f"Float column {col} mismatch" + ) + + def test_cal_vegas_matches_gbtidl(self, tmp_path): + """Compare dysh index against GBTIDL for TGBT22A_503_02.cal.vegas.""" + fits_path = _GBTIDL_DATA_DIR / "TGBT22A_503_02" / "TGBT22A_503_02.cal.vegas.fits" + gbtidl_index = _GBTIDL_DATA_DIR / "TGBT22A_503_02" / "TGBT22A_503_02.cal.vegas.index" + if not fits_path.exists() or not gbtidl_index.exists(): + pytest.skip("Test data not found") + + from dysh.fits import gbtfitsload + + sdf = gbtfitsload.GBTFITSLoad(fits_path, index_file_threshold=1000000000) + output = tmp_path / "TGBT22A_503_02.cal.vegas.fits" + sdf.write(output, write_index=True, overwrite=True, flags=False) + + dysh_index = tmp_path / "TGBT22A_503_02.cal.vegas.index" + assert dysh_index.exists() + self._compare_index_rows(dysh_index, gbtidl_index) + + def test_raw_vegas_a6_matches_gbtidl(self, tmp_path): + """Compare dysh index against GBTIDL for AGBT20B_014_03.raw.vegas.A6.""" + fits_path = _GBTIDL_DATA_DIR / "AGBT20B_014_03.raw.vegas" / "AGBT20B_014_03.raw.vegas.A6.fits" + gbtidl_index = 
_GBTIDL_DATA_DIR / "AGBT20B_014_03.raw.vegas" / "AGBT20B_014_03.raw.vegas.A6.index" + if not fits_path.exists() or not gbtidl_index.exists(): + pytest.skip("Test data not found") + + from dysh.fits import gbtfitsload + + sdf = gbtfitsload.GBTFITSLoad(fits_path, index_file_threshold=1000000000) + output = tmp_path / "AGBT20B_014_03.raw.vegas.A6.fits" + sdf.write(output, write_index=True, overwrite=True, flags=False) + + dysh_index = tmp_path / "AGBT20B_014_03.raw.vegas.A6.index" + assert dysh_index.exists() + self._compare_index_rows(dysh_index, gbtidl_index) + + def test_multibank_vegas_matches_gbtidl(self, tmp_path): + """Compare dysh per-bank indices against GBTIDL for AGBT22A_325_15 (banks A and B).""" + bank_a_fits = _GBTIDL_DATA_DIR / "AGBT22A_325_15" / "AGBT22A_325_15.raw.vegas.A.fits" + bank_a_gbtidl = _GBTIDL_DATA_DIR / "AGBT22A_325_15" / "AGBT22A_325_15.raw.vegas.A.index" + bank_b_fits = _GBTIDL_DATA_DIR / "AGBT22A_325_15" / "AGBT22A_325_15.raw.vegas.B.fits" + bank_b_gbtidl = _GBTIDL_DATA_DIR / "AGBT22A_325_15" / "AGBT22A_325_15.raw.vegas.B.index" + if not all(p.exists() for p in [bank_a_fits, bank_a_gbtidl, bank_b_fits, bank_b_gbtidl]): + pytest.skip("Test data not found") + + from dysh.fits import gbtfitsload + + # Test Bank A + sdf_a = gbtfitsload.GBTFITSLoad(bank_a_fits, index_file_threshold=1000000000) + out_a = tmp_path / "bank_a.fits" + sdf_a.write(out_a, write_index=True, overwrite=True, flags=False) + self._compare_index_rows(tmp_path / "bank_a.index", bank_a_gbtidl) + + # Test Bank B + sdf_b = gbtfitsload.GBTFITSLoad(bank_b_fits, index_file_threshold=1000000000) + out_b = tmp_path / "bank_b.fits" + sdf_b.write(out_b, write_index=True, overwrite=True, flags=False) + self._compare_index_rows(tmp_path / "bank_b.index", bank_b_gbtidl) + + def test_gbtidl_getps_output_matches(self, tmp_path): + """Compare column header format against a GBTIDL-generated output index.""" + gbtidl_index = _GBTIDL_DATA_DIR / "AGBT05B_047_01" / "gbtidl" / 
"AGBT05B_047_01.getps.acs.index" + if not gbtidl_index.exists(): + pytest.skip("GBTIDL reference index not found") + + # Verify the column header row matches exactly + gbtidl_rows = _get_rows_section(gbtidl_index) + # The column header is the first line of the rows section + expected_header = gbtidl_rows[0] + dysh_header = _generate_rows_header() + assert dysh_header == expected_header, "Column header format does not match GBTIDL output" + + +class TestSparrow3Extended: + """Extended sparrow3 regression tests using additional .index.expected files. + + These tests port additional sparrow3 test cases beyond the basic ones + already in TestSparrow3Ported. + """ + + def test_vegas_raw_ints_matches_sparrow3(self, tmp_path): + """Compare against sparrow3's test.vegas.raw.ints.index.expected. + + This is a 4-row VEGAS test with real-world-like values including + varying polarizations (XX/YY), CAL states (T/F), and exposures. + """ + expected_path = _SPARROW3_DATA_DIR / "test.vegas.raw.ints.index.expected" + if not expected_path.exists(): + pytest.skip(f"sparrow3 expected file not found: {expected_path}") + + expected = _get_rows_section(expected_path) + # Extract column values from the expected file to reconstruct input data + _, expected_df = read_index(expected_path) + + # Write the expected data back through dysh's writer + metadata = IndexMetadata( + created="Fri Mar 22 16:59:58 2013", + last_modified="Fri Mar 22 16:59:58 2013", + ) + index_path = tmp_path / "test.index" + write_index(index_path, metadata, expected_df) + + actual = _get_rows_section(index_path) + + # Column header should match + assert actual[0] == expected[0], "Column header does not match sparrow3" + # Data rows should match + assert len(actual) == len(expected), f"Row count: {len(actual) - 1} vs {len(expected) - 1}" + for i, (act, exp) in enumerate(zip(actual[1:], expected[1:], strict=True)): + assert act == exp, f"Data row {i} does not match sparrow3 (vegas.raw.ints)" + + def 
test_acs_raw_ints_matches_sparrow3(self, tmp_path): + """Compare against sparrow3's test.acs.raw.ints.index.expected.""" + expected_path = _SPARROW3_DATA_DIR / "test.acs.raw.ints.index.expected" + if not expected_path.exists(): + pytest.skip(f"sparrow3 expected file not found: {expected_path}") + + expected = _get_rows_section(expected_path) + _, expected_df = read_index(expected_path) + + metadata = IndexMetadata( + created="Fri Mar 22 16:59:58 2013", + last_modified="Fri Mar 22 16:59:58 2013", + ) + index_path = tmp_path / "test.index" + write_index(index_path, metadata, expected_df) + + actual = _get_rows_section(index_path) + assert actual[0] == expected[0], "Column header does not match sparrow3" + assert len(actual) == len(expected) + for i, (act, exp) in enumerate(zip(actual[1:], expected[1:], strict=True)): + assert act == exp, f"Data row {i} does not match sparrow3 (acs.raw.ints)" + + def test_vegas_raw_only_matches_sparrow3(self, tmp_path): + """Compare against sparrow3's test.vegas.raw.only.index.expected (224 data rows).""" + expected_path = _SPARROW3_DATA_DIR / "test.vegas.raw.only.index.expected" + if not expected_path.exists(): + pytest.skip(f"sparrow3 expected file not found: {expected_path}") + + expected = _get_rows_section(expected_path) + _, expected_df = read_index(expected_path) + + metadata = IndexMetadata( + created="Fri Mar 22 16:59:58 2013", + last_modified="Fri Mar 22 16:59:58 2013", + ) + index_path = tmp_path / "test.index" + write_index(index_path, metadata, expected_df) + + actual = _get_rows_section(index_path) + assert actual[0] == expected[0], "Column header does not match sparrow3" + assert len(actual) == len(expected), f"Row count: {len(actual) - 1} vs {len(expected) - 1}" + for i, (act, exp) in enumerate(zip(actual[1:], expected[1:], strict=True)): + assert act == exp, f"Data row {i} does not match sparrow3 (vegas.raw.only)" + + def test_vegas_raw_badif_matches_sparrow3(self, tmp_path): + """Compare against sparrow3's 
test.vegas.raw.badif.index.expected (384 data rows). + + This is the largest standard sparrow3 test case, exercising many + different scan/integration/polarization combinations. + """ + expected_path = _SPARROW3_DATA_DIR / "test.vegas.raw.badif.index.expected" + if not expected_path.exists(): + pytest.skip(f"sparrow3 expected file not found: {expected_path}") + + expected = _get_rows_section(expected_path) + _, expected_df = read_index(expected_path) + + metadata = IndexMetadata( + created="Fri Mar 22 16:59:58 2013", + last_modified="Fri Mar 22 16:59:58 2013", + ) + index_path = tmp_path / "test.index" + write_index(index_path, metadata, expected_df) + + actual = _get_rows_section(index_path) + assert actual[0] == expected[0], "Column header does not match sparrow3" + assert len(actual) == len(expected), f"Row count: {len(actual) - 1} vs {len(expected) - 1}" + for i, (act, exp) in enumerate(zip(actual[1:], expected[1:], strict=True)): + assert act == exp, f"Data row {i} does not match sparrow3 (vegas.raw.badif)" + + def test_banks_vegas_raw_badif_matches_sparrow3(self, tmp_path): + """Compare against sparrow3's test.banks.vegas.raw.badif.index.expected (384 rows). + + Multi-bank VEGAS test with 8 banks and many scan combinations. 
+ """ + expected_path = _SPARROW3_DATA_DIR / "test.banks.vegas.raw.badif.index.expected" + if not expected_path.exists(): + pytest.skip(f"sparrow3 expected file not found: {expected_path}") + + expected = _get_rows_section(expected_path) + _, expected_df = read_index(expected_path) + + metadata = IndexMetadata( + created="Fri Mar 22 16:59:58 2013", + last_modified="Fri Mar 22 16:59:58 2013", + ) + index_path = tmp_path / "test.index" + write_index(index_path, metadata, expected_df) + + actual = _get_rows_section(index_path) + assert actual[0] == expected[0], "Column header does not match sparrow3" + assert len(actual) == len(expected), f"Row count: {len(actual) - 1} vs {len(expected) - 1}" + for i, (act, exp) in enumerate(zip(actual[1:], expected[1:], strict=True)): + assert act == exp, f"Data row {i} does not match sparrow3 (banks.vegas.raw.badif)" From 97a75b1995a0b375078a2cd6f74e7cf3477cdaa6 Mon Sep 17 00:00:00 2001 From: Thomas Chamberlin Date: Wed, 18 Mar 2026 18:17:29 +0000 Subject: [PATCH 6/9] =?UTF-8?q?fix:=20add=20CALPOSITION=20=E2=86=92=20WCAL?= =?UTF-8?q?POS=20mapping=20for=20index=20file=20writing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SDFITS index format uses WCALPOS (originally W-band calibration position) but sparrow/sparrow3/GBTIDL all populate it from the generic FITS column CALPOSITION for all backends. Add the mapping in both SDFITS_INDEX_TO_DYSH_MAP and the _prepare_for_writing rename_map so dysh correctly writes this column. Also re-enable WCALPOS comparison in the GBTIDL regression tests. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/dysh/fits/index_file.py | 8 ++++++++ src/dysh/fits/tests/test_index_file.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/src/dysh/fits/index_file.py b/src/dysh/fits/index_file.py index 3561aa471..bb52b6030 100644 --- a/src/dysh/fits/index_file.py +++ b/src/dysh/fits/index_file.py @@ -98,6 +98,13 @@ class IndexMetadata: "LONGITUDE": "CRVAL2", # .index LONGITUDE = FITS CRVAL2 (lon-like coordinate) "LATITUDE": "CRVAL3", # .index LATITUDE = FITS CRVAL3 (lat-like coordinate) "CENTFREQ": "CRVAL1", # .index CENTFREQ = FITS CRVAL1 (center frequency) + # WCALPOS was originally added to the index format for the W-band receiver's + # calibration vane positions ("Vane", "Observing", "Cold1", "Cold2"), hence + # the "W" prefix. However, sparrow/sparrow3/GBTIDL all populate it from the + # generic FITS column CALPOSITION for ALL backends (defaulting to "Unknown"). + # So despite the W-band-specific name, it's a general-purpose column. + # See sparrow3 IndexWriter.py:345 and sparrow IndexWriter.py:330. 
+ "WCALPOS": "CALPOSITION", } DYSH_TO_SDFITS_INDEX_MAP = {v: k for k, v in SDFITS_INDEX_TO_DYSH_MAP.items()} @@ -725,6 +732,7 @@ def _prepare_for_writing(df: pd.DataFrame) -> pd.DataFrame: "ELEVATIO": "ELEVATION", "CRVAL2": "LONGITUDE", "CRVAL3": "LATITUDE", + "CALPOSITION": "WCALPOS", # Abbreviated → canonical (for DataFrames from old dysh-written index files) "EXT": "EXTENSION", "POL": "POLARIZATION", diff --git a/src/dysh/fits/tests/test_index_file.py b/src/dysh/fits/tests/test_index_file.py index 80841726f..490876913 100644 --- a/src/dysh/fits/tests/test_index_file.py +++ b/src/dysh/fits/tests/test_index_file.py @@ -1336,7 +1336,6 @@ def _compare_index_rows(dysh_path, gbtidl_path, atol_float=1e-3): # String columns: must match exactly (after stripping whitespace) # FILE is excluded — dysh uses the output filename, GBTIDL uses the original # DATEOBS is excluded — GBTIDL truncates to date-only in some cases - # WCALPOS is excluded — GBTIDL derives from CALPOSITION which dysh doesn't always have str_cols = [ "PROJECT", "SOURCE", @@ -1348,6 +1347,7 @@ def _compare_index_rows(dysh_path, gbtidl_path, atol_float=1e-3): "SAMPLER", "TIMESTAMP", "PROCTYPE", + "WCALPOS", ] for col in str_cols: if col in dysh_df.columns and col in gbtidl_df.columns: From 6f798d16b67392d6c7dc40919574671e947305ff Mon Sep 17 00:00:00 2001 From: Thomas Chamberlin Date: Wed, 18 Mar 2026 18:26:08 +0000 Subject: [PATCH 7/9] refactor: derive NUMCHN in create_index() instead of at write time Move NUMCHN derivation from TDIM7 out of _prepare_for_writing() and into SDFITSLoad.create_index() so the in-memory index always has NUMCHN available, not just during index file writing. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/dysh/fits/index_file.py | 4 ---- src/dysh/fits/sdfitsload.py | 13 +++++++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/dysh/fits/index_file.py b/src/dysh/fits/index_file.py index bb52b6030..0532b5fb9 100644 --- a/src/dysh/fits/index_file.py +++ b/src/dysh/fits/index_file.py @@ -684,10 +684,6 @@ def _prepare_for_writing(df: pd.DataFrame) -> pd.DataFrame: # --- Compute derived columns before renaming --- - # NUMCHN: derive from TDIM7 if not present (TDIM7 is e.g. "(32768,1,1,1)") - if "NUMCHN" not in df.columns and "TDIM7" in df.columns: - df["NUMCHN"] = df["TDIM7"].apply(lambda t: int(str(t).strip("()").split(",")[0]) if pd.notna(t) else 0) - # CENTFREQ: compute from WCS parameters if all are available if "CRVAL1" in df.columns and "CRPIX1" in df.columns and "CDELT1" in df.columns and "NUMCHN" in df.columns: df["CENTFREQ"] = [ diff --git a/src/dysh/fits/sdfitsload.py b/src/dysh/fits/sdfitsload.py index 1ba762c40..4f94c2aab 100644 --- a/src/dysh/fits/sdfitsload.py +++ b/src/dysh/fits/sdfitsload.py @@ -358,8 +358,21 @@ def create_index(self, hdu: int | list[int] | None = None, skipindex=("DATA", "F else: self._index = pd.concat([self._index, df], axis=0, ignore_index=True) self._add_primary_hdu() + self._derive_numchn() self._index_source = "fits" + def _derive_numchn(self): + """Derive NUMCHN (number of spectral channels) from TDIM7 if not already present. + + TDIM7 describes the shape of the DATA column (e.g., "(32768,1,1,1)"). + The first dimension is the number of channels. This avoids having to + read the actual DATA array just to get its length. 
+ """ + if self._index is not None and "NUMCHN" not in self._index.columns and "TDIM7" in self._index.columns: + self._index["NUMCHN"] = self._index["TDIM7"].apply( + lambda t: int(str(t).strip("()").split(",")[0]) if pd.notna(t) else 0 + ) + def _add_primary_hdu(self): """ Add the columns to the index for header keywords that are not in primary header or not in the DATA column. From f277bb7fc9078b8afad9fed66f656e18d8e030cd Mon Sep 17 00:00:00 2001 From: astrofle Date: Mon, 15 Jun 2026 14:40:35 -0400 Subject: [PATCH 8/9] Fix swapped entries for -7 and -8 --- src/dysh/coordinates/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/dysh/coordinates/core.py b/src/dysh/coordinates/core.py index 8a82cdfdf..9ce9a4c48 100644 --- a/src/dysh/coordinates/core.py +++ b/src/dysh/coordinates/core.py @@ -161,8 +161,8 @@ -4: "LR", -5: "XX", -6: "YY", - -7: "YX", - -8: "XY", + -7: "XY", + -8: "YX", 0: "UNKNOWN", 1: "I", 2: "Q", From 553f12f21f8bcc34f3b8825ff6d688d075638f14 Mon Sep 17 00:00:00 2001 From: astrofle Date: Mon, 15 Jun 2026 14:42:57 -0400 Subject: [PATCH 9/9] Use coordinates.core.crval4_to_pol --- src/dysh/fits/index_file.py | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/dysh/fits/index_file.py b/src/dysh/fits/index_file.py index 0532b5fb9..1a180a005 100644 --- a/src/dysh/fits/index_file.py +++ b/src/dysh/fits/index_file.py @@ -19,6 +19,7 @@ import pandas as pd +from dysh.coordinates.core import crval4_to_pol from dysh.log import logger # Optional fast Rust-based parser @@ -169,20 +170,7 @@ class IndexMetadata: _WRITER_COLUMNS = [name if name != "#INDEX#" else "INDEX" for name, _, _, _ in _WRITER_SPEC] # Stokes/polarization code mapping (FITS CRVAL4 integer → string) -_POLARIZATION_MAP = { - 1: "I", - 2: "Q", - 3: "U", - 4: "V", - -1: "RR", - -2: "LL", - -3: "RL", - -4: "LR", - -5: "XX", - -6: "YY", - -7: "XY", - -8: "YX", -} +_POLARIZATION_MAP = crval4_to_pol def get_index_path(fits_path: str | Path) 
-> Path: