weecology · bw4sz · May 14, 2026
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -13,43 +13,28 @@ jobs:
       fail-fast: false
       matrix:
         python-version:
-          # - 3.6
-          - 3.9
-          # - 3.8
+          - "3.11"
         os:
           - "ubuntu-latest"
 
     runs-on: "${{ matrix.os }}"
 
-    # use bash everywhere
-    defaults:
-      run:
-        shell: "bash -l {0}"
-
     steps:
       - name: "Checkout code"
-        uses: "actions/checkout@v2"
+        uses: "actions/checkout@v4"
 
-      - name: "Cache conda"
-        uses: "actions/cache@v1"
-        env:
-          # Increase this value to reset cache if env.yml has not changed
-          CACHE_NUMBER: 0
+      - name: "Setup Python"
+        uses: "actions/setup-python@v5"
         with:
-          path: "~/conda_pkgs_dir"
-          key: "${{ matrix.os }}-conda-${{ matrix.python-version }}-${{ env.CACHE_NUMBER }}-${{ hashFiles('enviroment.yml') }}"
+          python-version: "${{ matrix.python-version }}"
 
-      - name: "Setup conda"
-        uses: "conda-incubator/setup-miniconda@v2"
+      - name: "Setup uv"
+        uses: "astral-sh/setup-uv@v6"
         with:
-          activate-environment: "DeepForest"
-          environment-file: "environment.yml"
-          python-version: "${{ matrix.python-version }}"
-          channels: conda-forge,spyder-ide
-          allow-softlinks: true
-          channel-priority: flexible
-          show-channel-urls: true
-          use-only-tar-bz2: true
+          enable-cache: true
+
+      - name: "Install dependencies"
+        run: "uv sync --extra dev"  # sync the environment with the "dev" extra included
 
       - name: "Run tests"
-        run: "pytest -v"
+        run: "uv run pytest -v"  # pytest is now launched through uv
diff --git a/.gitignore b/.gitignore
@@ -1,11 +1,23 @@
+config.local.yml
+.smoke_train_overrides.yml
+results/**
+!results/.gitkeep
+
 .DS_Store
 project.wpr
 project.wpu
 *.h5
 __pycache__
-data/raw/
-data/processed/
+# Large or machine-local artifacts (the "!" rules below keep data/raw/README.md and the listed .gitkeep placeholders tracked)
+data/processed/**
+data/external/**
+!data/external/.gitkeep
+!data/interim/.gitkeep
+data/raw/**/*.csv
+data/raw/**/*.zip
+!data/raw/README.md
 *.tif
 *.png
+!docs/figures/*.png
 *.wpr
 *.wpu
diff --git a/README.md b/README.md
diff --git a/SLURM/crown_plot_array.sh b/SLURM/crown_plot_array.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#SBATCH --job-name=dt_crown_plot
+#SBATCH --account=ewhite
+#SBATCH --cpus-per-task=2
+#SBATCH --mem=16G
+#SBATCH --time=04:00:00
+#SBATCH --output=logs/crown_plot_%A_%a.out
+#SBATCH --error=logs/crown_plot_%A_%a.err
+##SBATCH --array=1-50%10
+
+# One array task per line of plots.txt (one plotID per line). Example for a comma-separated export:
+#   awk -F, 'NR>1{print $1}' plots_export.csv | sort -u > plots.txt   # if first column is plotID
+# Then submit with: sbatch --array=1-$(wc -l < plots.txt)%10 SLURM/crown_plot_array.sh  (#SBATCH lines are not shell-expanded)
+
+set -euo pipefail
+REPO_ROOT="${REPO_ROOT:-$HOME/DeepTreeAttention}"
+export CONFIG_PATH="${CONFIG_PATH:-$REPO_ROOT/config.yml}"
+CANOPY_POINTS="${CANOPY_POINTS:-$REPO_ROOT/data/interim/canopy_points.shp}"
+PLOTS_FILE="${PLOTS_FILE:-$REPO_ROOT/plots.txt}"
+
+cd "$REPO_ROOT"
+mkdir -p logs  # NOTE(review): too late for this job's own --output/--error files; logs/ must already exist where sbatch is run
+
+LINE_NO="${SLURM_ARRAY_TASK_ID:?Set SLURM_ARRAY_TASK_ID or submit with sbatch --array}"
+PLOT="$(sed -n "${LINE_NO}p" "$PLOTS_FILE")"
+if [[ -z "${PLOT}" ]]; then
+  echo "No plot on line ${LINE_NO} of ${PLOTS_FILE}" >&2
+  exit 1
+fi
+
+RGB_GLOB="$(uv run python -c "import os; from src import utils; print(utils.read_config(os.environ['CONFIG_PATH'])['rgb_sensor_pool'])")"
+
+uv run python -m src.pipelines.crown_one_plot \
+  --canopy-points "$CANOPY_POINTS" \
+  --plot "$PLOT" \
+  --rgb-glob "$RGB_GLOB" \
+  --savedir "${CROWN_BOX_DIR:-$REPO_ROOT/data/interim/boxes}" \
+  --raw-box-savedir "${RAW_BOX_DIR:-$REPO_ROOT/data/interim/raw_boxes}"
diff --git a/SLURM/experiment.sh b/SLURM/experiment.sh
diff --git a/SLURM/osbs_inference.sh b/SLURM/osbs_inference.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#SBATCH --job-name=osbs_infer
+#SBATCH --mail-type=END
+#SBATCH --mail-user=benweinstein2010@gmail.com
+#SBATCH --account=ewhite
+#SBATCH --nodes=1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64GB
+#SBATCH --time=48:00:00
+#SBATCH --output=/home/b.weinstein/logs/osbs_inference_%j.out
+#SBATCH --error=/home/b.weinstein/logs/osbs_inference_%j.err
+#SBATCH --partition=gpu
+#SBATCH --gpus=1
+
+# OSBS tile inference (detection + species). Configure inference_osbs in config.yml, then submit.
+
+set -euo pipefail
+
+ulimit -c 0  # set the core-file size limit to zero
+
+REPO_ROOT="${REPO_ROOT:-${HOME}/DeepTreeAttention}"
+CONFIG_PATH="${CONFIG_PATH:-${REPO_ROOT}/config.yml}"
+
+module load git gcc 2>/dev/null || true
+set +u; source activate DeepTreeAttention; set -u  # activation hooks may read unset variables, which would abort under "set -u"
+
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}:${PYTHONPATH:-}"
+
+python -m src.pipelines.osbs_inference --config "${CONFIG_PATH}"
diff --git a/SLURM/osbs_mortality.sh b/SLURM/osbs_mortality.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#SBATCH --job-name=osbs_mortality
+#SBATCH --mail-type=END
+#SBATCH --mail-user=benweinstein2010@gmail.com
+#SBATCH --account=ewhite
+#SBATCH --nodes=1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=96GB
+#SBATCH --time=72:00:00
+#SBATCH --output=/home/b.weinstein/logs/osbs_mortality_%j.out
+#SBATCH --error=/home/b.weinstein/logs/osbs_mortality_%j.err
+#SBATCH --partition=gpu
+#SBATCH --gpus=1
+
+# OSBS tile-scale mortality comparison. Configure osbs_mortality in config.yml, then submit.
+
+set -euo pipefail
+
+ulimit -c 0  # set the core-file size limit to zero
+
+REPO_ROOT="${REPO_ROOT:-${HOME}/DeepTreeAttention}"
+CONFIG_PATH="${CONFIG_PATH:-${REPO_ROOT}/config.yml}"
+
+module load git gcc 2>/dev/null || true
+set +u; source activate DeepTreeAttention; set -u  # activation hooks may read unset variables, which would abort under "set -u"
+
+cd "${REPO_ROOT}"
+export PYTHONPATH="${REPO_ROOT}:${PYTHONPATH:-}"
+
+python -m src.pipelines.osbs_mortality --config "${CONFIG_PATH}"
diff --git a/SLURM/train_experiment.sh b/SLURM/train_experiment.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Submit a queued GPU training run with a stable experiment identity for Comet.
+#
+# Usage (from login node, repo checked out on shared FS):
+#   cd /path/to/DeepTreeAttention
+#   export EXPERIMENT_NAME=osbs-epoch70-bs128-$(date +%Y%m%d)
+#   # optional: export DEEPTREE_OVERRIDES=...  DEEPTREE_CONFIG=...
+#   sbatch SLURM/train_experiment.sh
+#
+# REPO_ROOT defaults to SLURM_SUBMIT_DIR (your cwd when you ran sbatch). Override
+# if you submit from elsewhere: REPO_ROOT=/path/to/DeepTreeAttention sbatch ...
+#
+# Comet: set COMET_API_KEY (and optionally COMET_WORKSPACE) in the environment
+# or load them from a secrets file before sbatch. DEEPTREE_EXPERIMENT_NAME is
+# forwarded so the Comet UI name matches your SLURM intent.
+
+#SBATCH --job-name=dt-train
+#SBATCH --account=ewhite
+#SBATCH --nodes=1
+#SBATCH --cpus-per-task=8
+#SBATCH --mem=64G
+#SBATCH --time=12:00:00
+##SBATCH --partition=gpu  (disabled: clashed with --partition=hpg-turin below, which sbatch applied last - TODO confirm intended partition)
+#SBATCH --gpus=1
+#SBATCH --output=/home/b.weinstein/logs/train_%x_%j.out
+#SBATCH --error=/home/b.weinstein/logs/train_%x_%j.err
+#SBATCH --partition=hpg-turin
+#SBATCH --ntasks-per-node=1
+##SBATCH --gpus=1  (disabled: exact duplicate of the --gpus=1 directive above)
+
+set -euo pipefail
+
+REPO_ROOT="${REPO_ROOT:-${SLURM_SUBMIT_DIR:-}}"
+if [[ -z "${REPO_ROOT}" ]]; then
+  echo "[train_experiment] Set REPO_ROOT or run sbatch from the repo: cd .../DeepTreeAttention && sbatch SLURM/train_experiment.sh" >&2
+  exit 1
+fi
+if [[ ! -f "${REPO_ROOT}/train.py" ]]; then
+  echo "[train_experiment] REPO_ROOT=${REPO_ROOT} is not the repo root (no train.py). cd into DeepTreeAttention or set REPO_ROOT." >&2
+  exit 1
+fi
+cd "${REPO_ROOT}"
+
+mkdir -p logs
+
+# Stable name for Comet + log files (override when submitting); falls back to "train-local" when SLURM_JOB_ID is unset
+export DEEPTREE_EXPERIMENT_NAME="${EXPERIMENT_NAME:-train-${SLURM_JOB_ID:-local}}"
+
+# Optional merged config fragment (same as train.py --overrides)
+OVERRIDES_ARGS=()
+if [[ -n "${DEEPTREE_OVERRIDES:-}" ]]; then
+  OVERRIDES_ARGS=(--overrides "${DEEPTREE_OVERRIDES}")
+fi
+
+CONFIG_PATH="${DEEPTREE_CONFIG:-config.yml}"
+
+# Use uv if available (recommended); fall back to python on the module path. The ${arr[@]+"${arr[@]}"} form keeps an empty OVERRIDES_ARGS from tripping "set -u" on bash < 4.4.
+if command -v uv >/dev/null 2>&1; then
+  RUN=(uv run python train.py --config "${CONFIG_PATH}" ${OVERRIDES_ARGS[@]+"${OVERRIDES_ARGS[@]}"} --experiment-name "${DEEPTREE_EXPERIMENT_NAME}")
+else
+  RUN=(python train.py --config "${CONFIG_PATH}" ${OVERRIDES_ARGS[@]+"${OVERRIDES_ARGS[@]}"} --experiment-name "${DEEPTREE_EXPERIMENT_NAME}")
+fi
+
+echo "[train_experiment] SLURM_JOB_ID=${SLURM_JOB_ID:-}"
+echo "[train_experiment] DEEPTREE_EXPERIMENT_NAME=${DEEPTREE_EXPERIMENT_NAME}"
+echo "[train_experiment] REPO_ROOT=${REPO_ROOT}"
+echo "[train_experiment] running: ${RUN[*]}"
+
+exec "${RUN[@]}"
diff --git a/abundance.py b/abundance.py
@@ -1,11 +1,11 @@
 #Plot abundance distribution
+from concurrent.futures import ThreadPoolExecutor, as_completed
 from glob import glob
 import os
 import pandas as pd
 import geopandas as gpd
-from src import start_cluster
 
-client = start_cluster.start(cpus=75,mem_size="10GB")
+_IO_WORKERS = min(32, (os.cpu_count() or 4) * 4)
 
 ##Same data
 
@@ -43,9 +43,9 @@ def read_shp(path):
     print(files)
     if len(files) == 0:
         continue
-    counts = []
-    futures = client.map(read_shp,files)
-    counts = [x.result() for x in futures]
+    with ThreadPoolExecutor(max_workers=_IO_WORKERS) as ex:
+        futures = [ex.submit(read_shp, f) for f in files]
+        counts = [f.result() for f in as_completed(futures)]
     total_counts = pd.Series()
     for ser in counts:
         total_counts = total_counts.add(ser, fill_value=0)
@@ -89,9 +89,9 @@ def read_shp(path):
     print(files)
     if len(files) == 0:
         continue
-    counts = []
-    futures = client.map(read_shp,files)
-    counts = [x.result() for x in futures]
+    with ThreadPoolExecutor(max_workers=_IO_WORKERS) as ex:
+        futures = [ex.submit(read_shp, f) for f in files]
+        counts = [f.result() for f in as_completed(futures)]
     total_counts = pd.Series()
     for ser in counts:
         total_counts = total_counts.add(ser, fill_value=0)

diff --git a/config.smoke.example.yml b/config.smoke.example.yml
@@ -0,0 +1,29 @@
+# Copy to ``config.local.yml`` or pass ``--overrides config.smoke.example.yml`` with ``train.py``.
+# If your ``use_data_commit`` folder has split CSVs like ``train_<hash>_['OSBS'].csv`` instead of
+# ``train.csv`` / ``test.csv``, set ``processed_train_csv`` and ``processed_test_csv`` (relative to that folder).
+
+epochs: 1
+workers: 0
+preload_images: false
+batch_size: 32
+
+# Lightning: keep runs tiny (metrics on partial data are not meaningful)
+limit_train_batches: 2
+limit_val_batches: 2
+limit_predict_batches: 4
+
+use_comet: false
+checkpoint_dir: results/checkpoints
+
+# CPU smoke (set accelerator: auto, devices: 1 for a short GPU check)
+accelerator: cpu
+devices: 1
+
+# Faster crown detection during OSBS inference smoke (optional)
+deepforest_dead_cropmodel_name: null
+
+inference_osbs:
+  tile_limit: 1
+  predict_limit_batches: 2
+  # After a smoke train, point this at results/checkpoints/<id>.pt
+  # species_checkpoint: results/checkpoints/smoke.pt
diff --git a/config.smoke.local.yml b/config.smoke.local.yml
@@ -0,0 +1,18 @@
+data_dir: data/processed
+use_data_commit: "4c02ae98bd774aa494fadb3508ae84ba"  # quoted so a hex id can never be re-typed as a number
+processed_train_csv: "train_dd0adf605011f67ea3e3626231a9713a04a9e85e_['OSBS'].csv"
+processed_test_csv: "test_dd0adf605011f67ea3e3626231a9713a04a9e85e_['OSBS'].csv"
+
+epochs: 1
+workers: 0
+preload_images: false
+batch_size: 16
+
+limit_train_batches: 2
+limit_val_batches: 2
+limit_predict_batches: 2
+
+use_comet: false
+checkpoint_dir: results/checkpoints
+accelerator: cpu
+devices: 1