Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added .DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 12 additions & 0 deletions .idea/FLIPv3.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 16 additions & 0 deletions .idea/deployment.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions .idea/inspectionProfiles/Project_Default.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

88 changes: 88 additions & 0 deletions amlt/kky.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
description: Results on FLIPv3.

target:
  service: sing
  name: msrresrchvc
  workspace_name: bio0

environment:
  image: amlt-sing/acpt-torch2.7.1-py3.10-cuda12.6-ubuntu22.04
  setup:
    - cd sequence_models
    - pip install -U --user -e .
    - cd ..
    - pip install biopython
    - pip install httpx
    - pip install esm
    # - pip install scikit-learn
    # - pip install scipy

storage:
  amlt:
    storage_account_name: kevyaneastus2
    container_name: amlt

code:
  local_dir: src/

jobs:
  - name: "get_predictions-2"
    identity: managed
    sku: 1xG1-A100
    command:
      - python FLIPv3/baselines/get_predictions.py /mnt/amlt/data/flip_data_pruned/ /mnt/amlt/flip_predictions/
    mpi: true
    # -1 means one process per GPU on the node.
    process_count_per_node: -1
    submit_args:
      env:
        WANDB_BASE_URL: "https://microsoft-research.wandb.io"
        WANDB_API_KEY: "$WANDB_API_KEY"
        WANDB_ENTITY: "bio"
        NCCL_DEBUG: "INFO"
        NCCL_SOCKET_IFNAME: "bond0"
        # Env var values are strings; quote so the parser does not emit an int.
        NCCL_CUMEM_ENABLE: "0"
        NCCL_DEBUG_SUBSYS: "ALL"
        TORCH_DISTRIBUTED_DEBUG: "INFO"
        # NOTE(review): this key has no leading underscore here but is
        # `_AZUREML_SINGULARITY_JOB_UAI` in the search template below — confirm
        # which spelling the platform expects and make them match.
        AZUREML_SINGULARITY_JOB_UAI: "/subscriptions/7be94754-107d-43b8-b840-202dff0e7cae/resourceGroups/bio0/providers/Microsoft.ManagedIdentity/userAssignedIdentities/bio0-uai"
    tags: ["Project_Name:Biomedical_ML", "ProjectID:PRJ-0045-A32", "Experiment:Bio-0"]

search:
  job_template:
    name: "{model_name}_{task}_{weights}_{seed}_{lr}"
    sku: 1xG1-A100
    command:
      - python FLIPv3/baselines/{model_name}.py /mnt/amlt/data/flip_data_5-9-2025/datasets/ /mnt/amlt/flip_results/ {task} {weights} --seed {seed} --lr {lr}
    mpi: true
    process_count_per_node: -1
    submit_args:
      env:
        WANDB_BASE_URL: "https://microsoft-research.wandb.io"
        WANDB_API_KEY: "$WANDB_API_KEY"
        WANDB_ENTITY: "bio"
        NCCL_DEBUG: "INFO"
        NCCL_SOCKET_IFNAME: "bond0"
        NCCL_CUMEM_ENABLE: "0"
        NCCL_DEBUG_SUBSYS: "ALL"
        TORCH_DISTRIBUTED_DEBUG: "INFO"
        _AZUREML_SINGULARITY_JOB_UAI: "/subscriptions/7be94754-107d-43b8-b840-202dff0e7cae/resourceGroups/bio0/providers/Microsoft.ManagedIdentity/userAssignedIdentities/bio0-uai"
    tags: ["Project_Name:Biomedical_ML", "ProjectID:PRJ-0045-A32", "Experiment:Bio-0"]
  type: grid
  max_trials: 10000
  params:
    - name: lr
      spec: discrete
      values: [1e-5]
    - name: model_name
      spec: discrete
      values: [carp]
    - name: seed
      spec: discrete
      values: [1]
    - name: weights
      spec: discrete
      values: [naive]
    - name: task
      spec: discrete
      values: [hydro_med_P06241test_split]
      # Other available tasks:
      # RhoMax_by_wt, AMY_BACSU_easy_split, AMY_BACSU_hard_split_, AMY_BACSU_med_split_is_buried_0, AMY_BACSU_med_split_is_buried_1, AMY_BACSU_med_split_is_close_to_as_0, AMY_BACSU_med_split_is_close_to_as_1,
      # AMY_BACSU_med_split_is_connected_0, AMY_BACSU_med_split_is_connected_1, AMY_BACSU_med_split_is_secondary_0, AMY_BACSU_med_split_is_secondary_1, AMY_BACSU_random_split,
      # hydro_hard_split, hydro_med1_split, hydro_med2_split, hydro_random_split,
      # ired_ired_excludeT241mut_mutation_order_split, ired_ired_low_high_split, ired_ired_mutation_order_split, ired_ired_random_split,
      # PDZ3_low_vs_high, PDZ3_one_vs_rest, PDZ3_sampled, PDZ3_three_vs_rest, PDZ3_two_vs_rest,
50 changes: 50 additions & 0 deletions amlt/ridge.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
description: Results on FLIPv3.

target:
  service: sing
  name: msrresrchvc
  workspace_name: bio0

environment:
  image: amlt-sing/acpt-torch2.7.0-py3.10-cuda12.6-ubuntu22.04
  setup:
    - cd sequence_models
    - pip install -U --user -e .
    - cd ..
    - pip install biopython
    - pip install esm
    - pip install scikit-learn
    - pip install scipy

storage:
  amlt:
    storage_account_name: kevyaneastus2
    container_name: amlt

code:
  local_dir: src/

search:
  job_template:
    name: "ridge_{task}"
    # CPU-only SKU: ridge baselines need no GPU.
    sku: C3
    command:
      - python FLIPv3/baselines/linear_models.py /mnt/amlt/data/flip_data/datasets/ /mnt/amlt/flip_results/ {task}
    mpi: true
    submit_args:
      env:
        WANDB_BASE_URL: "https://microsoft-research.wandb.io"
        WANDB_API_KEY: "$WANDB_API_KEY"
        WANDB_ENTITY: "bio"
        NCCL_DEBUG: "INFO"
        NCCL_SOCKET_IFNAME: "bond0"
        # Env var values are strings; quote so the parser does not emit an int.
        NCCL_CUMEM_ENABLE: "0"
        NCCL_DEBUG_SUBSYS: "ALL"
        TORCH_DISTRIBUTED_DEBUG: "INFO"
        _AZUREML_SINGULARITY_JOB_UAI: "/subscriptions/7be94754-107d-43b8-b840-202dff0e7cae/resourceGroups/bio0/providers/Microsoft.ManagedIdentity/userAssignedIdentities/bio0-uai"
    tags: ["Project_Name:Biomedical_ML", "ProjectID:PRJ-0045-A32", "Experiment:Bio-0"]
  type: random
  max_trials: 25
  params:
    - name: task
      spec: discrete
      values: [
        trpb_trpB_four_to_three_split,
        trpb_trpB_no_position_overlap_split,
        trpb_trpB_one_vs_many_split,
        trpb_trpB_three_to_four_split,
        trpb_trpB_two_vs_many_split,
        NucB_easy,
        NucB_medium,
        NucB_hard,
      ]
139 changes: 139 additions & 0 deletions analysis/plot_baselines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
import os
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
import seaborn as sns
from scipy.stats import spearmanr, pearsonr
# Plot-wide styling: larger fonts on a white background.
_ = sns.set(font_scale=1.7)
_ = sns.set_style('white')
# Print DataFrames without truncating columns or rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', None)
# Local roots for FLIP results and the pruned data release; machine-specific.
flip_path = '/home/kevyan/results/flipv3/'
pruned_path = "/home/kevyan/data/flip_data_pruned/"

# One dataset per subdirectory of all_predictions/.
datasets = os.listdir(os.path.join(flip_path, "all_predictions"))


# Ridge-baseline learning curves (Spearman vs. number of training examples).
df = pd.read_csv(os.path.join(flip_path, "random_ridge.csv"))

# Build one long table of zero-shot Spearman correlations:
# one row per (dataset, zero-shot model) found in that dataset's *_zs.csv.
df_zs = pd.DataFrame()
# Maps the prediction-column name in the zs CSVs to a display name.
zs_model_dict = {
    'dayhoff': 'Dayhoff',
    'carp_640m_zs': 'CARP-640M',
    # 'carp_640m_masked_zs': 'CARP-640M',
    'esm2_650M_scores': 'ESM2-650M'
}
idx = 0
for dataset in datasets:
    predictions = pd.read_csv(os.path.join(flip_path, 'zs', dataset + '_zs.csv'))
    for model in zs_model_dict.keys():
        # Not every model was scored on every dataset; skip missing columns.
        if model in predictions.columns:
            df_zs.loc[idx, 'dataset'] = dataset
            df_zs.loc[idx, 'model'] = zs_model_dict[model]
            df_zs.loc[idx, 'Spearman'] = spearmanr(predictions['target'], predictions[model]).correlation
            idx += 1
# Best zero-shot score per dataset, used as a reference line in the plots.
best_zs = df_zs.groupby('dataset').agg({'Spearman': 'max'}).reset_index()

# Get number of training examples in each split.
dataset_path = '/home/kevyan/data/flip_data_pruned/'
df_sizes = pd.DataFrame(columns=['dataset', 'split', 'n_train', 'n_valid', 'n_test'])
for dataset in datasets:
    split_csvs = os.listdir(os.path.join(dataset_path, dataset, 'splits'))
    # Keep only the CSV files; splits/ may contain other artifacts.
    split_csvs = [c for c in split_csvs if c[-4:] == '.csv']
    for split_csv in split_csvs:
        df_data = pd.read_csv(os.path.join(dataset_path, dataset, 'splits', split_csv))
        # Each split CSV labels rows with a 'set' column (train/test) plus a
        # boolean 'validation' flag; validation rows are excluded from n_train.
        n_test = len(df_data[df_data['set'] == 'test'])
        n_train = len(df_data[(df_data['set'] == 'train') & (~df_data['validation'])])
        n_valid = len(df_data[df_data['validation']])
        # split name = filename without the '.csv' suffix
        df_sizes.loc[len(df_sizes)] = [dataset, split_csv[:-4], n_train, n_valid, n_test]
print(df_sizes)

# Supervised/zero-shot metrics for all baselines, one row per
# (dataset, split, model) combination.
df_metrics = pd.read_csv(os.path.join(flip_path, 'all_metrics.csv'))
# Maps raw model labels to display names. String keys match directly on the
# 'model' column; tuple keys are (model, pretrained-flag) for models that were
# run both with pretrained and naive (randomly initialized) weights.
model_dict = {
    "Ridge": "Ridge (one-hot)",
    "zsRidge": 'Ridge (one-hot + likelihoods)',
    "Dayhoff": "Dayhoff likelihood",
    "ESM2-650M": "ESM2-650M likelihood",
    "CARP-640M zero shot": "CARP-640M likelihood",
    ("CARP-640M", True): "CARP-640M supervised",
    ("CARP-640M", False): "CARP-640M naive supervised",
    ("ESMC-300M", True): "ESMC-300M supervised",
    ("ESMC-300M", False): "ESMC-300M naive supervised",
}

for i, row in df_metrics.iterrows():
    if row['model'] in model_dict:
        df_metrics.loc[i, 'model'] = model_dict[row['model']]
    else:
        # Fall back to the (model, pretrained) tuple key.
        # NOTE(review): a model absent from both key forms raises KeyError
        # here — confirm all_metrics.csv only contains the models above.
        df_metrics.loc[i, 'model'] = model_dict[(row['model'], row['pretrained'])]
df_metrics = df_metrics.fillna(0)
# Groups each split name into a coarse split *type* used for coloring.
split_dict = {
    'close_to_far': 'position',
    'far_to_close': 'position',
    'by_mutation': 'mutation',
    'by_position': 'position',
    'by_wt': 'wild type',
    'random': 'random',
    'one_to_many': 'number',
    'to_P06241': 'wild type',
    'to_P01053': 'wild type',
    'to_P0A9X9': 'wild type',
    'low_to_high': 'fitness',
    'three_to_many': 'number',
    'single_to_double': 'number',
    'two_to_many': 'number',
}
# Fixed palette; one color per split type (last color currently unused).
pal = [
    '#76B900',
    '#A77BB5',
    '#4E79A7',
    '#FF8A80',
    '#F28E2B',
    '#E15759'
]
split_hues = {"number": pal[0], "wild type": pal[1], "position": pal[2], "mutation": pal[3], "fitness": pal[4]}


# One figure per dataset: grey random-split ridge learning curves, the best
# zero-shot score as a horizontal reference line, and one point per
# non-random split (circle = one-hot ridge, x = one-hot + likelihoods ridge),
# colored by split type.
for i, dataset in enumerate(datasets):
    fig, ax = plt.subplots()
    _ = sns.lineplot(data=df[df['dataset'] == dataset], x='n_train', y='Spearman', color='grey', style='model',
                     markers=True, ax=ax, ms=20, alpha=0.8)
    _ = ax.axhline(y=best_zs[best_zs['dataset'] == dataset]['Spearman'].values[0], color='grey', linestyle='-',
                   label='best zero-shot likelihood score')
    _ = ax.semilogx()
    _ = ax.set_xlabel('Number of training examples')
    _ = ax.set_ylim([-0.35, 1])
    split_csvs = os.listdir(os.path.join(dataset_path, dataset, 'splits'))
    split_csvs = [c for c in split_csvs if c[-4:] == '.csv']
    for split_csv in split_csvs:
        # Random splits are already covered by the grey learning curves.
        if 'random' in split_csv:
            continue
        split_name = split_csv[:-4]
        # Amylase's 'by_position' split is actually a mutation split;
        # remap so it picks the right color.
        if dataset == 'Amylase' and split_name == 'by_position':
            split_name = 'by_mutation'
        color = split_hues[split_dict[split_name]]
        x = df_sizes[(df_sizes['dataset'] == dataset) & (df_sizes['split'] == split_csv[:-4])]['n_train']
        y1 = df_metrics[(df_metrics['dataset'] == dataset) & (df_metrics['split'] == split_csv[:-4]) & (df_metrics['model'] == 'Ridge (one-hot)')]['Spearman'].values[0]
        _ = ax.plot(x, y1, 'o', color=color, ms=20, alpha=0.7, mew=0)
        y2 = df_metrics[(df_metrics['dataset'] == dataset) & (df_metrics['split'] == split_csv[:-4]) & (df_metrics['model'] == 'Ridge (one-hot + likelihoods)')]['Spearman'].values[0]
        _ = ax.plot(x, y2, 'x', color=color, ms=20, alpha=1.0, mew=4)
    _ = ax.set_title(dataset)
    # Build the legend so handles exist, but keep it off the per-dataset plot;
    # it is exported separately below.
    legend = ax.legend(title='Model')
    legend.remove()
    fig.savefig(os.path.join(flip_path, "plots", "random_ridge_%s.pdf" %dataset), dpi=300, bbox_inches='tight')

# NOTE(review): indentation was lost in transit; the legend export below is
# reconstructed as running once after the loop (it reuses the last axes'
# handles) — confirm against the original file.
# Export the model legend as its own PDF, cropped to the legend's bbox.
handles, labels = ax.get_legend_handles_labels()
fig, ax = plt.subplots()
legend = ax.legend(handles, labels, title='Model')
bbox = legend.get_window_extent().transformed(fig.dpi_scale_trans.inverted())
fig.savefig(os.path.join(flip_path, 'plots', 'random_ridge_models.pdf'), dpi=300, bbox_inches=bbox)
# Export a second standalone legend mapping split types to colors.
elements = [
    Patch(facecolor='gray', edgecolor=None, label='random'),
]
elements += [Patch(facecolor=split_hues[s], edgecolor=None, label=s, alpha=0.7) for s in split_hues]
legend = ax.legend(handles=elements, title='Split type')
bbox = legend.get_window_extent().transformed(fig.dpi_scale_trans.inverted())
fig.savefig(os.path.join(flip_path, 'plots', 'random_ridge_split_types.pdf'), dpi=300, bbox_inches=bbox)
Loading