CCParams/ExtractDimerACoords.py at master · biocryst/CCParams · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import os
from Bio.SeqUtils import seq1
import warnings
import numpy as np
import cPickle
from Bio.PDB.Vector import rotaxis
from pyRMSD import RMSDCalculator
from Bio import PDB

def get_gly_cb_coords(residue):
    try:
        n_v=residue["N"].get_vector()
        c_v=residue["C"].get_vector()
        ca_v=residue["CA"].get_vector()
    except:
        return None
    n_v=n_v-ca_v
    c_v=c_v-ca_v
    rot=rotaxis(-np.pi*120.0/180.0, c_v)
    cb_at_origin_v=n_v.left_multiply(rot)
    cb_v=cb_at_origin_v+ca_v
    return cb_v.get_array()

def split_cc(model, selection, n_res_split):
    (pdb_id, chains, start_res, end_res, sequences, registers) = selection
    chain1 = model[chains[0]]
    chain2 = model[chains[1]]

    n_res = end_res[0] - start_res[0] + 1
    backbone_ids = ['N', 'CA', 'C', 'O', 'CB']
    n_atoms_mono = n_res * len(backbone_ids)
    coords_all = np.zeros((n_atoms_mono * 2, 3))
    coord_ind = 0
    for res1_id, res2_id, aa1, aa2 in zip(range(start_res[0], end_res[0] + 1), range(start_res[1], end_res[1] + 1), sequences[0], sequences[1]):
        try:
            res1 = chain1[res1_id]
        except:
            res1 = chain1[('H_MSE', res1_id, ' ')]
        try:
            res2 = chain2[res2_id]
        except:
            res2 = chain2[('H_MSE', res2_id, ' ')]
        if aa1 != 'X':
            assert aa1 == seq1(res1.resname)
        if aa2 != 'X':
            assert aa2 == seq1(res2.resname)
        for backbone_id in backbone_ids:
            try:
                coords_all[coord_ind, :] = res1[backbone_id].coord
            except:
                # CB is not available for GLY, use estimated coordinates
                assert backbone_id == 'CB'
                coords_all[coord_ind, :] = get_gly_cb_coords(res1)
            try:
                coords_all[n_atoms_mono + coord_ind, :] = res2[backbone_id].coord
            except:
                assert backbone_id == 'CB'
                coords_all[n_atoms_mono + coord_ind, :] = get_gly_cb_coords(res2)
            coord_ind += 1

    cc_fragments = []
    n_atoms_mono_split = n_res_split * len(backbone_ids)

    ref_reg = 'abcdefgabcdefga' # we are interested only in fragments with this pattern
    for start_ind in range(0, n_res - n_res_split + 1):
        reg1 = registers[0][start_ind:start_ind+n_res_split]
        reg2 = registers[1][start_ind:start_ind+n_res_split]
        if reg1 != ref_reg or reg2 != ref_reg:
            continue
        start_ind_atom = start_ind * len(backbone_ids)
        coords1 = coords_all[start_ind_atom:start_ind_atom + n_atoms_mono_split, :]
        coords2 = coords_all[n_atoms_mono + start_ind_atom:n_atoms_mono + start_ind_atom + n_atoms_mono_split, :]
        mean1 = np.mean(coords1, 0)
        mean2 = np.mean(coords2, 0)
        dist = np.linalg.norm(mean1 - mean2)
        ca_dists = np.linalg.norm(coords1[1::5]-coords2[1::5],axis=1)
        if dist > 18 or np.max(ca_dists) > 18: # too far apart, probably not a coiled coil
            print "wrong CC segment in", selection
            continue

        coords = np.append(coords1, coords2, axis=0)
        cc_fragments.append(coords)

    return cc_fragments


f = open('cc_dataset_mmol_all').read().splitlines()

records = [line.split() for line in f]
ccoils = []

for c1,c2 in zip(records[0::2], records[1::2]):
    assert c1[0] == c2[0]
    filename = c1[0]
    chains = (c1[1],c2[1])
    if len(c1[4]) != len(c2[4]):
        continue
    sequence1 = c1[4]
    sequence2 = c2[4]
    register1 = c1[5]
    register2 = c2[5]
    start_res = (int(c1[2]),int(c2[2]))
    end_res = (int(c1[3]),int(c2[3]))
    sequence = (sequence1,sequence2)
    register = (register1,register2)

    ccoils.append((filename, chains, start_res, end_res, sequence,register))
    ccoils.append((filename, chains[::-1], start_res[::-1], end_res[::-1], sequence[::-1],register[::-1]))

print "Number of coiled coil records:",len(ccoils)

pdb_path = "MMOL"
warnings.filterwarnings("ignore")

n_window = 15
cc_coords_all = []

for item in ccoils:
    (filename, chains, start_res, end_res, sequence,register) = item
    full_filename = os.path.join(pdb_path,filename)
    if not os.path.exists(full_filename):
        print 'missing ', full_filename
        continue
    structure = PDB.PDBParser().get_structure(filename, full_filename)
    model = structure[0]
    try:
        # extract poly-ala CC coords and find 15-residue fragments which start from the a-position
        coiled_coils = split_cc(model, item, n_window)
    except:
        print 'problem with', item
        continue
    for cc in coiled_coils:
        cc_coords_all.append(cc)

cc_coords_all = np.array(cc_coords_all)
cPickle.dump(cc_coords_all, open('dimer_a_all.pkl', "wb"))

print "Total number of fragments collected:", cc_coords_all.shape[0]

threshold = 0.2

# go through the fragments in reverse chronological order
# and filter out the structurally redundant ones

# the order probably doesn't matter, but the idea was that
# later models would be more accurate, so we bias the selection accordingly
cc_unique = [cc_coords_all[-1]]
global_ind = 0
unique_ind = 0
for cc in reversed(cc_coords_all[:-1]):
    cc_test = [cc]+cc_unique
    cc_test = np.array(cc_test)
    calculator = RMSDCalculator.RMSDCalculator("QCP_OMP_CALCULATOR", cc_test)
    dist = calculator.oneVsFollowing(0)
    global_ind+=1
    if np.min(dist) > threshold:
        cc_unique.append(cc)
        unique_ind+=1

cc_unique = np.array(cc_unique)

print "Number of unique fragments:",cc_unique.shape[0]

# get mean CC structure
calculator = RMSDCalculator.RMSDCalculator("QCP_OMP_CALCULATOR", cc_unique)
calculator.iterativeSuperposition()
mean_cc = np.mean(cc_unique,0)

calculator = RMSDCalculator.RMSDCalculator("QCP_OMP_CALCULATOR", np.append([mean_cc], cc_unique,axis=0))
dist = calculator.oneVsFollowing(0)

# standard deviation estimated through median absolute deviation
med_dist = np.median(dist)
mad = np.median(np.abs(dist-med_dist))
std_est = 1.4856*mad

# filter outliers which are too far from the mean structure
cutoff = np.mean(dist)+3*std_est
idx = dist < cutoff
cc_filt = cc_unique[idx]
cPickle.dump(cc_filt, open('dimer_a_unique_0.2.pkl', "wb"))

print "Number of unique fragments excluding outliers:",cc_filt.shape[0]