#!/usr/bin/env python3
from Bio import BiopythonWarning
from Bio.PDB.MMCIF2Dict import MMCIF2Dict
from Bio.PDB import MMCIFParser
import numpy as np
import pandas as pd
import os
import gzip
import sys
import csv
import warnings
import argparse
from copy import deepcopy
# Suppress warnings
warnings.simplefilter('ignore', BiopythonWarning)
parser = argparse.ArgumentParser(description="Prepare templates.csv file similar to labels.csv but with MMseqs2-identified templates")
parser.add_argument('-s', '--sequences_file',
default="validation_sequences.csv",
help='CSV file with columns including "target_id" and "sequence". Default is `validation_sequences.csv`.')
parser.add_argument('--mmseqs_results_file',
default= '',
help='MMseqs output with query,target,evalue,qstart,qend,tstart,tend,qaln,taln.')
parser.add_argument('--outfile',
default='',
help='Name of the output CSV file. Default is derived from --dataset_name (e.g. `test.templates.csv`).')
parser.add_argument('--dataset_name','--name',
default='',
help='Full dataset name, used as a tag for the output CSVs.')
parser.add_argument('-o','--outdir',
default='./',
help='Where to save output CSVs (Default ./)' )
parser.add_argument('--max_templates',
default=40, type=int,
help='Maximum number of templates per target. Default is 40 (used to prepare solution files).')
parser.add_argument('--cif_dir',
default='',
help='Directory holding cif.gz files, pdb_release_dates_NA.csv, and pdb_seqres_NA.fasta')
parser.add_argument('--skip_temporal_cutoff',
action='store_true',
help='Disable tests of temporal cutoff')
parser.add_argument('--start_idx',
default=0, type=int,
help='Start index (1,2,...) of test_sequences to work on, for parallelization. Default: 0 (do all sequences).' )
parser.add_argument('--end_idx',
default=0, type=int,
help='End index (1,2,...) of test_sequences to work on, for parallelization. Default: 0 (do all sequences).' )
parser.add_argument('--id_map',
default='',
help='CSV file with fields `orig` and `new` for mapping original target IDs to new target IDs. Default is `` (no mapping).')
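# Example invocation (a sketch; the file names below are hypothetical):
#   ./create_templates_csv.py -s validation_sequences.csv \
#       --mmseqs_results_file mmseqs_hits.m8 \
#       --cif_dir PDB_RNA --dataset_name validation --outdir out/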
def clean_res_name( res_name ):
if res_name in ['A', 'C', 'G', 'U']:
return res_name
else: # can be modified residue with 3-letter name.
return 'X'
# All 27 heavy-atom names of the standard ribonucleotides (union over A/C/G/U).
ALL_ATOMS=["P","OP1","OP2","O5'","O3'","C1'","C2'","O2'","C3'","C4'","O4'","C5'","N1","C2","O2","N3","C4","N4","C5","C6","O4","N9","N7","C8","N6","N2","O6"]
C1PRIME_KEY="C1'"
def extract_title_release_date( cif_path ):
if cif_path.endswith('.gz'):
with gzip.open(cif_path, 'rt') as cif_file:
mmcif_dict = MMCIF2Dict(cif_file)
else:
mmcif_dict = MMCIF2Dict(cif_path)
possible_title_fields = [
'_struct.title',
'_entry.title',
'_struct_keywords.pdbx_keywords'
]
pdb_title = None
for field in possible_title_fields:
if field in mmcif_dict:
pdb_title = mmcif_dict[field]
if isinstance(pdb_title, list):
pdb_title = ' '.join(pdb_title)
break
possible_date_fields = [
'_pdbx_database_status.initial_release_date',
'_pdbx_database_status.recvd_initial_deposition_date',
'_database_PDB_rev.date'
]
release_date = None
for field in possible_date_fields:
if field in mmcif_dict:
release_date = mmcif_dict[field]
if isinstance(release_date, list):
release_date = release_date[0] # Take the first date if it's a list
break
return pdb_title, release_date
def extract_rna_sequence(cif_path,chain_id):
if cif_path.endswith('.gz'):
with gzip.open(cif_path, 'rt') as cif_file:
mmcif_dict = MMCIF2Dict(cif_file)
else:
mmcif_dict = MMCIF2Dict(cif_path)
    # Extract _pdbx_poly_seq_scheme information
    strand_id = mmcif_dict.get('_pdbx_poly_seq_scheme.pdb_strand_id',[])
    mon_id = mmcif_dict.get('_pdbx_poly_seq_scheme.mon_id',[])
    pdb_mon_id = mmcif_dict.get('_pdbx_poly_seq_scheme.pdb_mon_id',[])
    pdb_seq_num = mmcif_dict.get('_pdbx_poly_seq_scheme.pdb_seq_num',[])
    auth_seq_num = mmcif_dict.get('_pdbx_poly_seq_scheme.auth_seq_num',[])
    pdb_ins_code = mmcif_dict.get('_pdbx_poly_seq_scheme.pdb_ins_code',[])
    full_sequence = ''
pdb_chain_sequence = ''
pdb_chain_seq_nums = []
pdb_chain_ins_codes = []
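    # Note: in _pdbx_poly_seq_scheme, mon_id carries the full polymer sequence,
    # while pdb_mon_id is typically '?' for residues missing from the coordinates;
    # clean_res_name maps such entries to 'X' in pdb_chain_sequence.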
for (strand,mon,pdb_mon,pdb_num,auth_num,ins_code) in zip(strand_id,mon_id,pdb_mon_id,pdb_seq_num,auth_seq_num,pdb_ins_code):
if strand==chain_id:
full_sequence += clean_res_name( mon )
pdb_chain_sequence += clean_res_name( pdb_mon )
# note use of auth_seq_num instead of pdb_seq_num since that is what Biopython uses for Residue.id
pdb_chain_seq_nums.append( auth_num )
pdb_chain_ins_codes.append( ins_code )
return full_sequence,pdb_chain_sequence,pdb_chain_seq_nums,pdb_chain_ins_codes
def get_coord_labels(cif_path, chain_id, chain_sequence, chain_seq_nums, chain_ins_codes):
"""
Extract coordinates for an RNA chain based on a reference sequence alignment.
This function uses Biopython to parse a CIF file, finds the specified chain,
    and extracts coordinates for RNA residues that actually have a C1' atom (NaNs otherwise).
Parameters:
cif_path (str): Path to the CIF file.
chain_id (str): Chain identifier in the CIF file.
        chain_sequence (str): chain sequence derived from the _pdbx_poly_seq_scheme fields, used for output and as a sanity check.
chain_seq_nums (list of strings): numbers of residues in PDB (auth_seq_num)
chain_ins_codes (list of strings): ins_codes in PDB (needed to ensure unique lookup!)
Returns:
list of tuples: Each tuple contains (resname, resid, xyz, pdb_seq_num), where:
resname: Residue name (A, C, G, or U) from the reference sequence
resid: Residue ID (1, 2, 3, ...) based on position in reference sequence
            xyz: dictionary of xyz coords for all 27 heavy-atom names:
                 P,OP1,OP2,O5',O3',C1',C2',O2',C3',C4',O4',C5',N1,C2,O2,N3,C4,N4,C5,C6,O4,N9,N7,C8,N6,N2,O6
pdb_info: (author seq num, ins code, resname)
The length of the returned list is equal to the length of input chain_sequence.
"""
    # Parse the CIF file (local name avoids shadowing the module-level argparse parser)
    cif_parser = MMCIFParser()
    if cif_path.endswith('.gz'):
        with gzip.open(cif_path, 'rt') as gz_file:
            structure = cif_parser.get_structure('RNA', gz_file)
    else:
        structure = cif_parser.get_structure('RNA', cif_path)
# Get the specified chain
chain = structure[0][chain_id]
# getting residues out of chain is complex -- easier to get a list ahead of time.
residues = {}
for residue in chain:
residues[ (residue.id[1],residue.id[2]) ] = residue
# Initialize the result list
result = []
    assert len(chain_sequence) == len(chain_seq_nums)
    for i, chain_res in enumerate(chain_sequence):
        chain_resid = i + 1
        # auth_seq_num can be '?' (unmodeled) or negative, so fall back to 0
        # when it does not parse as an integer.
        seq_num_str = chain_seq_nums[i]
        chain_seq_num = int(seq_num_str) if seq_num_str.lstrip('-').isdigit() else 0
        chain_ins_code = chain_ins_codes[i].replace('.', ' ')
        res_id = (chain_seq_num, chain_ins_code)
xyz = { atom:(np.nan,np.nan,np.nan) for atom in ALL_ATOMS}
res_info = (chain_res, chain_resid, xyz, (-1e18,' ','') ) # blank
if res_id in residues:
residue = residues[ res_id ]
if 'C1\'' in residue:
resname = residue.get_resname()
if chain_res != clean_res_name( resname ):
print( f'Warning! mismatch residue at {chain_resid}: target {chain_res} pdb {residue.get_resname()} chain_seq_num {chain_seq_num} residue.id {residue.id[1]}' )
for atom in ALL_ATOMS:
if atom in residue:
xyz[atom] = residue[atom].coord
res_info = (chain_res, chain_resid, xyz, (res_id[0], res_id[1], resname) )
result.append(res_info)
return result
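# A single element of the list returned by get_coord_labels, with illustrative
# (hypothetical) values:
#   ('G', 1, {'P': (nan, nan, nan), ..., "C1'": (12.3, 4.5, -6.7), ...}, (101, ' ', 'G'))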
def get_target_coord_data( chain_coord_data, alignment ):
'''
Inputs
chain_coord_data = coordinates and other info, read out from PDB file for the chain
alignment = two strings that map chain to target with gaps as '-'
Output
target_coord_data = coordinates for target sequence, with gaps filled with nan.
'''
target_coord_data = []
chain_pos = -1
target_pos = -1
xyz_blank = { atom:(np.nan,np.nan,np.nan) for atom in ALL_ATOMS}
for (chain_res,target_res) in zip(alignment[0],alignment[1]):
if chain_res != '-':
chain_pos += 1
if target_res != '-':
target_pos += 1
if chain_res != '-':
coord_data=chain_coord_data[ chain_pos ]
target_coord_data.append( (target_res, target_pos+1,coord_data[2],coord_data[3]) )
else:
target_coord_data.append( (target_res, target_pos+1,xyz_blank,(-1e18,' ','') ) )
return target_coord_data
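# Worked example (a hypothetical toy case) for get_target_coord_data:
#   chain string: 'GG-AC'   target string: '-GUAC'
#   col 1: chain 'G' vs target '-'  -> chain_pos advances, nothing emitted
#   col 2: chain 'G' vs target 'G'  -> target residue 1 gets coords of chain residue 2
#   col 3: chain '-' vs target 'U'  -> target residue 2 gets all-NaN coords
#   cols 4-5: target residues 3-4 get coords of chain residues 3-4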
def is_before_or_on(d1, d2):
date1 = pd.to_datetime(d1)
date2 = pd.to_datetime(d2)
return date1 <= date2
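# e.g. is_before_or_on('2022-05-27', '2022-05-27') -> True, so a template
# released exactly on the cutoff date is treated as not usable below.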
def read_id_map(id_map_file):
if len(id_map_file)==0: return None
id_map = {}
try:
with open(id_map_file, newline='') as f:
reader = csv.DictReader(f)
if 'orig' not in reader.fieldnames or 'new' not in reader.fieldnames:
print("Warning: ID map file does not contain the fields 'orig' and 'new'. Using original IDs instead.")
return id_map
for row in reader:
id_map[row['orig']] = row['new']
except FileNotFoundError:
print(f"Warning: ID map file {id_map_file} not found. Using original IDs instead.", file=sys.stderr)
except Exception as exc:
print(f"Error reading {id_map_file}: {exc}", file=sys.stderr)
return id_map
def read_release_dates( release_data_file ):
release_dates = {}
# must have format Entry ID, Release Date
with open(release_data_file, newline='') as f:
reader = csv.DictReader(f)
for row in reader:
release_dates[row['Entry ID']] = row['Release Date']
return release_dates
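# Expected layout of pdb_release_dates_NA.csv (header from the DictReader keys
# above; the row shown is illustrative):
#   Entry ID,Release Date
#   1EHZ,2000-08-16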
def get_template_labels( sequences_file, mmseqs_results_file, skip_temporal_cutoff,
MAX_TEMPLATES, cif_dir, id_map_file='', start_idx=0, end_idx=0 ):
# Prepare to collect output data
output_labels = []
output_allatom_labels = []
if len(cif_dir) == 0:
dir_name = os.path.dirname( os.path.abspath( sys.argv[0] ) )
cif_dir = dir_name+'/PDB_RNA'
# Read the FASTA file
df = pd.read_csv( sequences_file )
targets = df['target_id'].to_list()
sequences = df['sequence'].to_list()
temporal_cutoffs = df['temporal_cutoff'].to_list()
    aln_lines = []
    # query,target,evalue,qstart,qend,tstart,tend,qaln,taln
    with open(mmseqs_results_file) as f:
        for line in f:
            aln_lines.append(line.strip().split())
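    # The nine fields above match an MMseqs2 convertalis output produced with
    # --format-output "query,target,evalue,qstart,qend,tstart,tend,qaln,taln"
    # (assumption inferred from the --mmseqs_results_file help text).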
id_map = read_id_map( id_map_file )
release_dates = read_release_dates( cif_dir + '/pdb_release_dates_NA.csv' )
if start_idx == 0 and end_idx == 0: # do all targets by default
start_idx = 1
end_idx = len(targets)
num_targets = 0
count = 0
for target,sequence,temporal_cutoff in zip(targets,sequences,temporal_cutoffs):
count += 1
if (count < start_idx) or (count > end_idx): continue
# look for alignments and fill out C1' templates
templates = []
template_coord_data = []
for aln_line in aln_lines:
            if len(aln_line) != 9: continue # some kind of overflow in some alignments?
            query,template,evalue,qstart,qend,tstart,tend,qaln,taln = aln_line
if query != target: continue
if int(qend)<int(qstart): continue # aligned to reverse complement!
pdb_id,chain_id = template.split('_')
# need to do alignment
cif_path = os.path.join(cif_dir, f'{pdb_id.upper()}.cif.gz')
if not os.path.isfile( cif_path ):
cif_path = os.path.join(cif_dir, f'{pdb_id.lower()}.cif') # kaggle style
if not os.path.isfile( cif_path ): continue # occasional alignment to DNA, ignore!
release_date = release_dates[pdb_id.upper()] # pulled from PDB server
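            # Skip templates released on or after the target's temporal cutoff.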
if not skip_temporal_cutoff and is_before_or_on(temporal_cutoff,release_date): continue
# these release dates in the CIF files can be buggy!
title,release_date_unreliable = extract_title_release_date( cif_path )
print('\n',target,temporal_cutoff," ",template)
if title: print(f"PDB Title: {title}")
if release_date: print(f"PDB Release Date: {release_date}")
# sometimes there is a mismatch between PDB's fasta files and what's actually stored in coordinates,
# so best to get the actual residue numbers for the chain
chain_full_sequence,chain_sequence,chain_seq_nums,chain_ins_codes = extract_rna_sequence(cif_path,chain_id)
# get 3d data
alignment = []
qstart=int(qstart)
qend=int(qend)
tstart=int(tstart)
tend=int(tend)
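            # Build a full-length alignment over the whole target sequence:
            #   alignment[0] (query):    query prefix + gaps opposite the template's
            #                            unaligned prefix + qaln + query suffix
            #   alignment[1] (template): gaps opposite the query prefix + 'X'
            #                            placeholders for the template's unaligned
            #                            leading residues (keeps chain_pos in register
            #                            in get_target_coord_data) + taln + trailing gaps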
alignment.append( sequence[:(qstart-1)] + '-'*(tstart-1) + qaln + sequence[qend:] )
alignment.append( '-'*(qstart-1) + 'X'*(tstart-1) + taln + '-'*(len(sequence)-qend) )
print( alignment[0],'query' )
print( alignment[1],'template' )
chain_coord_data = get_coord_labels( cif_path, chain_id, chain_sequence, chain_seq_nums, chain_ins_codes )
coord_data = get_target_coord_data( chain_coord_data, (alignment[1],alignment[0]) )
# mismatch in FASTA sequence and the polyx info in the CIF file
if len(coord_data) != len(sequence):
print( 'WARNING! len(coord_data) != len(sequence)', 'len coord_data', len(coord_data), 'len sequence', len(sequence), 'qstart',qstart,'len qaln',len(qaln),'qend',qend)
continue
templates.append( template )
template_coord_data.append( coord_data )
if len(templates) >= MAX_TEMPLATES: break
print( "Found", len(templates), "templates for", target,'\n' )
        mapped_target = id_map.get(target, target) if id_map else target
for i in range(len(sequence)):
output_label = {
"ID": f'{mapped_target}_{i+1}',
"resname": sequence[i],
"resid": i+1,
}
output_allatom_label = deepcopy(output_label)
# output templates, C1'
for n in range(len(templates)):
template = templates[n]
res,resid,xyz,pdb_info = template_coord_data[n][i]
assert( resid == i+1 )
output_label[ f"x_{n+1}" ] = xyz[C1PRIME_KEY][0]
output_label[ f"y_{n+1}" ] = xyz[C1PRIME_KEY][1]
output_label[ f"z_{n+1}" ] = xyz[C1PRIME_KEY][2]
for atom in ALL_ATOMS:
output_allatom_label.update( {
f"{atom}_x_{n+1}": xyz[atom][0],
f"{atom}_y_{n+1}": xyz[atom][1],
f"{atom}_z_{n+1}": xyz[atom][2]
})
output_allatom_label.update( {f"pdb_id_{n+1}": template,f"pdb_seq_num_{n+1}": int(pdb_info[0]), f"pdb_ins_code_{n+1}": pdb_info[1], f"pdb_resname_{n+1}": pdb_info[2]} )
# pad with blank models
for n in range(len(templates),MAX_TEMPLATES):
output_label[ f"x_{n+1}" ] = np.nan
output_label[ f"y_{n+1}" ] = np.nan
output_label[ f"z_{n+1}" ] = np.nan
for atom in ALL_ATOMS:
output_allatom_label.update( {
f"{atom}_x_{n+1}": np.nan,
f"{atom}_y_{n+1}": np.nan,
f"{atom}_z_{n+1}": np.nan
})
output_allatom_label.update( {f"pdb_id_{n+1}": "",f"pdb_seq_num_{n+1}": np.nan, f"pdb_ins_code_{n+1}": '', f"pdb_resname_{n+1}": ''} )
output_labels.append( output_label )
output_allatom_labels.append( output_allatom_label)
num_targets += 1
# if num_targets > 1: break # for debug!
print(f'Completed {num_targets} targets\n')
return output_labels, output_allatom_labels, targets
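# Column layout of the CSVs assembled above (a sketch derived from the keys
# written into output_label / output_allatom_label):
#   templates CSV:  ID, resname, resid, x_1, y_1, z_1, ..., x_N, y_N, z_N  (N = MAX_TEMPLATES)
#   all-atom CSV:   additionally <atom>_x_n / <atom>_y_n / <atom>_z_n for each of the
#                   27 heavy atoms, plus pdb_id_n, pdb_seq_num_n, pdb_ins_code_n,
#                   pdb_resname_n bookkeeping columns for each template n.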
# Create a DataFrame and write to CSV
def output_csv( output_data, outfile ):
df = pd.DataFrame(output_data)
df.to_csv(outfile, index=False)
print(f"Output written to {outfile}")
def output_template_labels_to_csv( output_labels, output_allatom_labels, targets, outdir='', outfile='', dataset_name='', start_idx=0, end_idx=0 ):
    assert not (len(outfile) > 0 and len(dataset_name) > 0), 'specify either --outfile or --dataset_name, not both'
    if len(outdir) == 0: outdir = './'
    os.makedirs(outdir, exist_ok=True)
    if outdir[-1] != '/': outdir += '/'
split_tag = ''
if start_idx > 0:
num_digits = len(str(len(targets)))
split_tag = f'.{start_idx:0{num_digits}d}_{end_idx:0{num_digits}d}'
if len( outfile ) == 0:
if len( dataset_name) == 0: dataset_name = 'test'
outfile = f"{outdir}{dataset_name}.templates{split_tag}.csv"
outfile_allatom = f"{outdir}{dataset_name}.allatom_templates{split_tag}.csv"
else:
outfile = f"{outdir}/{outfile}"
        if 'labels.csv' in outfile: outfile_allatom = outfile.replace('labels.csv','allatom.csv')
elif outfile.endswith('.csv'): outfile_allatom = outfile.replace('.csv','.allatom.csv')
else: outfile_allatom = outfile + '.allatom.csv'
output_csv( output_labels, outfile )
output_csv( output_allatom_labels, outfile_allatom )
if __name__ == "__main__":
args = parser.parse_args()
sequences_file = args.sequences_file
mmseqs_results_file = args.mmseqs_results_file
MAX_TEMPLATES = args.max_templates
cif_dir = args.cif_dir
id_map_file = args.id_map
start_idx = args.start_idx
end_idx = args.end_idx
skip_temporal_cutoff = args.skip_temporal_cutoff
output_labels,output_allatom_labels,targets = get_template_labels( sequences_file, mmseqs_results_file, skip_temporal_cutoff,
MAX_TEMPLATES, cif_dir, id_map_file, start_idx, end_idx )
outdir = args.outdir
outfile = args.outfile
dataset_name = args.dataset_name
output_template_labels_to_csv( output_labels, output_allatom_labels, targets, outdir, outfile, dataset_name, start_idx, end_idx )