-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathremove-duplicate-records.py
More file actions
32 lines (25 loc) · 922 Bytes
/
remove-duplicate-records.py
File metadata and controls
32 lines (25 loc) · 922 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import glob
import os
import re

from Bio import SeqIO
## Simple script to remove duplicate records (based on record.id) from every FASTA (*.fas) file in a directory, using Biopython's SeqIO.
# Prefix prepended to "*.fas" when searching for input files. It is joined by
# plain string concatenation, so a directory must end with a path separator.
fasta_dir = ''
# Prefix prepended to each output file name (same concatenation rule).
out_dir = ''


def unique_records(records):
    """Return the first record seen for every distinct ``record.id``.

    ``records`` is any iterable of objects exposing an ``id`` attribute.
    Input order is preserved.  Each later record whose id was already seen
    is reported on stdout and dropped.
    """
    seen_ids = set()  # set membership is O(1); the ids only need lookup
    kept = []
    for record in records:
        if record.id in seen_ids:
            print("%s is already present. Discarding." % record.id)
            continue
        seen_ids.add(record.id)
        kept.append(record)
    return kept


def remove_duplicate_records(fasta_path, out_path):
    """Write the de-duplicated records of ``fasta_path`` to ``out_path``.

    Both files are handled in FASTA format through ``SeqIO``.
    """
    print("Removing duplicate records from %s..." % fasta_path)
    # Read everything and close the input BEFORE opening the output: with the
    # default empty fasta_dir/out_dir the output path is the input path, and
    # opening it for writing truncates it.
    with open(fasta_path, 'r') as handle:
        good_recs = unique_records(SeqIO.parse(handle, 'fasta'))
    with open(out_path, 'w') as out_handle:
        SeqIO.write(good_recs, out_handle, 'fasta')


def main():
    """De-duplicate every ``*.fas`` file matched under ``fasta_dir``."""
    for fasta_path in glob.glob(fasta_dir + "*.fas"):
        # Take the stem from the path itself instead of a regex built from
        # fasta_dir, which mis-parsed names and broke on regex metacharacters.
        file_name = os.path.splitext(os.path.basename(fasta_path))[0]
        remove_duplicate_records(fasta_path, out_dir + file_name + ".fas")


if __name__ == "__main__":
    main()