forked from jccastrog/pa_genomics
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconvertFormat.py
More file actions
executable file
·168 lines (134 loc) · 4.73 KB
/
convertFormat.py
File metadata and controls
executable file
·168 lines (134 loc) · 4.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Convert biosequences from one format to another.
Usage is: convbioseq [options] FORMAT INFILES ...
Options:
--version show program's version number and exit
-h, --help show this help message and exit
-i FORMAT, --input-format=FORMAT
The format of the input biosequence files. If not
supplied, this will be inferred from the extension of
the files.
-e EXTENSION, --output-extension=EXTENSION
The extension of the output biosequence files. If not
supplied, this will be inferred from the output
format.
And so for example:
convbioseq stockholm foo.genbank
convbioseq -e fast fasta foo.aln
convbioseq -i fasta phylip foo.txt
"""
# TODO: all the format crap should be in one table
__docformat__ = 'restructuredtext en'
__author__ = 'Paul-Michael Agapow <agapow@bbsrc.ac.uk>'
### IMPORTS ###
from os import path
from optparse import OptionParser
from exceptions import BaseException
from Bio import SeqIO
try:
from bioscripts.convert import __version__
except:
__version__ = 'unknown'
### CONSTANTS & DEFINES ###
# SeqIO formats that are not handled here yet. Nothing is known to be wrong
# with them; they simply still need to be checked:
#   ace embl fastq fastq-solex ig pir swiss tab qual
# NOTE(review): 'tab' and 'qual' are listed above yet also appear in the
# tables below -- confirm which of the two is current.

# Default file extension to give each output format.
OUT_EXT_MAP = dict([
    ('clustal', 'aln'),
    ('fasta', 'fasta'),
    ('genbank', 'gb'),
    ('nexus', 'nexus'),
    ('phd', 'phd'),
    ('phylip', 'phy'),
    ('stockholm', 'sth'),
    ('qual', 'qual'),
    ('tab', 'tab'),
])

# Alphabetically sorted list of the known format names.
KNOWN_FMTS = sorted(OUT_EXT_MAP)

# Format to derive from the extension of an input file.
IN_EXT_MAP = dict([
    ('aln', 'clustal'),
    ('sth', 'stockholm'),
    ('gb', 'genbank'),
    ('nxs', 'nexus'),
    ('phy', 'phylip'),
    ('tab', 'tab'),
    ('qual', 'qual'),
    ('phd', 'phd'),
])
# A format's own name is always accepted as an extension for that format.
IN_EXT_MAP.update((fmt_name, fmt_name) for fmt_name in KNOWN_FMTS)

# Read by the script's top-level error handler: when true, a caught error is
# re-raised (giving a full traceback) instead of just being printed.
_DEV_MODE = True
### IMPLEMENTATION ###
def parse_args():
    """
    Parse the command line into an output format, input paths and options.

    Returns a tuple ``(out_fmt, infiles, options)``:

    * ``out_fmt`` -- the first positional argument, lower-cased; it is
      guaranteed to be one of ``KNOWN_FMTS``.
    * ``infiles`` -- the remaining positional arguments (never empty).
    * ``options`` -- the optparse values object, carrying ``input_format``
      and ``output_extension`` (each ``None`` when not supplied).

    Every usage problem (no format given, unknown format, no input files) is
    reported through ``OptionParser.error``, which prints the usage message
    and the error to stderr and exits with status 2.
    """
    # Construct the option parser.
    usage = '%prog [options] FORMAT INFILES ...'
    version = "version %s" % __version__
    # items() (rather than the Python 2-only iteritems()) works on both major
    # versions; sorting keeps the help text identical from run to run.
    epilog = 'FORMAT must be one of %s.\n' % ', '.join(
        ["'%s'" % x for x in KNOWN_FMTS])
    epilog += 'The input formats inferred from extensions are %s.\n' % \
        ', '.join(["%s ('.%s')" % (v, k)
                   for k, v in sorted(IN_EXT_MAP.items())])
    epilog += 'The default extensions for output formats are %s.\n' % \
        ', '.join(["'.%s' (%s)" % (v, k)
                   for k, v in sorted(OUT_EXT_MAP.items())])
    optparser = OptionParser(usage=usage, version=version, epilog=epilog)

    optparser.add_option(
        '--input-format', '-i',
        dest="input_format",
        help='''The format of the input biosequence files. If not supplied, this will be inferred from the extension of the files.''',
        metavar='FORMAT',
    )
    optparser.add_option(
        '--output-extension', '-e',
        dest="output_extension",
        help='''The extension of the output biosequence files. If not supplied, this will be inferred from the output format.''',
        metavar='EXTENSION',
    )

    options, pargs = optparser.parse_args()

    # Validate the positional arguments.
    if len(pargs) < 1:
        optparser.error('No output format specified')
    out_fmt = pargs[0].lower()
    # Reported like the other usage errors instead of via `assert`, which is
    # stripped under `python -O` and gives the user a bare AssertionError.
    if out_fmt not in KNOWN_FMTS:
        optparser.error("Unknown output format '%s'" % pargs[0])
    infiles = pargs[1:]
    if not infiles:
        optparser.error('No input files specified')

    return out_fmt, infiles, options
def _convert_file(in_path, out_fmt, options):
    """
    Convert the single sequence file at ``in_path`` to ``out_fmt``.

    The input format is ``options.input_format`` when given, otherwise it is
    looked up in ``IN_EXT_MAP`` from the file's extension (case-insensitive).
    The output is written beside the input, with the same base name and the
    extension ``options.output_extension`` or the ``OUT_EXT_MAP`` default.

    Raises ``ValueError`` if no input format can be determined, if the output
    path would be the input file itself, or if no sequences could be read.
    Errors from opening the files or from ``SeqIO`` propagate unchanged.
    """
    # construct parameters
    dir_name, file_name = path.split(in_path)
    base_name, orig_ext = path.splitext(file_name)
    if orig_ext.startswith('.'):
        orig_ext = orig_ext[1:]
    # All IN_EXT_MAP keys are lower case, so fold the extension before lookup.
    in_fmt = (options.input_format or
              IN_EXT_MAP.get(orig_ext.lower(), '')).lower()
    if not in_fmt:
        raise ValueError(
            "no known input format specified for %s" % file_name)
    out_ext = options.output_extension or OUT_EXT_MAP[out_fmt]
    out_path = path.join(dir_name, '%s.%s' % (base_name, out_ext))
    # Opening the output truncates it, so never let it be the input file.
    if path.abspath(out_path) == path.abspath(in_path):
        raise ValueError(
            "output file %s would overwrite the input file; "
            "choose another extension with --output-extension" % out_path)

    # Read everything first, so a file that fails to parse does not leave an
    # empty output file behind.
    # NOTE(review): the binary modes are kept from the original; Biopython on
    # Python 3 expects text-mode handles for these formats -- confirm before
    # porting.
    with open(in_path, 'rb') as in_hndl:
        in_seqs = list(SeqIO.parse(in_hndl, in_fmt))
    if not in_seqs:
        raise ValueError(
            '''No sequences read from %s. Perhaps the file is not in %s format.''' % (file_name, in_fmt))

    # write; the handle is closed even if SeqIO.write raises
    with open(out_path, 'wb') as out_hndl:
        SeqIO.write(in_seqs, out_hndl, out_fmt)


def main():
    """
    Convert every input file named on the command line to the chosen format.

    The arguments come from ``parse_args()``. Files are converted one at a
    time, in the order given; the first failure raises and stops the run.
    """
    out_fmt, infiles, options = parse_args()
    for in_path in infiles:
        _convert_file(in_path, out_fmt, options)
### TEST & DEBUG ###
### MAIN ###
if __name__ == '__main__':
    # Imported here so this entry point does not depend on the import block.
    import sys

    try:
        main()
    # Only Exception is caught, not BaseException: SystemExit (raised by
    # optparse for --help, --version and usage errors) and KeyboardInterrupt
    # must propagate so the process keeps its proper exit status.
    except Exception as err:
        if _DEV_MODE:
            # development: keep the full traceback
            raise
        # production: a one-line message on stderr and a failing exit status
        sys.stderr.write('%s\n' % err)
        sys.exit(1)
### END ######################################################################