-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcoverage_table_make.py
More file actions
125 lines (110 loc) · 4.38 KB
/
Copy pathcoverage_table_make.py
File metadata and controls
125 lines (110 loc) · 4.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/python
import sys
"""
Script for parsing coverage among numerous strains at certain positions
Can be called from "coverage_table_scaffolds.py"
run with this command: awk -v s=5 -v e=25 -f makeTable.awk data.mpileup
PARAMETERS: s = lower confidence boundary, e = upper confidence boundary
Ezra Huscher
Kane Lab, University of Colorado Boulder
last updated July 2016
"""
# ----------------------------------------------
# All Script Parameters
# ----------------------------------------------
showScaffoldName = False # True/False
endingStringOfStrainFiles = ".sorted.bam_depth"
informationFile = "info_depth_coverage_flock.txt"
strainList = "names_67strains_nospaces_05042016.txt"
geneName = sys.argv[1]
geneStartPos = int(sys.argv[2])
geneEndPos = int(sys.argv[3])
outputFile = "scaffold"+geneName+"_pk_67individuals.txt"
strainFolder = "depth/"
# ----------------------------------------------
print "start pos: ",geneStartPos," stop pos: ",geneEndPos
# Open connection to writable text files
if strainList != "":
allStrainNames = open(strainList, "rw+")
if outputFile != "":
outputfile1 = open(outputFile, "w+") # will create the file if it does not exist
f = open(informationFile, 'r')
infolines = f.readlines()
f.close()
# Loop through cannabis strains in strainList text file... "A2_cat", etc
try:
line = allStrainNames.readlines()
for strainName in line:
strainName = strainName.replace("\n","")
if strainName != "":
fullPath = strainFolder + strainName + endingStringOfStrainFiles
getStrainsLines = open(fullPath, "r+")
print "Processing file:",fullPath
# Loop through rows of each strain, format it like we like, and output to file
linez = getStrainsLines.readlines()
currentPos = geneStartPos
for thisLine in linez:
posnum= int(thisLine.split("\t")[1])
tempGeneName = "scaffold" + geneName + ","
if outputFile != "" and thisLine.find( tempGeneName ) != -1:
# find this strain in the informationFile containing additional relevant information
for linezz in infolines:
if strainName in linezz:
if int(thisLine.split("\t")[2].replace("\n", "")) != 0: # prevent dividing by 0
v0 = "\t" + str(round(int(thisLine.split("\t")[2].replace("\n", "")) / float(linezz.split(",")[3]),4))
else:
v0 = "\t" + "-"
v1 = "\t" + linezz.split(",")[1]
v2 = "\t" + linezz.split(",")[4]
v3 = "\t" + linezz.split(",")[5]
var1 = strainName
var2 = "\t" + thisLine.split("\t")[0]
var3 = "\t" + thisLine.split("\t")[1]
var4 = "\t" + thisLine.split("\t")[2].replace("\n", "")
var5 = "\t" + v1.replace("\n", "")
var6 = "\t" + v2.replace("\n", "")
var7 = "\t" + v3
if posnum <= geneEndPos and posnum >= geneStartPos: # need something more here, not working when
# print "test: ",currentPos," ",posnum
# coverage at this position is 0, so display filler row
while (currentPos < posnum):
outputfile1.write(var1)
if (showScaffoldName == True):
outputfile1.write(var2)
outputfile1.write("\t" + str(currentPos))
outputfile1.write("\t" + "0")
outputfile1.write("\t" + "0") # two \t's here?
outputfile1.write(var5)
outputfile1.write(var6)
outputfile1.write(var7)
currentPos+=1
# position found, display row with desired data
currentPos+=1
outputfile1.write(var1)
if (showScaffoldName == True):
outputfile1.write(var2)
outputfile1.write(var3)
outputfile1.write(var4)
outputfile1.write(v0)
outputfile1.write(var5)
outputfile1.write(var6)
outputfile1.write(var7)
# This section is repeated here to make filler rows if there
# is no coverage on the last positions in the gene's stop position
while (currentPos <= geneEndPos):
outputfile1.write(var1)
if (showScaffoldName == True):
outputfile1.write(var2)
outputfile1.write("\t" + str(currentPos))
outputfile1.write("\t" + "0")
outputfile1.write("\t" + "0") # two \t's here?
outputfile1.write(var5)
outputfile1.write(var6)
outputfile1.write(var7)
currentPos+=1
print strainName + " completed."
finally:
if strainList != "":
allStrainNames.close()
if outputFile != "":
outputfile1.close()