Tools/scripts/aggregate_data_tables.R at main · CCBR/Tools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env Rscript
# This script is used to extract 1 column each from multiple files and report all columns
# in a single file.
# Example use case for this script is extracting raw counts from multiple per sample RSEM
# files and creating a single raw counts matrix
#
# Eg:
#
# Rscript aggregate_data_tables.R \
# 	-s KO1_resent_mutated,KO1_resent,KO1_resent_unmutated,KO1_slam_mutated \
# 	-l ./KO1_resent/KO1_resent_mutated.RSEM.genes.results,./KO1_resent/KO1_resent.RSEM.genes.results,./KO1_resent/KO1_resent_unmutated.RSEM.genes.results \
# 	-c expected_count \
# 	-i gene_id \
# 	-o rsem.raw_counts_matrix.tsv
#
suppressPackageStartupMessages(library("argparse"))
suppressPackageStartupMessages(library("sets"))

## functions
checkfile <- function(filename) {
  if (file.access(filename) == -1) {
    stop(sprintf("Specified file ( %s ) does not exist", filename))
  }
}

readfile <- function(filename, indexcols, datacol, samplename) {
  d <- read.csv(filename, header = TRUE, sep = "\t")
  cols <- colnames(d)
  reqdcols <- c(indexcols, datacol)
  if (set_is_subset(as.set(reqdcols), as.set(cols))) {
    d <- d[, reqdcols]
    cols <- colnames(d)
    cols <- gsub(pattern = datacol, replacement = samplename, cols)
    colnames(d) <- cols
    return(d)
  } else {
    stop(sprintf(
      "Required columns: %s are missing in file %s",
      reqdcols,
      filename
    ))
  }
}

debug <- 0


# create parser object
parser <- ArgumentParser()

# specify our desired options
# by default ArgumentParser will add an help option

parser$add_argument(
  "-l",
  "--filelist",
  type = "character",
  help = "comma separated list of files",
  required = TRUE
)
parser$add_argument(
  "-i",
  "--indexcols",
  type = "character",
  help = "comma separated list of columns to use as index while merging. These need to exist in all files provided in the -l option.",
  required = TRUE
)
parser$add_argument(
  "-c",
  "--datacol",
  type = "character",
  help = "column to be extracted from all samples and aggregated in the outputfile. Should exist in all files provided to -l. Its name will be replaced by corresponding value in -s argument.",
  required = TRUE
)
parser$add_argument(
  "-s",
  "--samplenames",
  type = "character",
  help = "comma separated list of sample names. Need to be unique. Will replace datacol (-c) when reported in the output file. ",
  required = TRUE
)
parser$add_argument(
  "-o",
  "--outfile",
  type = "character",
  help = "aggregated outfile",
  required = TRUE
)


args <- parser$parse_args()

filelist <- unlist(strsplit(args$filelist, ","))
indexcols <- unlist(strsplit(args$indexcols, ","))
datacol <- args$datacol
samplenames <- unlist(strsplit(args$samplenames, ","))
outfile <- args$outfile

if (debug == 1) {
  filelist <- unlist(strsplit(
    "ccbr1060/Hg38_shRNA_hybrid/HGHY2DRXY_analysis/results/596-7-2/STAR/withChimericJunctions/a.tsv,ccbr1060/Hg38_shRNA_hybrid/HGHY2DRXY_analysis/results/596-7-2/STAR/withChimericJunctions/b.tsv",
    ","
  ))
  indexcols <- unlist(strsplit("ensemblID,gene_name,mRNA_length", ","))
  datacol <- "tpm"
  samplenames <- unlist(strsplit("A,B", ","))
  outfile <- "ccbr1060/Hg38_shRNA_hybrid/HGHY2DRXY_analysis/results/596-7-2/STAR/withChimericJunctions/test.tpm.tsv"
}

# 2 or more files required
if (length(filelist) <= 1) {
  stop(sprintf("Two are more comma separated files needed"))
}
# nfiles and nsamplenames should match
if (length(unique(filelist)) != length(unique(samplenames))) {
  stop(sprintf(
    "Number of files need to match number of samplenames. Duplicate files and samplenames not allowed!"
  ))
}

for (f in filelist) {
  checkfile(f)
}

m <- readfile(
  filename = filelist[1],
  indexcols = indexcols,
  datacol = datacol,
  samplename = samplenames[1]
)

for (i in 2:length(filelist)) {
  n <- readfile(
    filename = filelist[i],
    indexcols = indexcols,
    datacol = datacol,
    samplename = samplenames[i]
  )
  m <- merge(m, n, by = indexcols)
}

write.table(m, file = outfile, quote = FALSE, sep = "\t", row.names = FALSE)