Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ Numbat does not require paired DNA or genotype data and operates solely on the d

> [Teng Gao, Ruslan Soldatov, Hirak Sarkar, Adam Kurkiewicz, Evan Biederstedt, Po-Ru Loh, Peter Kharchenko. Haplotype-aware analysis of somatic copy number variations from single-cell transcriptomes. Nature Biotechnology (2022).](https://www.nature.com/articles/s41587-022-01468-y)

## Numbat-multiome
## Numbat-Paired (ATAC and RNA)
Numbat was later extended to multi-modality (single-cell RNA and ATAC) data. Check out the [vignette](https://kharchenkolab.github.io/numbat/articles/numbat-multiome.html) and paper below:
- 10X Multiome data can be ran with this version as well, treating each barcode separately based on modality. The Numbat-Multiome weighting CNV probabilities based on evidence from ATAC and RNA is currently under development.

> [Ruitong Li, Jean-Baptiste Alberge, Tina Keshavarzian, Junko Tsuji, Johan Gustafsson, Mahshid Rahmat, Elizabeth D Lightbody, Stephanie L Deng, Santiago Riviero, Mendy Miller, F Naz Cemre Kalayci, Adrian Wiestner, Clare Sun, Mathieu Lupien, Irene Ghobrial, Erin Parry, Teng Gao, Gad Getz. Numbat-multiome: inferring copy number variations by combining RNA and chromatin accessibility information from single-cell data. Briefings in Bioinformatics (2025).](https://academic.oup.com/bib/article/26/5/bbaf516/8290422)

Expand Down
249 changes: 236 additions & 13 deletions inst/bin/input_prep.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,25 +48,80 @@ cntsubN <- function(cnt,f,maxn,seed){
return(cnt[f,sample(colnames(cnt),size=min(maxn,ncol(cnt)))])
}
}
binCnt <- function(bincntF,seed,maxCB=10000){
binCnt <- function(bincntF,seed,maxCB=10000,add_suffix=FALSE,suffix_tag=c("_RNA","_ATAC")){
cntmat <- map(bincntF,loadcnt)
sharedFeature <- intersect(rownames(cntmat[[1]]),rownames(cntmat[[2]]))
comb_cnt <- cbind(cntsubN(cntmat[[1]],sharedFeature,maxCB,seed),
cntsubN(cntmat[[2]],sharedFeature,maxCB,seed))
return(comb_cnt)

if(add_suffix){

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this necessary? Why can't just append suffix to each matrix and combine?

if(length(suffix_tag) != 2){
stop("suffix_tag must be a character vector of length 2")
}

# get all unique column names across both matrices
all_cols_1 <- colnames(cntmat[[1]])
all_cols_2 <- colnames(cntmat[[2]])
all_unique_cols <- unique(c(all_cols_1, all_cols_2))

# subsample from all barcodes
set.seed(seed)
n_to_sample <- min(maxCB, length(all_unique_cols))
sampled_cells <- sample(all_unique_cols, size = n_to_sample)

result_cols <- character()
result_mat <- NULL

for(cell in sampled_cells){
cell_in_1 <- cell %in% all_cols_1
cell_in_2 <- cell %in% all_cols_2

if(cell_in_1){
cell_data_1 <- cntmat[[1]][sharedFeature, cell, drop=FALSE]
colnames(cell_data_1) <- paste0(cell, suffix_tag[1])
if(is.null(result_mat)){
result_mat <- cell_data_1
} else {
result_mat <- cbind(result_mat, cell_data_1)
}
result_cols <- c(result_cols, colnames(cell_data_1))
}

if(cell_in_2){
cell_data_2 <- cntmat[[2]][sharedFeature, cell, drop=FALSE]
colnames(cell_data_2) <- paste0(cell, suffix_tag[2])
if(is.null(result_mat)){
result_mat <- cell_data_2
} else {
result_mat <- cbind(result_mat, cell_data_2)
}
result_cols <- c(result_cols, colnames(cell_data_2))
}
}

colnames(result_mat) <- result_cols
return(result_mat)

} else {
# no suffix needed, subsample independently then combine
cnt1_sub <- cntsubN(cntmat[[1]],sharedFeature,maxCB,seed)
cnt2_sub <- cntsubN(cntmat[[2]],sharedFeature,maxCB,seed)
comb_cnt <- cbind(cnt1_sub,cnt2_sub)
return(comb_cnt)
}
}

library(purrr)
library(Matrix)

binCnt_union <- function(bincntF, seed, maxCB = 10000) {
binCnt_union <- function(bincntF, seed, maxCB = 10000,
add_suffix = FALSE, suffix_tag = c("_RNA","_ATAC"),
runMultiomeAsSummed = FALSE) {
# 1. Load the two count matrices
cnt1 <- loadcnt(bincntF[1])
cnt2 <- loadcnt(bincntF[2])

allFeat <- union(rownames(cnt1), rownames(cnt2))

# 3. For each matrix, add zero‐rows for any features its missing
# 3. For each matrix, add zero‐rows for any features it's missing
add_missing <- function(mat, allFeat) {
missing <- setdiff(allFeat, rownames(mat))
if (length(missing)) {
Expand All @@ -77,16 +132,184 @@ binCnt_union <- function(bincntF, seed, maxCB = 10000) {
dimnames = list(missing, colnames(mat)))
mat <- rbind(mat, zero_block)
}
# reorder to match allFeat
mat[allFeat, , drop = FALSE]
}
cnt1_u <- add_missing(cnt1, allFeat)
cnt2_u <- add_missing(cnt2, allFeat)

if(runMultiomeAsSummed){
print('Summing read counts for each modality, resulting one summed feature per cell barcode. Use the --add_suffix and --suffix_tag to run multiome as paired. Note: this is only applicable if you are running multiome data as paired and want each barcode to be counted once. If you are looking for the true Multiome version, please follow the appropriate steps in the vignette.')
if(add_suffix){
print('Multiome can either be used in paired mode or as summed mode for this algorithm. Either --runMultiomeAsSummed can be true or --add_suffix. ')
stop('Please specify either --runMultiomeAsSummed or --add_suffix, but not both.')
}
intersect_cells <- intersect(colnames(cnt1), colnames(cnt2))
set.seed(seed)
n_to_sample <- min(maxCB, length(intersect_cells))
sampled_cells <- sample(intersect_cells, size = n_to_sample)
cnt <- cnt1[,sampled_cells] + cnt2[,sampled_cells]
return(cnt)
}

if(add_suffix){

@teng-gao teng-gao Oct 30, 2025

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this necessary for summed? seems repeated

Merge two function -> mode = intersect/union

print('Adding suffix tags to the barcodes used in the count matrices. Note: this is only applicable if you are running multiome data as paired, since we want to ensure each barcode is unique for the algorithm. If you are looking for the true Multiome version, please follow the appropriate steps in the vignette.')
if(length(suffix_tag) != 2){
stop("suffix_tag must be a character vector of length 2")
}

all_cols_1 <- colnames(cnt1_u)
all_cols_2 <- colnames(cnt2_u)
all_unique_cols <- unique(c(all_cols_1, all_cols_2))

# subsample from the union of ALL columns (without suffixes)
set.seed(seed)
n_to_sample <- min(maxCB, length(all_unique_cols))
sampled_cells <- sample(all_unique_cols, size = n_to_sample)

# for each sampled cell, check which modalities it appears in and add appropriate columns
result_cols <- character()
result_mat <- NULL

for(cell in sampled_cells){
cell_in_1 <- cell %in% all_cols_1
cell_in_2 <- cell %in% all_cols_2

if(cell_in_1){
cell_data_1 <- cnt1_u[, cell, drop=FALSE]
colnames(cell_data_1) <- paste0(cell, suffix_tag[1])
if(is.null(result_mat)){
result_mat <- cell_data_1
} else {
result_mat <- cbind(result_mat, cell_data_1)
}
result_cols <- c(result_cols, colnames(cell_data_1))
}

if(cell_in_2){
cell_data_2 <- cnt2_u[, cell, drop=FALSE]
colnames(cell_data_2) <- paste0(cell, suffix_tag[2])
if(is.null(result_mat)){
result_mat <- cell_data_2
} else {
result_mat <- cbind(result_mat, cell_data_2)
}
result_cols <- c(result_cols, colnames(cell_data_2))
}
}

colnames(result_mat) <- result_cols
return(result_mat)

} else {
# if no suffix addition is needed, thensubsample independently then combine
cnt1_sub <- cntsubN(cnt1_u, allFeat, maxCB, seed)
cnt2_sub <- cntsubN(cnt2_u, allFeat, maxCB, seed)
comb_cnt <- cbind(cnt1_sub, cnt2_sub)
return(comb_cnt)
}
}




# input prep for allele dataframe if running multiome as paired
#' Combine and harmonize allele counts from RNA and ATAC modalities
#' @param alleleCounts_RNA Character. Path to the RNA-based allele counts file
#' @param alleleCounts_ATAC Character. Path to the ATAC-based allele counts file
#' @param output_file Character. File for combined allele outputs
#' @param compress Logical. Whether to compress the output file (default: TRUE)
#' @return Character. Path to the output file
combine_allele_counts <- function(alleleCounts_RNA,

@teng-gao teng-gao Oct 30, 2025

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discuss - could take union instead. Is this also used for summed? For summed I just rbind the two dataframes and sum the counts for the same cell-snp combo

alleleCounts_ATAC,
output_file,
addBarcodeSuff = FALSE,
compress = TRUE) {
geno_files <- c(alleleCounts_RNA, alleleCounts_ATAC)

if (!all(file.exists(geno_files))) {
missing_files <- geno_files[!file.exists(geno_files)]
stop("The following genotype files do not exist: ",
paste(missing_files, collapse = ", "))
}


# 4. Subsample & combine
comb_cnt <- cbind(
cntsubN(cnt1_u, allFeat, maxCB, seed),
cntsubN(cnt2_u, allFeat, maxCB, seed)
)
comb_cnt
# message("Harmonizing allele counts for sample: ", sample_id)

# Read first allele counts file (RNA)
df1 <- tryCatch({
data.table::fread(alleleCounts_RNA, header = TRUE, sep = "\t") %>%
filter(CHROM %in% 1:22) %>%
mutate(CHROM = factor(CHROM, 1:22))
}, error = function(e) {
stop("Error reading RNA allele counts file: ", e$message)
})

# Extract unique SNPs and genotypes from RNA allele counts file
geno1 <- df1 %>% distinct(snp_id, GT)

# Read second allele counts file (ATAC)
df2 <- tryCatch({
data.table::fread(alleleCounts_ATAC, header = TRUE, sep = "\t") %>%
filter(CHROM %in% 1:22) %>%
mutate(CHROM = factor(CHROM, 1:22))
}, error = function(e) {
stop("Error reading ATAC allele counts file: ", e$message)
})

# Extract unique SNPs and genotypes from ATAC allele counts file
geno2 <- df2 %>% distinct(snp_id, GT)

# Find shared SNPs between the two datasets
sharedSNP <- intersect(geno1$snp_id, geno2$snp_id)
message("Found ", length(sharedSNP), " shared SNPs between modalities")

# Create reference dataframe from second file's genotypes
snp_df <- geno2 %>%
filter(snp_id %in% sharedSNP) %>%
column_to_rownames("snp_id")

# Update genotypes in first file to match second file for shared SNPs
geno1 <- geno1 %>%
mutate(GT = case_when(
snp_id %in% sharedSNP ~ snp_df[snp_id, "GT"],
TRUE ~ GT
))

geno1_df <- geno1 %>% column_to_rownames("snp_id")
df1$GT <- geno1_df[df1$snp_id, "GT"]

if(addBarcodeSuff == TRUE){
df1$cell = paste0(df1$cell, '_RNA')
df2$cell = paste0(df2$cell, '_ATAC')
}
combined_df <- rbind(df1, df2)

# output_file <- file.path(output_dir, paste0(sample_id, "_allele_counts_consistent.tsv"))

tryCatch({

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think if you name the file .gz fwrite will automatically compress

data.table::fwrite(
combined_df,
output_file,
quote = FALSE,
row.names = FALSE,
#nThread = 3,
sep = "\t"
)
message("Successfully wrote combined allele counts to: ", output_file)
}, error = function(e) {
stop("Error writing combined file: ", e$message)
})

if (compress) {
tryCatch({
system2("gzip", args = output_file)
output_file <- paste0(output_file, ".gz")
message("Successfully compressed output file")
}, error = function(e) {
warning("Failed to compress output file: ", e$message)
})
}

return(output_file)
}

12 changes: 11 additions & 1 deletion inst/bin/run_numbat_multiome.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ option_list = list(make_option("--countmat", default = NULL),
make_option("--ref", default = NULL),
make_option("--out_dir", default = NULL),
make_option("--parL", default = NULL)
make_option("--runMultiomeAsPaired", default = FALSE)
)
args = parse_args(OptionParser(option_list = option_list))

Expand Down Expand Up @@ -83,8 +84,17 @@ if(mean(covs)>150){
df_allele <- df_allele%>%sample_frac(frac)
}

#==== Parameter change ====
#==== Quick check if we are running multiome data as paired ====
cat('Running Numbat\n')
if(args$runMultiomeAsPaired){
# check if suffix for count matrix / allele dataframe is present
if(!(any(endsWith(colnames(df_count), '_RNA')) && any(endsWith(colnames(df_count), '_ATAC')))){
stop("Suffix _RNA or _ATAC not found in count matrix column names")
}
if(!(any(endsWith(df_allele$cell, '_RNA')) && any(endsWith(colnames(df_allele$cell), '_ATAC')))){
stop("Suffix _RNA or _ATAC not found in allele dataframe cell names")
}
}
numbatPars$count_mat = df_count
numbatPars$lambdas_ref =readRDS(args$ref) %>% as.matrix()
numbatPars$df_allele =df_allele
Expand Down
Loading