Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 19 additions & 19 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "xsra"
version = "0.2.25"
version = "0.2.26"
edition = "2021"
license = "MIT"
authors = ["Noam Teyssier <noam.teyssier@arcinstitute.org>"]
Expand All @@ -11,34 +11,34 @@ keywords = ["ncbi", "binseq", "SRA", "genomics", "fasterq-dump"]


[dependencies]
anyhow = "1.0.95"
binseq = "0.6.2"
clap = { version = "4.5.28", features = ["derive"] }
anyhow = "1.0.100"
binseq = "0.7.5"
clap = { version = "4.5.51", features = ["derive"] }
futures = "0.3.31"
gzp = "1"
hashbrown = "0.15.2"
indicatif = "0.17.11"
libc = "0.2.172"
ncbi-vdb-sys = "0.1.5"
num_cpus = "1.16.0"
parking_lot = "0.12.3"
reqwest = { version = "0.12.15", default-features = false, features = [
gzp = "2"
hashbrown = "0.16.0"
indicatif = "0.18.2"
libc = "0.2.177"
ncbi-vdb-sys = "0.1.6"
num_cpus = "1.17.0"
parking_lot = "0.12.5"
reqwest = { version = "0.12.24", default-features = false, features = [
"blocking",
"stream",
"rustls-tls",
] }
serde = { version = "1.0.217", features = ["derive"] }
serde_json = "1.0.138"
tokio = { version = "1.44.1", features = ["rt", "rt-multi-thread"] }
zstd = { version = "0.13.2", features = ["zstdmt"] }
serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.145"
tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread"] }
zstd = { version = "0.13.3", features = ["zstdmt"] }

[dev-dependencies]
assert_cmd = "2.0.17"
assert_cmd = "2.1.1"
predicates = "3.1.3"
mockito = "1.7.0"
tempfile = "3.20.0"
tempfile = "3.23.0"
tokio-test = "0.4.4"

[dev-dependencies.tokio]
version = "1.45.1"
version = "1.48.0"
features = ["rt", "rt-multi-thread", "macros", "test-util"]
12 changes: 12 additions & 0 deletions src/cli/recode.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use super::{InputOptions, RuntimeOptions};
use anyhow::{bail, Result};
use binseq::BitSize;
use clap::Parser;

#[derive(Parser, Debug)]
Expand Down Expand Up @@ -73,6 +74,10 @@ pub struct RecodeOutput {
#[clap(short, long)]
pub flavor: BinseqFlavor,

/// BINSEQ bit size
#[clap(long, default_value_t = 2)]
bitsize: u8,

/// VBQ virtual block size (in bytes)
///
/// Only used by vbq
Expand All @@ -88,6 +93,13 @@ impl RecodeOutput {
format!("output.{}", ext)
}
}
/// Returns the BINSEQ encoding width selected with `--bitsize`.
///
/// A stored value of `2` maps to [`BitSize::Two`] and `4` maps to
/// [`BitSize::Four`].
///
/// # Panics
///
/// Panics if the stored value is anything other than `2` or `4`. Previously
/// every value other than `4` was silently treated as two-bit, which hid
/// invalid command-line input from the user.
pub fn bitsize(&self) -> BitSize {
    match self.bitsize {
        2 => BitSize::Two,
        4 => BitSize::Four,
        // NOTE(review): rejecting the value during argument parsing (e.g. a
        // clap `value_parser` on the field) would give a friendlier error;
        // this arm is the fail-fast guard that keeps the signature unchanged.
        other => panic!(
            "Invalid bitsize: {}. Only 2 or 4 are supported.",
            other
        ),
    }
}
Comment on lines +96 to +102

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The current implementation of bitsize() silently treats any value other than 4 as 2. This could lead to unexpected behavior if a user provides an invalid value like 3. It's better to validate the input and fail fast by panicking for unsupported values.

An even better approach would be to use clap's validation capabilities to reject invalid values at the argument parsing stage, for example by using value_parser.

    pub fn bitsize(&self) -> BitSize {
        match self.bitsize {
            2 => BitSize::Two,
            4 => BitSize::Four,
            other => panic!(
                "Invalid bitsize: {}. Only 2 or 4 are supported.", other
            ),
        }
    }

}

#[derive(clap::ValueEnum, Clone, Copy, Debug)]
Expand Down
4 changes: 2 additions & 2 deletions src/output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -141,13 +141,13 @@ fn compression_passthrough<W: Write + Send + 'static>(
match compression {
Compression::Uncompressed => Ok(Box::new(writer)),
Compression::Gzip => {
let pt: ParCompress<Gzip> = ParCompressBuilder::default()
let pt: ParCompress<Gzip, _> = ParCompressBuilder::default()
.num_threads(num_threads)?
.from_writer(writer);
Ok(Box::new(pt))
}
Compression::Bgzip => {
let pt: ParCompress<Bgzf> = ParCompressBuilder::default()
let pt: ParCompress<Bgzf, _> = ParCompressBuilder::default()
.num_threads(num_threads)?
.from_writer(writer);
Ok(Box::new(pt))
Expand Down
57 changes: 37 additions & 20 deletions src/recode/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ use std::sync::Arc;

use anyhow::{bail, Result};
use binseq::{
bq::{BinseqHeader, BinseqWriterBuilder},
vbq::{VBinseqHeader, VBinseqWriterBuilder},
Policy,
bq::{BinseqHeaderBuilder, BinseqWriterBuilder},
vbq::{VBinseqHeaderBuilder, VBinseqWriterBuilder},
BitSize, Policy,
};
use ncbi_vdb_sys::SraReader;
use parking_lot::Mutex;
Expand Down Expand Up @@ -40,13 +40,15 @@ pub fn recode(args: &RecodeArgs) -> Result<()> {
&args.output.name(),
args.primary_sid(),
args.extended_sid(),
args.output.bitsize(),
args.runtime.threads(),
),
BinseqFlavor::VBinseq => recode_to_vbinseq(
&accession,
&args.output.name(),
args.primary_sid(),
args.extended_sid(),
args.output.bitsize(),
args.output.block_size,
args.runtime.threads(),
),
Expand All @@ -58,6 +60,7 @@ fn recode_to_binseq(
output_path: &str,
primary_sid: usize,
extended_sid: Option<usize>,
bitsize: BitSize,
num_threads: u64,
) -> Result<()> {
let stats = describe_inner(accession, 0, 100)?;
Expand All @@ -80,11 +83,12 @@ fn recode_to_binseq(
};

let output = File::create(output_path).map(BufWriter::new)?;
let header = if xlen > 0 {
BinseqHeader::new_extended(slen, xlen)
} else {
BinseqHeader::new(slen)
};
let header = BinseqHeaderBuilder::new()
.slen(slen)
.xlen(xlen)
.bitsize(bitsize)
.flags(false)
.build()?;
let policy = Policy::RandomDraw;
let g_writer = BinseqWriterBuilder::default()
.header(header)
Expand Down Expand Up @@ -120,10 +124,10 @@ fn recode_to_binseq(
if xlen > 0 {
let primary_seg = record.get_segment(primary_sid).unwrap();
let extended_seg = record.get_segment(extended_sid.unwrap()).unwrap();
t_writer.write_paired(0, primary_seg.seq(), extended_seg.seq())?;
t_writer.write_paired_record(None, primary_seg.seq(), extended_seg.seq())?;
} else {
let primary_seg = record.get_segment(primary_sid).unwrap();
t_writer.write_nucleotides(0, primary_seg.seq())?;
t_writer.write_record(None, primary_seg.seq())?;
}

// Process records at a constant interval
Expand Down Expand Up @@ -161,15 +165,21 @@ fn recode_to_vbinseq(
output_path: &str,
primary_sid: usize,
extended_sid: Option<usize>,
bitsize: BitSize,
block_size: usize,
num_threads: u64,
) -> Result<()> {
let output = File::create(output_path).map(BufWriter::new)?;
let header = if extended_sid.is_some() {
VBinseqHeader::with_capacity(block_size as u64, true, true, true)
} else {
VBinseqHeader::with_capacity(block_size as u64, true, true, false)
};
let header = VBinseqHeaderBuilder::new()
.block(block_size as u64)
.flags(false)
.qual(true)
.headers(false)
.bitsize(bitsize)
.compressed(true)
.paired(extended_sid.is_some())
.build();

let policy = Policy::RandomDraw;
let g_writer = VBinseqWriterBuilder::default()
.header(header)
Expand Down Expand Up @@ -205,16 +215,23 @@ fn recode_to_vbinseq(
if let Some(extended_sid) = extended_sid {
let primary_seg = record.get_segment(primary_sid).unwrap();
let extended_seg = record.get_segment(extended_sid).unwrap();
t_writer.write_nucleotides_quality_paired(
0,
t_writer.write_paired_record(
None,
None,
primary_seg.seq(),
Some(primary_seg.qual()),
None,
extended_seg.seq(),
primary_seg.qual(),
extended_seg.qual(),
Some(extended_seg.qual()),
)?;
Comment on lines +218 to 226

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

This call to write_paired_record is incorrect. It provides 7 arguments, but the function from the updated binseq crate expects 8. This will cause a compilation error. Additionally, the arguments are mismatched, leading to type errors: extended_seg.seq() is passed as a name (n2), and Some(extended_seg.qual()) is passed as a sequence (s2).

You need to pass 8 arguments, with None for the second read's name (n2), and correctly pass the sequence and quality for the second read.

Suggested change
t_writer.write_paired_record(
None,
None,
primary_seg.seq(),
Some(primary_seg.qual()),
None,
extended_seg.seq(),
primary_seg.qual(),
extended_seg.qual(),
Some(extended_seg.qual()),
)?;
t_writer.write_paired_record(
None,
None,
primary_seg.seq(),
Some(primary_seg.qual()),
None,
None,
extended_seg.seq(),
Some(extended_seg.qual()),
)?;

} else {
let primary_seg = record.get_segment(primary_sid).unwrap();
t_writer.write_nucleotides_quality(0, primary_seg.seq(), primary_seg.qual())?;
t_writer.write_record(
None,
None,
primary_seg.seq(),
Some(primary_seg.qual()),
)?;
}

// Process records at a constant interval
Expand Down