Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 17 additions & 19 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "xsra"
version = "0.2.27"
version = "0.2.28"
edition = "2021"
license = "MIT"
authors = ["Noam Teyssier <noam.teyssier@arcinstitute.org>"]
Expand All @@ -11,34 +11,32 @@ keywords = ["ncbi", "binseq", "SRA", "genomics", "fasterq-dump"]


[dependencies]
anyhow = "1.0.100"
binseq = "0.7.5"
clap = { version = "4.5.51", features = ["derive"] }
futures = "0.3.31"
anyhow = "1.0.102"
binseq = "0.9.0"
clap = { version = "4.5.60", features = ["derive"] }
env_logger = "0.11.9"
futures = "0.3.32"
gzp = "2"
hashbrown = "0.16.0"
indicatif = "0.18.2"
libc = "0.2.177"
ncbi-vdb-sys = "0.1.6"
indicatif = "0.18.4"
log = { version = "0.4.29", features = ["kv"] }
ncbi-vdb-sys = "0.1.7"
num_cpus = "1.17.0"
parking_lot = "0.12.5"
reqwest = { version = "0.12.24", default-features = false, features = [
reqwest = { version = "0.13.2", default-features = false, features = [
"blocking",
"stream",
"rustls-tls",
] }
Comment on lines +25 to 29

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

critical

The rustls-tls feature has been removed from the reqwest dependency, but default-features is still false. This means reqwest is being built without any TLS backend, which will cause all HTTPS requests to fail at runtime. Commands like prefetch, dump, describe, and recode rely on HTTPS when an accession is not a local file.

Please add a TLS feature back. For reqwest 0.13+, rustls-tls-native-roots is a good option.

Suggested change
reqwest = { version = "0.13.2", default-features = false, features = [
"blocking",
"stream",
"rustls-tls",
] }
reqwest = { version = "0.13.2", default-features = false, features = [
"blocking",
"stream",
"rustls-tls-native-roots",
] }

serde = { version = "1.0.228", features = ["derive"] }
serde_json = "1.0.145"
tokio = { version = "1.48.0", features = ["rt", "rt-multi-thread"] }
serde_json = "1.0.149"
tokio = { version = "1.50.0", features = ["rt", "rt-multi-thread"] }
zstd = { version = "0.13.3", features = ["zstdmt"] }

[dev-dependencies]
assert_cmd = "2.1.1"
predicates = "3.1.3"
mockito = "1.7.0"
tempfile = "3.23.0"
tokio-test = "0.4.4"
assert_cmd = "2.1.2"
predicates = "3.1.4"
mockito = "1.7.2"
tempfile = "3.26.0"

[dev-dependencies.tokio]
version = "1.48.0"
version = "1.50.0"
features = ["rt", "rt-multi-thread", "macros", "test-util"]
15 changes: 9 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ However, it is not a complete feature-for-feature replacement, and some function

- Multi-threaded extraction to FASTA, FASTQ, and [BINSEQ](https://github.com/arcinstitute/binseq) records.
- Optional built-in compression of output files (FASTA, FASTQ) - [gzip, bgzip, zstd]
- Choice of BINSEQ output format (`*.bq` and `*.vbq`)
- Choice of BINSEQ output format (`*.bq`, `*.vbq`, `*.cbq`)
- Minimum read length filtering
- Technical / biological read segment selection
- Spot subsetting
Expand Down Expand Up @@ -91,17 +91,20 @@ xsra prefetch <ACCESSION>.sra
xsra prefetch <ACCESSION>.sra <ACCESSION2>.sra <ACCESSION3>.sra
```

You can also write [BINSEQ](https://github.com/arcinstitute/binseq) files (`.bq` / `.vbq`) directly from SRA without an intermediate FASTA or FASTQ file.
You can also write [BINSEQ](https://github.com/arcinstitute/binseq) files (`.bq`, `.vbq`, `.cbq`) directly from SRA without an intermediate FASTA or FASTQ file.
These operations can be done with multiple threads for faster processing as well (following same arguments as above).

```bash
# Write a BINSEQ file to (output.bq) selecting segments 1 and 2 (zero-indexed) as primary and extended.
xsra recode <ACCESSION>.sra -fb -I 0,1
# Write a CBQ file to (output.cbq) selecting segments 1 and 2 (zero-indexed) as primary and extended.
xsra recode <ACCESSION>.sra -I 0,1

# Write a BINSEQ file to (output.bq) selecting segment 3 (zero-indexed) as primary.
# Write a CBQ file to (output.cbq) selecting segments 1 and 2 (zero-indexed) as primary and extended.
xsra record <ACCESSION>.sra -fc -I 0,1

Comment on lines +101 to +103

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This example appears to be a copy-paste error. The command has a typo (record instead of recode), and the example itself is redundant. The previous example on line 99 already demonstrates how to create a CBQ file, which is the new default format. I recommend removing this block to avoid confusion.

# Write a BQ file to (output.bq) selecting segment 3 (zero-indexed) as primary.
xsra recode <ACCESSION>.sra -fb -I 2

# Write a VBINSEQ file to (output.vbq) selecting segments 3 and 1 (zero-indexed) as primary and extended.
# Write a VBQ file to (output.vbq) selecting segments 3 and 1 (zero-indexed) as primary and extended.
xsra recode <ACCESSION>.sra -fv -I 3,1
```

Expand Down
2 changes: 1 addition & 1 deletion src/cli/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ pub use dump::{DumpArgs, DumpOutput, OutputFormat};
pub use filter::FilterOptions;
pub use input::{AccessionOptions, InputOptions, MultiInputOptions, Provider};
pub use prefetch::PrefetchArgs;
pub use recode::{BinseqFlavor, RecodeArgs};
pub use recode::RecodeArgs;
pub use runtime::RuntimeOptions;

const STYLES: Styles = Styles::styled()
Expand Down
37 changes: 25 additions & 12 deletions src/cli/recode.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use super::{InputOptions, RuntimeOptions};
use anyhow::{bail, Result};
use binseq::BitSize;
use binseq::{write::Format, BitSize};
use clap::Parser;

#[derive(Parser, Debug)]
Expand Down Expand Up @@ -66,21 +66,23 @@ pub struct SelectionOptions {
#[derive(Parser, Debug)]
#[clap(next_help_heading = "OUTPUT OPTIONS")]
pub struct RecodeOutput {
/// BINSEQ output name (default: "output.{bq,vbq}")
/// BINSEQ output name (default: "output.{bq,vbq,cbq}")
#[clap(short, long)]
pub name: Option<String>,

/// BINSEQ output flavor
#[clap(short, long)]
#[clap(short, long, default_value = "c")]
pub flavor: BinseqFlavor,

/// BINSEQ bit size
///
/// Not used by CBQ
#[clap(long, default_value_t = 2)]
bitsize: u8,

/// VBQ virtual block size (in bytes)
/// Virtual block size (in bytes)
///
/// Only used by vbq
/// Not used by BQ
#[clap(short = 'B', long, value_parser = parse_memory_size, default_value = "128K")]
pub block_size: usize,
}
Expand All @@ -102,18 +104,29 @@ impl RecodeOutput {
}
}

#[derive(clap::ValueEnum, Clone, Copy, Debug)]
#[derive(clap::ValueEnum, Clone, Copy, Debug, Default)]
pub enum BinseqFlavor {
#[clap(name = "b", help = "BINSEQ")]
Binseq,
#[clap(name = "v", help = "VBINSEQ")]
VBinseq,
#[clap(name = "b", help = "BQ")]
BQ,
#[clap(name = "v", help = "VBQ")]
VBQ,
#[clap(name = "c", help = "CBQ")]
#[default]
CBQ,
}
impl BinseqFlavor {
pub fn extension(&self) -> &str {
match self {
BinseqFlavor::Binseq => "bq",
BinseqFlavor::VBinseq => "vbq",
BinseqFlavor::BQ => "bq",
BinseqFlavor::VBQ => "vbq",
BinseqFlavor::CBQ => "cbq",
}
}
pub fn to_format(self) -> Format {
match self {
BinseqFlavor::BQ => Format::Bq,
BinseqFlavor::VBQ => Format::Vbq,
BinseqFlavor::CBQ => Format::Cbq,
}
}
}
Expand Down
9 changes: 4 additions & 5 deletions src/describe/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::path::Path;

use anyhow::Result;
use log::{debug, info};
use ncbi_vdb_sys::{SegmentType, SraReader};

use crate::{
Expand Down Expand Up @@ -70,15 +71,13 @@ pub fn describe_inner(accession: &str, skip: usize, limit: usize) -> Result<Desc

pub fn describe(input: &InputOptions, opts: &DescribeOptions) -> Result<()> {
let accession = if !Path::new(&input.accession).exists() {
eprintln!(
"Identifying SRA data URL for Accession: {}",
&input.accession
);
info!(accession = input.accession.as_str(); "Identifying SRA data URL for accession");
let runtime = tokio::runtime::Runtime::new()?;
let url = runtime.block_on(identify_url(&input.accession, &input.options))?;
eprintln!("Streaming SRA records from URL: {}", url);
info!(url = url.as_str(); "Streaming SRA records from URL");
url
} else {
debug!(path = input.accession.as_str(); "Using local SRA file");
input.accession.to_string()
};
let stats = describe_inner(&accession, opts.skip, opts.limit)?;
Expand Down
20 changes: 11 additions & 9 deletions src/dump/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use std::path::Path;
use std::sync::Arc;

use anyhow::Result;
use log::{debug, info, warn};
use ncbi_vdb_sys::SraReader;
use output::{build_segment_writer, BoxedSegmentWriter};
use parking_lot::Mutex;
Expand Down Expand Up @@ -148,15 +149,13 @@ pub fn dump(
filter_opts: FilterOptions,
) -> Result<()> {
let accession = if !Path::new(&input.accession).exists() {
eprintln!(
"Identifying SRA data URL for Accession: {}",
&input.accession
);
info!(accession = input.accession.as_str(); "Identifying SRA data URL for accession");
let runtime = tokio::runtime::Runtime::new()?;
let url = runtime.block_on(identify_url(&input.accession, &input.options))?;
eprintln!("Streaming SRA records from URL: {}", url);
info!(url = url.as_str(); "Streaming SRA records from URL");
url
} else {
debug!(path = input.accession.as_str(); "Using local SRA file");
input.accession.to_string()
};

Expand All @@ -165,8 +164,11 @@ pub fn dump(
// Adjust the number of records to process if a limit is provided
let num_records = if let Some(limit) = filter_opts.limit {
if limit > num_records {
eprintln!("Warning: Provided spot limit ({}) is greater than the actual number of spots ({}). Will process the full archive.",
limit, num_records);
warn!(
spot_limit = limit,
actual_spots = num_records;
"Provided spot limit exceeds actual number of spots, processing full archive"
);
}
num_records.min(limit)
} else {
Expand Down Expand Up @@ -236,9 +238,9 @@ pub fn dump(
seg_id,
);
if output_opts.keep_empty {
eprintln!("Warning => empty path: {}", path);
warn!(path = path.as_str(), segment_id = seg_id; "Output file is empty but kept due to --keep-empty flag");
} else {
eprintln!("Removing empty path: {}", path);
debug!(path = path.as_str(), segment_id = seg_id; "Removing empty output file");
std::fs::remove_file(path)?;
}
}
Expand Down
10 changes: 10 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,17 @@ use recode::recode;
pub const BUFFER_SIZE: usize = 1024 * 1024;
pub const RECORD_CAPACITY: usize = 1024;

/// Installs the process-wide `env_logger` backend used by the `log` macros.
///
/// Timestamps are formatted with millisecond precision and the level filter
/// is set to `Info` first; the `XSRA_LOG` environment variable is parsed
/// afterwards, so it is applied on top of that default.
fn initialize_logger() {
    let mut builder = env_logger::builder();
    // Order matters: set the built-in default before reading the environment.
    builder
        .format_timestamp_millis()
        .filter_level(log::LevelFilter::Info)
        .parse_env("XSRA_LOG");
    builder.init();
}

fn main() -> Result<()> {
initialize_logger();

let args = Cli::parse();
match args.command {
cli::Command::Dump(args) => dump(
Expand Down
6 changes: 2 additions & 4 deletions src/output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use anyhow::{bail, Result};
use clap::ValueEnum;
use gzp::deflate::{Bgzf, Gzip};
use gzp::par::compress::{ParCompress, ParCompressBuilder};
use log::info;
use std::process::Command;
use zstd::Encoder;

Expand Down Expand Up @@ -75,10 +76,7 @@ fn create_fifo_if_absent(path: OutputFileType) -> Result<()> {
let minfo = std::fs::metadata(path)?;
if cfg!(target_family = "unix") {
if minfo.file_type().is_fifo() {
eprintln!(
"The path {} already existed as is a fifo, so using that for communication.",
path
);
info!(path = path; "Using existing FIFO for communication");
true
} else {
// the file existed but wasn't a fifo
Expand Down
Loading
Loading