From d1772159a9fe61c4246569a563ba00a90fc2944d Mon Sep 17 00:00:00 2001 From: "Philipp A." Date: Sun, 17 May 2026 17:05:52 +0200 Subject: [PATCH] chore: switch sam attributes to bitflags --- Cargo.lock | 29 ++---- Cargo.toml | 1 + src/io/sam.rs | 222 +++++++++++++++++++--------------------------- src/params/mod.rs | 71 +++++---------- src/params/sam.rs | 106 ++++++++++++++++++++++ 5 files changed, 230 insertions(+), 199 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 225f5e0..146ae30 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -114,9 +114,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.10.0" +version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" [[package]] name = "bstr" @@ -394,18 +394,6 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" -[[package]] -name = "getrandom" -version = "0.3.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" -dependencies = [ - "cfg-if", - "libc", - "r-efi 5.3.0", - "wasip2", -] - [[package]] name = "getrandom" version = "0.4.2" @@ -414,7 +402,7 @@ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi 6.0.0", + "r-efi", "rand_core", "wasip2", "wasip3", @@ -849,12 +837,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "r-efi" -version = "5.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" - [[package]] name = "r-efi" version = "6.0.0" @@ -868,7 +850,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", - "getrandom 0.4.2", + "getrandom", "rand_core", ] @@ -942,6 +924,7 @@ version = "0.1.0" dependencies = [ "anyhow", "assert_cmd", + "bitflags", "bstr", "byteorder", "chrono", @@ -1075,7 +1058,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom", "once_cell", "rustix", "windows-sys", diff --git a/Cargo.toml b/Cargo.toml index 9c108e1..bc19d2e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -41,6 +41,7 @@ dashmap = "6" chrono = "0.4" rand = "0.10" tempfile = "3" +bitflags = { version = "2.11.1", features = ["std"] } [dev-dependencies] assert_cmd = "2" diff --git a/src/io/sam.rs b/src/io/sam.rs index d4a8d14..7aab7d3 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -6,7 +6,7 @@ use crate::genome::Genome; use crate::io::fastq::{complement_base, decode_base}; use crate::junction::encode_motif; use crate::mapq::calculate_mapq; -use crate::params::Parameters; +use crate::params::{Parameters, SamAttributes}; use bstr::BString; use noodles::sam; use noodles::sam::alignment::io::Write; @@ -104,10 +104,7 @@ impl SamWriter { }; let effective_n = n_alignments.max(n_for_mapq); let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique); - let mut attrs = params.sam_attribute_set(); - if params.out_sam_strand_field != "intronMotif" { - attrs.remove("XS"); - } + let attrs = params.out_sam_attributes; let rg_id_owned = params.primary_rg_id()?; let rg_id = rg_id_owned.as_deref(); @@ -121,7 +118,7 @@ impl SamWriter { mapq, max_output, // NH = number of reported alignments hit_index + 1, // 1-based - &attrs, + attrs, )?; maybe_insert_rg_tag(&mut record, rg_id); @@ -228,10 +225,7 @@ impl SamWriter { }; let effective_n = n_alignments.max(n_for_mapq); let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique); - let mut attrs = params.sam_attribute_set(); - if params.out_sam_strand_field != "intronMotif" { - attrs.remove("XS"); - } + let attrs = params.out_sam_attributes; let rg_id_owned = params.primary_rg_id()?; let rg_id = rg_id_owned.as_deref(); @@ -246,7 +240,7 @@ impl SamWriter { mapq, max_output, // NH = number of reported alignments hit_index + 1, // 1-based - &attrs, + attrs, )?; maybe_insert_rg_tag(&mut record, rg_id); records.push(record); @@ -295,10 +289,7 @@ impl SamWriter { }; let effective_n = n_alignments.max(n_for_mapq); let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique); - let mut attrs = params.sam_attribute_set(); - if params.out_sam_strand_field != "intronMotif" { - attrs.remove("XS"); - } + let attrs = params.out_sam_attributes; let rg_id_owned = params.primary_rg_id()?; let rg_id = rg_id_owned.as_deref(); @@ -325,7 +316,7 @@ impl SamWriter { max_output, // NH = number of reported alignments hit_index, combined_score, - &attrs, + attrs, )?; maybe_insert_rg_tag(&mut rec1, rg_id); records.push(rec1); @@ -345,7 +336,7 @@ impl SamWriter { max_output, // NH = number of reported alignments hit_index, combined_score, - &attrs, + attrs, )?; maybe_insert_rg_tag(&mut rec2, rg_id); records.push(rec2); @@ -381,10 +372,7 @@ impl SamWriter { let n_alignments = 1usize; let effective_n = n_alignments.max(n_for_mapq); let mapq = calculate_mapq(effective_n, params.out_sam_mapq_unique); - let mut attrs = params.sam_attribute_set(); - if params.out_sam_strand_field != "intronMotif" { - attrs.remove("XS"); - } + let attrs = params.out_sam_attributes; let rg_id_owned = params.primary_rg_id()?; let rg_id = rg_id_owned.as_deref(); @@ -451,38 +439,37 @@ impl SamWriter { // Optional tags on mapped mate let data = mapped_rec.data_mut(); - if attrs.contains("NH") { + if attrs.contains(SamAttributes::NH) { data.insert(Tag::ALIGNMENT_HIT_COUNT, Value::from(n_alignments as i32)); } - if attrs.contains("HI") { + if attrs.contains(SamAttributes::HI) { data.insert(Tag::HIT_INDEX, Value::from(1i32)); } - if attrs.contains("AS") { + if attrs.contains(SamAttributes::AS) { data.insert(Tag::ALIGNMENT_SCORE, Value::from(mapped_transcript.score)); } - if attrs.contains("NM") || attrs.contains("nM") { - // STAR maps NM attribute to 'nM' tag (mismatches only, not edit distance) + if attrs.contains(SamAttributes::NM) { data.insert( Tag::new(b'n', b'M'), Value::from(mapped_transcript.n_mismatch as i32), ); } - if attrs.contains("XS") + if attrs.contains(SamAttributes::XS) && let Some(xs_strand) = derive_xs_strand(mapped_transcript) { data.insert(Tag::new(b'X', b'S'), Value::Character(xs_strand as u8)); } - if attrs.contains("jM") + if attrs.contains(SamAttributes::JM) && let Some(jm) = build_jm_tag(mapped_transcript) { data.insert(Tag::new(b'j', b'M'), jm); } - if attrs.contains("jI") + if attrs.contains(SamAttributes::JI) && let Some(ji) = build_ji_tag(mapped_transcript, chr_start) { data.insert(Tag::new(b'j', b'I'), ji); } - if attrs.contains("MD") { + if attrs.contains(SamAttributes::MD) { let md = build_md_tag( mapped_transcript, mapped_seq, @@ -575,15 +562,11 @@ impl SamWriter { if projected.is_empty() { return Ok(Vec::new()); } - let mut attrs = params.sam_attribute_set(); - // Splice tags are meaningless in t-space. - attrs.remove("jM"); - attrs.remove("jI"); - attrs.remove("XS"); - // MD-tag would require the transcript's t-space reference which we do - // not precompute; drop it to keep this writer simple. STAR also does - // not emit MD for transcriptome SAM. - attrs.remove("MD"); + // Splice tags are meaningless in t-space; MD would require the + // transcript's t-space reference which we do not precompute, and + // STAR also does not emit MD for transcriptome SAM. + let attrs = params.out_sam_attributes + - (SamAttributes::JM | SamAttributes::JI | SamAttributes::XS | SamAttributes::MD); let n_alignments = projected.len(); let mut records = Vec::with_capacity(n_alignments); @@ -640,17 +623,17 @@ impl SamWriter { // Optional tags let data = record.data_mut(); - if attrs.contains("NH") { + if attrs.contains(SamAttributes::NH) { data.insert(Tag::ALIGNMENT_HIT_COUNT, Value::from(n_alignments as i32)); } - if attrs.contains("HI") { + if attrs.contains(SamAttributes::HI) { // HI is 1-based; primary = 1, secondaries > 1 in emission order. data.insert(Tag::HIT_INDEX, Value::from((hit_idx + 1) as i32)); } - if attrs.contains("AS") { + if attrs.contains(SamAttributes::AS) { data.insert(Tag::ALIGNMENT_SCORE, Value::from(t.score)); } - if attrs.contains("NM") || attrs.contains("nM") { + if attrs.contains(SamAttributes::NM) { data.insert(Tag::new(b'n', b'M'), Value::from(t.n_mismatch as i32)); } @@ -849,9 +832,9 @@ where Ok(builder.build()) } -/// Insert `RG:Z:` on the record when an ID is set. `sam_attribute_set()` -/// auto-adds `RG` to the attribute set whenever an RG line is configured, so -/// `rg_id.is_some()` implies the attribute is wanted — no extra gate needed. +/// Insert `RG:Z:` on the record when an ID is set. `Parameters::try_parse_from` +/// auto-ORs `SamAttributes::RG` into `out_sam_attributes` whenever an RG line is +/// configured, so `rg_id.is_some()` implies the attribute is wanted. fn maybe_insert_rg_tag(record: &mut RecordBuf, rg_id: Option<&str>) { if let Some(id) = rg_id { record @@ -871,7 +854,7 @@ fn transcript_to_record( mapq: u8, n_alignments: usize, hit_index: usize, - attrs: &HashSet, + attrs: SamAttributes, ) -> Result { let mut record = RecordBuf::default(); @@ -937,38 +920,37 @@ fn transcript_to_record( // Optional tags: gated by --outSAMattributes let data = record.data_mut(); - if attrs.contains("NH") { + if attrs.contains(SamAttributes::NH) { data.insert(Tag::ALIGNMENT_HIT_COUNT, Value::from(n_alignments as i32)); } - if attrs.contains("HI") { + if attrs.contains(SamAttributes::HI) { data.insert(Tag::HIT_INDEX, Value::from(hit_index as i32)); } - if attrs.contains("AS") { + if attrs.contains(SamAttributes::AS) { data.insert(Tag::ALIGNMENT_SCORE, Value::from(transcript.score)); } - if attrs.contains("NM") || attrs.contains("nM") { - // STAR maps NM attribute to 'nM' tag (mismatches only, not edit distance) + if attrs.contains(SamAttributes::NM) { data.insert( Tag::new(b'n', b'M'), Value::from(transcript.n_mismatch as i32), ); } - if attrs.contains("XS") + if attrs.contains(SamAttributes::XS) && let Some(xs_strand) = derive_xs_strand(transcript) { data.insert(Tag::new(b'X', b'S'), Value::Character(xs_strand as u8)); } - if attrs.contains("jM") + if attrs.contains(SamAttributes::JM) && let Some(jm) = build_jm_tag(transcript) { data.insert(Tag::new(b'j', b'M'), jm); } - if attrs.contains("jI") + if attrs.contains(SamAttributes::JI) && let Some(ji) = build_ji_tag(transcript, chr_start) { data.insert(Tag::new(b'j', b'I'), ji); } - if attrs.contains("MD") { + if attrs.contains(SamAttributes::MD) { let md = build_md_tag(transcript, read_seq, genome, transcript.is_reverse); data.insert(Tag::new(b'M', b'D'), Value::String(BString::from(md))); } @@ -1148,7 +1130,7 @@ fn build_paired_mate_record( n_alignments: usize, hit_index: usize, combined_score: i32, - attrs: &HashSet, + attrs: SamAttributes, ) -> Result { let mut record = RecordBuf::default(); @@ -1243,39 +1225,39 @@ fn build_paired_mate_record( // Optional tags: gated by --outSAMattributes let data = record.data_mut(); - if attrs.contains("NH") { + if attrs.contains(SamAttributes::NH) { data.insert(Tag::ALIGNMENT_HIT_COUNT, Value::from(n_alignments as i32)); } - if attrs.contains("HI") { + if attrs.contains(SamAttributes::HI) { data.insert(Tag::HIT_INDEX, Value::from(hit_index as i32)); } - if attrs.contains("AS") { + if attrs.contains(SamAttributes::AS) { // STAR reports combined score (sum of both mates) for PE AS tag data.insert(Tag::ALIGNMENT_SCORE, Value::from(combined_score)); } - if attrs.contains("NM") || attrs.contains("nM") { + if attrs.contains(SamAttributes::NM) { // STAR maps NM attribute to 'nM' tag (mismatches only, not edit distance) data.insert( Tag::new(b'n', b'M'), Value::from(transcript.n_mismatch as i32), ); } - if attrs.contains("XS") + if attrs.contains(SamAttributes::XS) && let Some(xs_strand) = derive_xs_strand(transcript) { data.insert(Tag::new(b'X', b'S'), Value::Character(xs_strand as u8)); } - if attrs.contains("jM") + if attrs.contains(SamAttributes::JM) && let Some(jm) = build_jm_tag(transcript) { data.insert(Tag::new(b'j', b'M'), jm); } - if attrs.contains("jI") + if attrs.contains(SamAttributes::JI) && let Some(ji) = build_ji_tag(transcript, chr_start) { data.insert(Tag::new(b'j', b'I'), ji); } - if attrs.contains("MD") { + if attrs.contains(SamAttributes::MD) { let md = build_md_tag(transcript, mate_seq, genome, transcript.is_reverse); data.insert(Tag::new(b'M', b'D'), Value::String(BString::from(md))); } @@ -1291,22 +1273,6 @@ mod tests { use noodles::sam::alignment::record::cigar; use tempfile::NamedTempFile; - /// Build an attribute set with all tags enabled (for tests that don't care about filtering) - fn all_attrs() -> HashSet { - ["NH", "HI", "AS", "NM", "nM", "XS", "jM", "jI", "MD"] - .iter() - .map(ToString::to_string) - .collect() - } - - /// Build the standard attribute set (NH, HI, AS, NM, nM) - fn standard_attrs() -> HashSet { - ["NH", "HI", "AS", "NM", "nM"] - .iter() - .map(ToString::to_string) - .collect() - } - fn make_test_genome() -> Genome { Genome { sequence: vec![0, 1, 2, 3, 0, 1, 2, 3], // ACGTACGT @@ -1470,7 +1436,7 @@ mod tests { 255, 1, 1, - &standard_attrs(), + SamAttributes::STANDARD, ); assert!(record.is_ok()); @@ -1597,7 +1563,7 @@ mod tests { 1, // n_alignments 1, // hit_index 190, // combined_score (100+90) - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); @@ -1625,7 +1591,7 @@ mod tests { 1, // n_alignments 1, // hit_index 190, // combined_score (100+90) - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); @@ -1709,7 +1675,7 @@ mod tests { 1, // n_alignments 1, // hit_index 350, // combined_score (200+150) - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); @@ -1774,7 +1740,7 @@ mod tests { 255, 3, // n_alignments 2, // hit_index - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); @@ -1934,7 +1900,7 @@ mod tests { 255, 1, // unique mapper 1, - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); @@ -2070,7 +2036,7 @@ mod tests { 255, 1, 1, - &all_attrs(), + SamAttributes::ALL, ) .unwrap(); @@ -2115,7 +2081,7 @@ mod tests { 255, 1, 1, - &all_attrs(), + SamAttributes::ALL, ) .unwrap(); @@ -2164,7 +2130,7 @@ mod tests { 255, 1, 1, - &all_attrs(), + SamAttributes::ALL, ) .unwrap(); @@ -2215,7 +2181,7 @@ mod tests { 255, 1, 1, - &all_attrs(), + SamAttributes::ALL, ) .unwrap(); @@ -2264,7 +2230,7 @@ mod tests { 255, 1, 1, - &standard_attrs(), // XS not in standard attrs + SamAttributes::STANDARD, // XS not in standard attrs ) .unwrap(); @@ -2698,7 +2664,7 @@ mod tests { 255, 1, 1, - &all_attrs(), + SamAttributes::ALL, ) .unwrap(); @@ -2784,7 +2750,7 @@ mod tests { 1, 1, 190, // combined_score (100+90) - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); @@ -2807,7 +2773,7 @@ mod tests { 1, 1, 190, // combined_score (100+90) - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); @@ -2893,7 +2859,7 @@ mod tests { 1, 1, 180, // combined_score (100+80) - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); assert_eq!( @@ -2923,7 +2889,7 @@ mod tests { 1, 1, 180, // combined_score (100+80) - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); assert_eq!( @@ -3009,7 +2975,7 @@ mod tests { 1, 1, 190, // combined_score (100+90) - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); assert!(!rec1.flags().is_mate_reverse_complemented()); @@ -3029,7 +2995,7 @@ mod tests { 1, 1, 190, // combined_score (100+90) - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); assert!(!rec2.flags().is_mate_reverse_complemented()); @@ -3070,7 +3036,7 @@ mod tests { 255, 1, 1, - &standard_attrs(), + SamAttributes::STANDARD, ) .unwrap(); @@ -3137,7 +3103,7 @@ mod tests { let read_seq = vec![0, 1, 2, 3]; let read_qual = vec![30, 30, 30, 30]; - let empty_attrs: HashSet = HashSet::new(); + let empty_attrs = SamAttributes::empty(); let record = transcript_to_record( &transcript, "read1", @@ -3147,7 +3113,7 @@ mod tests { 255, 1, 1, - &empty_attrs, + empty_attrs, ) .unwrap(); @@ -3212,7 +3178,7 @@ mod tests { let read_seq = vec![0, 1, 2, 3]; let read_qual = vec![30, 30, 30, 30]; - let attrs: HashSet = ["NH", "MD"].iter().map(ToString::to_string).collect(); + let attrs = SamAttributes::NH | SamAttributes::MD; let record = transcript_to_record( &transcript, "read1", @@ -3222,7 +3188,7 @@ mod tests { 255, 1, 1, - &attrs, + attrs, ) .unwrap(); @@ -3270,15 +3236,10 @@ mod tests { // Standard let p = Parameters::parse_from(["rustar-aligner", "--readFilesIn", "r.fq"]); - let attrs = p.sam_attribute_set(); - assert_eq!(attrs.len(), 5); - assert!(attrs.contains("NH")); - assert!(attrs.contains("HI")); - assert!(attrs.contains("AS")); - assert!(attrs.contains("NM")); - assert!(attrs.contains("nM")); - - // All + assert_eq!(p.out_sam_attributes, SamAttributes::STANDARD); + + // All — note: XS is stripped at parse-time because the default + // --outSAMstrandField is "None" (STAR only emits XS in intronMotif mode). let p = Parameters::parse_from([ "rustar-aligner", "--readFilesIn", @@ -3286,13 +3247,19 @@ mod tests { "--outSAMattributes", "All", ]); - let attrs = p.sam_attribute_set(); - assert_eq!(attrs.len(), 9); - assert!(attrs.contains("nM")); - assert!(attrs.contains("XS")); - assert!(attrs.contains("MD")); - assert!(attrs.contains("jM")); - assert!(attrs.contains("jI")); + assert_eq!(p.out_sam_attributes, SamAttributes::ALL - SamAttributes::XS); + + // All + intronMotif keeps XS. + let p = Parameters::parse_from([ + "rustar-aligner", + "--readFilesIn", + "r.fq", + "--outSAMattributes", + "All", + "--outSAMstrandField", + "intronMotif", + ]); + assert_eq!(p.out_sam_attributes, SamAttributes::ALL); // None let p = Parameters::parse_from([ @@ -3302,8 +3269,7 @@ mod tests { "--outSAMattributes", "None", ]); - let attrs = p.sam_attribute_set(); - assert!(attrs.is_empty()); + assert!(p.out_sam_attributes.is_empty()); // Explicit subset let p = Parameters::parse_from([ @@ -3315,12 +3281,10 @@ mod tests { "AS", "MD", ]); - let attrs = p.sam_attribute_set(); - assert_eq!(attrs.len(), 3); - assert!(attrs.contains("NH")); - assert!(attrs.contains("AS")); - assert!(attrs.contains("MD")); - assert!(!attrs.contains("HI")); + assert_eq!( + p.out_sam_attributes, + SamAttributes::NH | SamAttributes::AS | SamAttributes::MD, + ); } #[test] @@ -3363,7 +3327,7 @@ mod tests { 255, 1, 1, - &all_attrs(), + SamAttributes::ALL, ) .unwrap(); diff --git a/src/params/mod.rs b/src/params/mod.rs index 0204981..58d24bb 100644 --- a/src/params/mod.rs +++ b/src/params/mod.rs @@ -1,4 +1,3 @@ -use std::collections::HashSet; use std::num::NonZeroUsize; use std::path::PathBuf; @@ -6,7 +5,7 @@ use clap::{CommandFactory, Parser}; mod sam; -pub use sam::{OutSamFormat, OutSamSortOrder, OutSamType, OutSamUnmapped}; +pub use sam::{OutSamFormat, OutSamSortOrder, OutSamType, OutSamUnmapped, SamAttributes}; // --------------------------------------------------------------------------- // Run mode enum @@ -303,9 +302,10 @@ pub struct Parameters { #[arg(long = "outSAMstrandField", default_value = "None")] pub out_sam_strand_field: String, - /// SAM attributes to include (Standard, All, None, or explicit list) - #[arg(long = "outSAMattributes", num_args = 1.., default_values_t = vec!["Standard".to_string()])] - pub out_sam_attributes: Vec, + /// SAM optional-tag set (`Standard`, `All`, `None`, or any combination + /// of NH HI AS NM nM MD jM jI XS RG). Parsed into a bitflags struct. + #[command(flatten)] + pub out_sam_attributes: SamAttributes, /// Read group line(s) for the `@RG` SAM header. Space-separated fields; /// a bare `,` separates multiple RG blocks. Each block must start with `ID:`. @@ -649,40 +649,6 @@ impl Parameters { self.chim_out_type.iter().any(|s| s == "WithinBAM") } - /// Expand `--outSAMattributes` into a set of individual tag names. - /// - /// - `"Standard"` → {NH, HI, AS, NM, nM} - /// - `"All"` → {NH, HI, AS, NM, nM, MD, jM, jI, XS} - /// - `"None"` → {} (empty) - /// - Explicit list (e.g. `["NH", "AS"]`) → collected as-is - /// - /// `RG` is auto-appended when `--outSAMattrRGline` is set (STAR behavior, - /// `Parameters_samAttributes.cpp:201`). - pub fn sam_attribute_set(&self) -> HashSet { - let mut attrs: HashSet = match self - .out_sam_attributes - .iter() - .map(String::as_str) - .collect::>() - .as_slice() - { - ["Standard"] => ["NH", "HI", "AS", "NM", "nM"] - .iter() - .map(ToString::to_string) - .collect(), - ["All"] => ["NH", "HI", "AS", "NM", "nM", "MD", "jM", "jI", "XS"] - .iter() - .map(ToString::to_string) - .collect(), - ["None"] => HashSet::new(), - tags => tags.iter().map(ToString::to_string).collect(), - }; - if self.rg_line_set() { - attrs.insert("RG".to_string()); - } - attrs - } - /// True if the user provided a non-default `--outSAMattrRGline`. pub fn rg_line_set(&self) -> bool { !self.out_sam_attr_rg_line.is_empty() && self.out_sam_attr_rg_line[0] != "-" @@ -842,7 +808,7 @@ impl Parameters { let mut command = ::command(); let matches = command.clone().get_matches_from(args); - let params = ::from_arg_matches(&matches)?; + let mut params = ::from_arg_matches(&matches)?; // genomeGenerate requires FASTA files if params.run_mode == RunMode::GenomeGenerate && params.genome_fasta_files.is_empty() { @@ -870,10 +836,8 @@ impl Parameters { // Read group: `RG` in outSAMattributes without an RG line is a fatal // error (STAR: Parameters_samAttributes.cpp:206). STAR's "All" preset - // does NOT include RG, so only match a literal "RG" token here. Also - // parse the RG line to validate its ID: prefix and per-file RG count. - let user_wants_rg_attr = params.out_sam_attributes.iter().any(|a| a == "RG"); - if !params.rg_line_set() && user_wants_rg_attr { + // does NOT include RG, so only match a literal user-supplied RG flag. + if params.out_sam_attributes.contains(SamAttributes::RG) && !params.rg_line_set() { return Err(command.error( ErrorKind::MissingRequiredArgument, "--outSAMattributes contains RG tag, but --outSAMattrRGline is not set", @@ -883,6 +847,19 @@ impl Parameters { .rg_ids() .map_err(|e| command.error(ErrorKind::InvalidValue, e))?; + // Fold runtime-derived bits into the effective attribute set so writers + // can read `out_sam_attributes` directly. + // - Auto-emit RG whenever an RG line is configured (STAR, + // `Parameters_samAttributes.cpp:201`). + // - Strip XS unless `--outSAMstrandField intronMotif` is set, since + // STAR only emits XS in that mode. + if params.rg_line_set() { + params.out_sam_attributes |= SamAttributes::RG; + } + if params.out_sam_strand_field != "intronMotif" { + params.out_sam_attributes.remove(SamAttributes::XS); + } + // quantMode TranscriptomeSAM requires transcript annotations — // either via --sjdbGTFfile or pre-generated transcriptInfo.tab // et al in --genomeDir (persisted at genomeGenerate time). At @@ -944,7 +921,7 @@ mod tests { assert_eq!(p.out_file_name_prefix, "./"); assert_eq!(p.out_sam_type, OutSamType::default()); assert_eq!(p.out_sam_strand_field, "None"); - assert_eq!(p.out_sam_attributes, vec!["Standard".to_string()]); + assert_eq!(p.out_sam_attributes, SamAttributes::STANDARD); assert_eq!(p.out_sam_unmapped, OutSamUnmapped::None); assert_eq!(p.out_sam_mapq_unique, 255); assert_eq!(p.out_sam_mult_nmax, -1); @@ -1266,7 +1243,7 @@ mod tests { assert!(!p.rg_line_set()); assert_eq!(p.parsed_rg_lines().unwrap(), Vec::::new()); assert_eq!(p.rg_ids().unwrap(), Vec::::new()); - assert!(!p.sam_attribute_set().contains("RG")); + assert!(!p.out_sam_attributes.contains(SamAttributes::RG)); } #[test] @@ -1286,7 +1263,7 @@ mod tests { vec!["ID:foo\tSM:bar\tLB:lib1".to_string()] ); assert_eq!(p.rg_ids().unwrap(), vec!["foo".to_string()]); - assert!(p.sam_attribute_set().contains("RG")); + assert!(p.out_sam_attributes.contains(SamAttributes::RG)); } #[test] diff --git a/src/params/sam.rs b/src/params/sam.rs index 34ee7a5..43006f7 100644 --- a/src/params/sam.rs +++ b/src/params/sam.rs @@ -1,3 +1,109 @@ +// --------------------------------------------------------------------------- +// SAM optional-tag attribute set (`--outSAMattributes`) +// --------------------------------------------------------------------------- + +use std::str::FromStr; + +bitflags::bitflags! { + /// Optional SAM tags requested via `--outSAMattributes`. + /// + /// Each bit corresponds to one tag the writer may emit. `STANDARD` and + /// `ALL` are convenience aliases matching STAR's preset names. + #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] + pub struct SamAttributes: u16 { + const NH = 1 << 0; + const HI = 1 << 1; + const AS = 1 << 2; + const NM = 1 << 3; + const MD = 1 << 4; + const JM = 1 << 5; + const JI = 1 << 6; + const XS = 1 << 7; + const RG = 1 << 8; + + const STANDARD = + Self::NH.bits() | Self::HI.bits() | Self::AS.bits() + | Self::NM.bits(); + const ALL = + Self::STANDARD.bits() + | Self::MD.bits() | Self::JM.bits() | Self::JI.bits() | Self::XS.bits(); + } +} + +impl FromStr for SamAttributes { + type Err = String; + /// Parse a single CLI token into a flag. `Standard`/`All`/`None` expand to + /// their preset sets; individual tag names map to a single bit. + fn from_str(s: &str) -> Result { + Ok(match s { + "Standard" => Self::STANDARD, + "All" => Self::ALL, + "None" => Self::empty(), + "NH" => Self::NH, + "HI" => Self::HI, + "AS" => Self::AS, + // STAR maps NM attribute to 'nM' tag (mismatches only, not edit distance) + "NM" | "nM" => Self::NM, + "MD" => Self::MD, + "jM" => Self::JM, + "jI" => Self::JI, + "XS" => Self::XS, + "RG" => Self::RG, + other => return Err(format!("unknown --outSAMattributes token '{other}'")), + }) + } +} + +impl clap::FromArgMatches for SamAttributes { + fn from_arg_matches(matches: &clap::ArgMatches) -> Result { + let mut s = Self::STANDARD; + s.update_from_arg_matches(matches)?; + Ok(s) + } + + fn update_from_arg_matches(&mut self, matches: &clap::ArgMatches) -> Result<(), clap::Error> { + let Some(values) = matches.get_many::("outSAMattributes") else { + return Ok(()); + }; + let mut acc = Self::empty(); + for tok in values { + let flag = tok.parse().map_err(|e| { + use clap::error::{ContextKind, ContextValue, ErrorKind}; + let mut err = clap::Error::new(ErrorKind::InvalidValue); + err.insert( + ContextKind::InvalidArg, + ContextValue::String("--outSAMattributes".into()), + ); + err.insert(ContextKind::InvalidValue, ContextValue::String(tok.clone())); + err.insert(ContextKind::Custom, ContextValue::String(e)); + err + })?; + acc |= flag; + } + *self = acc; + Ok(()) + } +} + +impl clap::Args for SamAttributes { + fn augment_args(cmd: clap::Command) -> clap::Command { + cmd.arg( + clap::Arg::new("outSAMattributes") + .long("outSAMattributes") + .num_args(1..) + .default_values(["Standard"]) + .help( + "SAM optional tags: Standard, All, None, or any combination of \ + NH HI AS NM nM MD jM jI XS RG.", + ), + ) + } + + fn augment_args_for_update(cmd: clap::Command) -> clap::Command { + Self::augment_args(cmd) + } +} + // --------------------------------------------------------------------------- // SAM output type enums // ---------------------------------------------------------------------------