Merged
55 changes: 55 additions & 0 deletions CHANGELOG.md
@@ -0,0 +1,55 @@
# Changelog
All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

<!-- ## [Unreleased] -->


## [0.6.0] - 2022-11-01
### Added
- Changelog
- `rank` column added to output file
- `database.generate_decoys` parameter, which turns off internal decoy generation. This enables the use of FASTA databases for SearchGUI/PeptideShaker

### Changed
- Base ProForma v2 notation is used for peptide modifications, e.g. "\[+304.2071\]-PEPTIDEM\[+15.9949\]AAC\[+57.0214\]H"
- `scannr` column now contains the full nativeID/spectrum title from the mzML file, e.g. "controllerType=0 controllerNumber=1 scan=30069"
- `discriminant_score` column renamed to `sage_discriminant_score` for PeptideShaker recognition
- `database.decoy_prefix` JSON option changed to `database.decoy_tag`. This allows decoy tagging to occur anywhere within the accession: "sp|P01234_REVERSED|HUMAN"
- Output file renamed: `results.pin` to `results.sage.tsv`
- Output file renamed: `quant.csv` to `quant.tsv`
- Rename `pin_paths` to `output_paths` in results.json file
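
Because `scannr` now carries the full nativeID string instead of a bare integer, downstream tools may want to recover the scan number. A minimal sketch of such a helper — hypothetical, not part of Sage — for Thermo-style nativeIDs:

```rust
// Hypothetical downstream helper (not part of Sage): extract the integer scan
// number from a Thermo-style nativeID such as
// "controllerType=0 controllerNumber=1 scan=30069".
fn scan_number(native_id: &str) -> Option<u32> {
    native_id
        .split_whitespace()
        .find_map(|token| token.strip_prefix("scan="))
        .and_then(|v| v.parse().ok())
}

fn main() {
    let id = "controllerType=0 controllerNumber=1 scan=30069";
    assert_eq!(scan_number(id), Some(30069));
    // IDs without a "scan=" token simply yield None.
    assert_eq!(scan_number("spectrum title without scan token"), None);
}
```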


## [0.5.1] - 2022-10-31
### Added
- Support for selenocysteine and pyrrolysine amino acids

## [0.5.0] - 2022-10-28
### Added
- Ability to directly read/write files from AWS S3

### Changed
- Files processed in parallel are now handled in batches of `num_cpus / 2` to avoid memory issues
- Fixed bug where `protein_fdr` was erroneously assigned to `peptide_fdr` output field
- Additional parallelization for assignment of PEP, FDR, writing output files

## [0.4.0] - 2022-10-18
### Added
- Label free quantification can be enabled by turning on `quant.lfq` JSON parameter
- Command line arguments can be used to override configuration file

## [0.3.1] - 2022-10-06
### Added
- Workflow contributions from [@wfondrie](https://github.com/wfondrie).

### Changed
- Don't parse empty MS2 spectra

## [0.3.0] - 2022-09-15
### Added
- Retention time prediction
- Ability to filter low-number b/y-ions for faster preliminary scoring (`database.min_ion_index` option)
- Ability to toggle retention time prediction (`predict_rt`)
7 changes: 3 additions & 4 deletions Cargo.lock


20 changes: 15 additions & 5 deletions README.md
Expand Up @@ -130,15 +130,24 @@ sage config.json s3://my-bucket/YYYY-MM-DD_expt_A_fraction_1.mzML.gz

Running Sage will produce several output files (located in either the current directory, or `output_directory` if that option is specified):
- Record of search parameters (`results.json`) will be created that details input/output paths and all search parameters used for the search
- MS2 search results will be stored as a Percolator-compatible (`search.pin`) file - this is a tab-separated file, which can be opened in Excel/Pandas/etc
- MS3 search results will be stored as a CSV (`quant.csv`) if `quant.tmt` option is used in the parameter file
- MS2 search results will be stored as a tab-separated file (`results.sage.tsv`), which can be opened in Excel/Pandas/etc
- MS3 search results will be stored as a tab-separated file (`quant.tsv`) if `quant.tmt` option is used in the parameter file

## Configuration file schema

Notes:
### Notes

- The majority of parameters are optional - only `database.fasta`, `precursor_tol`, and `fragment_tol` are required. Sage will try to use reasonable defaults for any parameters not supplied
- Tolerances are specified on the *experimental* m/z values. To perform a -100 to +500 Da open search (mass window applied to *precursor*), you would use `"da": [-500, 100]`
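
To make the sign convention concrete: if the experimental mass is E and the sought modification mass is Δ (so E = T + Δ for a theoretical peptide mass T), searching Δ in [-100, +500] Da means accepting theoretical candidates in [E - 500, E + 100]. A sketch of that arithmetic, with illustrative names that are not Sage internals:

```rust
// Illustrative only — not Sage's internal API. The precursor tolerance is
// applied to the *experimental* mass, so the open-search window bounds are the
// modification range negated and swapped.
fn candidate_mass_range(experimental_mass: f64, da_window: (f64, f64)) -> (f64, f64) {
    (experimental_mass + da_window.0, experimental_mass + da_window.1)
}

fn main() {
    // "da": [-500, 100] accepts theoretical masses from 500 Da below to
    // 100 Da above the experimental mass, i.e. modifications of -100 to +500 Da.
    let (lo, hi) = candidate_mass_range(2000.0, (-500.0, 100.0));
    assert_eq!((lo, hi), (1500.0, 2100.0));
}
```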

### Decoys

Using decoy sequences is critical to controlling the false discovery rate in proteomics experiments. Sage can use decoy sequences in the supplied FASTA file, or it can generate internal sequences. Sage reverses tryptic peptides (not proteins), so that the [picked-peptide](https://pubmed.ncbi.nlm.nih.gov/36166314/) approach to FDR can be used.

If `database.generate_decoys` is set to true (or unspecified), then decoy sequences in the FASTA database matching `database.decoy_tag` will be *ignored*, and Sage will internally generate decoys. It is __critical__ that you use the proper `decoy_tag` if your FASTA database contains decoys and internal decoy generation is turned on - otherwise Sage will treat the supplied decoys as hits!

Internally generated decoys will have protein accessions matching "{decoy_tag}{accession}", e.g. if `decoy_tag` is "rev_" then a protein accession like "rev_sp|P01234|HUMAN" will be listed in the output file.
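
A rough sketch of what peptide-level decoy generation looks like under one common scheme — reversing the peptide while keeping the C-terminal cleavage residue fixed, then prepending the tag to the accession. This illustrates the idea only; Sage's exact reversal rules may differ:

```rust
// Illustration of peptide-level decoy generation; Sage's exact implementation
// may differ. The C-terminal residue (tryptic cleavage site, K/R) stays fixed
// so the decoy remains a plausible tryptic peptide.
fn reverse_tryptic(peptide: &str) -> String {
    if peptide.len() < 2 {
        return peptide.to_string();
    }
    let (body, last) = peptide.split_at(peptide.len() - 1);
    let mut decoy: String = body.chars().rev().collect();
    decoy.push_str(last);
    decoy
}

// Internally generated decoys get "{decoy_tag}{accession}" as their accession.
fn tag_accession(decoy_tag: &str, accession: &str) -> String {
    format!("{}{}", decoy_tag, accession)
}

fn main() {
    assert_eq!(reverse_tryptic("PEPTIDEK"), "EDITPEPK");
    assert_eq!(tag_accession("rev_", "sp|P01234|HUMAN"), "rev_sp|P01234|HUMAN");
}
```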

```jsonc
// Note that json does not allow comments, they are here just as explanation
// but need to be removed in a real config.json file
Expand All @@ -161,8 +170,9 @@ Notes:
"variable_mods": { // Optional[Dict[char, float]] {default={}}, variable modifications
"M": 15.9949 // Variable mods are applied *before* static mods
},
"decoy_prefix": "rev_", // Optional[str] {default="rev_"}: Prefix appended to decoy proteins
"fasta": "dual.fasta" // str: mandatory path to fasta file
"decoy_tag": "rev_", // Optional[str] {default="rev_"}: See notes above
    "generate_decoys": false, // Optional[bool] {default=true}: Ignore decoys in FASTA database matching `decoy_tag`
"fasta": "dual.fasta" // str: mandatory path to FASTA file
},
"quant": { // Optional - specify only if TMT or LFQ
"tmt": "Tmt16", // Optional[str] {default=null}, one of "Tmt6", "Tmt10", "Tmt11", "Tmt16", or "Tmt18"
Expand Down
2 changes: 1 addition & 1 deletion crates/sage-cli/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "sage-cli"
version = "0.5.1"
version = "0.6.0"
authors = ["Michael Lazear <[email protected]>"]
edition = "2021"
rust-version = "1.62"
Expand Down
66 changes: 41 additions & 25 deletions crates/sage-cli/src/main.rs
Expand Up @@ -27,7 +27,7 @@ struct Search {
predict_rt: bool,
parallel: bool,
mzml_paths: Vec<String>,
pin_paths: Vec<String>,
output_paths: Vec<String>,

#[serde(skip_serializing)]
output_directory: CloudPath,
Expand Down Expand Up @@ -101,7 +101,7 @@ impl Input {
deisotope: self.deisotope.unwrap_or(true),
chimera: self.chimera.unwrap_or(false),
predict_rt: self.predict_rt.unwrap_or(true),
pin_paths: Vec::new(),
output_paths: Vec::new(),
})
}
}
Expand Down Expand Up @@ -184,13 +184,14 @@ impl Runner {
path
}

fn serialize_feature(&self, feature: &Feature) -> csv::ByteRecord {
fn serialize_feature(&self, feature: &Feature, filenames: &[String]) -> csv::ByteRecord {
let mut record = csv::ByteRecord::new();
record.push_field(feature.peptide.as_str().as_bytes());
record.push_field(feature.proteins.as_str().as_bytes());
record.push_field(itoa::Buffer::new().format(feature.num_proteins).as_bytes());
record.push_field(self.parameters.mzml_paths[feature.file_id].as_bytes());
record.push_field(itoa::Buffer::new().format(feature.scannr).as_bytes());
record.push_field(filenames[feature.file_id].as_bytes());
record.push_field(feature.spec_id.as_bytes());
record.push_field(itoa::Buffer::new().format(feature.rank).as_bytes());
record.push_field(itoa::Buffer::new().format(feature.label).as_bytes());
record.push_field(ryu::Buffer::new().format(feature.expmass).as_bytes());
record.push_field(ryu::Buffer::new().format(feature.calcmass).as_bytes());
Expand Down Expand Up @@ -245,8 +246,13 @@ impl Runner {
record
}

fn write_features(&self, features: Vec<Feature>) -> anyhow::Result<String> {
let path = self.make_path("search.pin");
fn write_features(
&self,
features: Vec<Feature>,
filenames: &[String],
) -> anyhow::Result<String> {
let path = self.make_path("results.sage.tsv");

let mut wtr = csv::WriterBuilder::new()
.delimiter(b'\t')
.from_writer(vec![]);
Expand All @@ -255,8 +261,9 @@ impl Runner {
"peptide",
"proteins",
"num_proteins",
"file",
"filename",
"scannr",
"rank",
"label",
"expmass",
"calcmass",
Expand All @@ -278,23 +285,18 @@ impl Runner {
"matched_intensity_pct",
"scored_candidates",
"poisson",
"discriminant_score",
"sage_discriminant_score",
"posterior_error",
"spectrum_fdr",
"peptide_fdr",
"protein_fdr",
"ms1_intensity",
]);

// for feat in features {
// wtr.serialize(feat)?;
// }

wtr.write_byte_record(&headers)?;
for record in features
// .par_iter()
.into_par_iter()
.map(|feat| self.serialize_feature(&feat))
.map(|feat| self.serialize_feature(&feat, filenames))
.collect::<Vec<_>>()
{
wtr.write_byte_record(&record)?;
Expand All @@ -306,10 +308,12 @@ impl Runner {
Ok(path.to_string())
}

fn write_quant(&self, quant: &[TmtQuant]) -> anyhow::Result<String> {
let path = self.make_path("quant.csv");
fn write_quant(&self, quant: &[TmtQuant], filenames: &[String]) -> anyhow::Result<String> {
let path = self.make_path("quant.tsv");

let mut wtr = csv::WriterBuilder::new().from_writer(vec![]);
let mut wtr = csv::WriterBuilder::new()
.delimiter(b'\t')
.from_writer(vec![]);
let mut headers = csv::ByteRecord::from(vec!["file", "scannr", "ion_injection_time"]);
headers.extend(
self.parameters
Expand All @@ -326,8 +330,8 @@ impl Runner {
.into_par_iter()
.map(|q| {
let mut record = csv::ByteRecord::new();
record.push_field(itoa::Buffer::new().format(q.file_id).as_bytes());
record.push_field(itoa::Buffer::new().format(q.scannr).as_bytes());
record.push_field(filenames[q.file_id].as_bytes());
record.push_field(q.spec_id.as_bytes());
record.push_field(ryu::Buffer::new().format(q.ion_injection_time).as_bytes());
for peak in &q.peaks {
record.push_field(ryu::Buffer::new().format(*peak).as_bytes());
Expand Down Expand Up @@ -491,20 +495,32 @@ impl Runner {
info!("discovered {} peptides at 1% FDR", q_peptide);
info!("discovered {} proteins at 1% FDR", q_protein);

let filenames = self
.parameters
.mzml_paths
.iter()
.map(|s| {
s.parse::<CloudPath>()
.ok()
.and_then(|c| c.filename().map(|s| s.to_string()))
.unwrap_or_else(|| s.clone())
})
.collect::<Vec<_>>();

self.parameters
.pin_paths
.push(self.write_features(outputs.features)?);
.output_paths
.push(self.write_features(outputs.features, &filenames)?);
if !outputs.quant.is_empty() {
self.parameters
.pin_paths
.push(self.write_quant(&outputs.quant)?);
.output_paths
.push(self.write_quant(&outputs.quant, &filenames)?);
}

let run_time = (Instant::now() - self.start).as_secs();
info!("finished in {}s", run_time);

let path = self.make_path("results.json");
self.parameters.pin_paths.push(path.to_string());
self.parameters.output_paths.push(path.to_string());
println!("{}", serde_json::to_string_pretty(&self.parameters)?);

let bytes = serde_json::to_vec_pretty(&self.parameters)?;
Expand Down
2 changes: 1 addition & 1 deletion crates/sage-cloudpath/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "sage-cloudpath"
version = "0.5.1"
version = "0.6.0"
authors = ["Michael Lazear <[email protected]>"]
edition = "2021"
rust-version = "1.62"
Expand Down
9 changes: 8 additions & 1 deletion crates/sage-cloudpath/src/lib.rs
Expand Up @@ -4,7 +4,7 @@ use http::Uri;
use sage_core::mzml::{MzMLError, MzMLReader, Spectrum};
use std::path::PathBuf;
use std::str::FromStr;
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWriteExt, BufReader};
use tokio::io::{AsyncBufRead, AsyncRead, AsyncWriteExt, BufReader};

static S3_CLIENT: once_cell::sync::OnceCell<aws_sdk_s3::Client> = once_cell::sync::OnceCell::new();

Expand Down Expand Up @@ -82,6 +82,13 @@ impl CloudPath {
}
}

pub fn filename(&self) -> Option<&str> {
match self {
CloudPath::S3 { key, .. } => key.split('/').last(),
CloudPath::Local(path) => path.file_name().and_then(|s| s.to_str()),
}
}

/// Does the path end in "gz" or "gzip"?
fn gzip_heuristic(&self) -> bool {
match &self {
Expand Down
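
The new `CloudPath::filename` method above is what lets the output files report a friendly file name instead of a full mzML path or S3 URI. A self-contained sketch of the same logic, re-declaring a minimal stand-in type since the real one lives in `sage-cloudpath` and carries more fields:

```rust
use std::path::PathBuf;

// Minimal stand-in for sage-cloudpath's CloudPath, re-declared so this sketch
// compiles on its own; the real enum has additional variants/fields.
enum CloudPath {
    S3 { key: String },
    Local(PathBuf),
}

impl CloudPath {
    // Same idea as the PR's filename(): the last '/'-separated component for
    // S3 keys, the OS file name for local paths.
    fn filename(&self) -> Option<&str> {
        match self {
            CloudPath::S3 { key } => key.split('/').last(),
            CloudPath::Local(path) => path.file_name().and_then(|s| s.to_str()),
        }
    }
}

fn main() {
    let s3 = CloudPath::S3 { key: "runs/2022/frac_1.mzML.gz".into() };
    let local = CloudPath::Local(PathBuf::from("/data/frac_1.mzML.gz"));
    assert_eq!(s3.filename(), Some("frac_1.mzML.gz"));
    assert_eq!(local.filename(), Some("frac_1.mzML.gz"));
}
```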
3 changes: 1 addition & 2 deletions crates/sage/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "sage-core"
version = "0.5.1"
version = "0.6.0"
authors = ["Michael Lazear <[email protected]>"]
edition = "2021"
rust-version = "1.62"
Expand All @@ -14,7 +14,6 @@ license = "MIT"
async-compression = { version = "0.3", features = ["tokio", "gzip", "zlib"] }
base64 = "0.13"
log = "0.4.0"
regex = "1.6"
tokio = { version = "1.0", features = ["io-util"] }
rayon = "1.5"
serde = { version="1.0", features = ["derive"] }
Expand Down
12 changes: 8 additions & 4 deletions crates/sage/src/database.rs
Expand Up @@ -37,7 +37,9 @@ pub struct Builder {
/// Variable modifications to add to matching amino acids
pub variable_mods: Option<HashMap<char, f32>>,
/// Use this prefix for decoy proteins
pub decoy_prefix: Option<String>,
pub decoy_tag: Option<String>,

pub generate_decoys: Option<bool>,
/// Path to fasta database
pub fasta: Option<PathBuf>,
}
Expand Down Expand Up @@ -71,10 +73,11 @@ impl Builder {
peptide_min_mass: self.peptide_min_mass.unwrap_or(500.0),
peptide_max_mass: self.peptide_max_mass.unwrap_or(5000.0),
min_ion_index: self.min_ion_index.unwrap_or(2),
decoy_prefix: self.decoy_prefix.unwrap_or_else(|| "rev_".into()),
decoy_tag: self.decoy_tag.unwrap_or_else(|| "rev_".into()),
missed_cleavages: self.missed_cleavages.unwrap_or(0),
static_mods: Self::validate_mods(self.static_mods),
variable_mods: Self::validate_mods(self.variable_mods),
generate_decoys: self.generate_decoys.unwrap_or(true),
fasta: self.fasta.expect("A fasta file must be provided!"),
}
}
Expand All @@ -97,7 +100,8 @@ pub struct Parameters {
missed_cleavages: u8,
static_mods: HashMap<char, f32>,
variable_mods: HashMap<char, f32>,
decoy_prefix: String,
decoy_tag: String,
generate_decoys: bool,
pub fasta: PathBuf,
}

Expand Down Expand Up @@ -140,7 +144,7 @@ impl Parameters {
self.peptide_min_len,
self.peptide_max_len,
);
let fasta = Fasta::open(&self.fasta, &self.decoy_prefix)?;
let fasta = Fasta::open(&self.fasta, self.decoy_tag.clone(), self.generate_decoys)?;

let (target_decoys, peptide_graph) = self.digest(&fasta, &trypsin);

Expand Down
Loading