From 17021c413d5cda4186b62caff722cbd8ab8dac8c Mon Sep 17 00:00:00 2001
From: Oscar Aspelin
Date: Tue, 22 Jul 2025 13:20:18 +0200
Subject: [PATCH 1/6] Add gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2f7896d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+target/

From 64b045fc9d28a66ccecb9e728f7184261a4b17d4 Mon Sep 17 00:00:00 2001
From: Oscar Aspelin
Date: Sun, 3 Aug 2025 13:42:25 +0200
Subject: [PATCH 2/6] Add proper .gitignore file

---
 .gitignore | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/.gitignore b/.gitignore
index 2f7896d..d49725c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,14 @@
+# Generated by Cargo
+# will have compiled files and executables
+debug/
 target/
+
+# These are backup files generated by rustfmt
+**/*.rs.bk
+
+# MSVC Windows builds of rustc generate these, which store debugging information
+*.pdb
+
+# Generated by cargo mutants
+# Contains mutation testing data
+**/mutants.out*/

From 46439271ddb3ba5a24190e72f86d5a94c639c645 Mon Sep 17 00:00:00 2001
From: Oscar Aspelin
Date: Sun, 3 Aug 2025 13:46:22 +0200
Subject: [PATCH 3/6] Rename Example_data to example_data and update README

---
 README.md                                      | 6 +++---
 {Example_data => example_data}/test_data.fastq | 0
 2 files changed, 3 insertions(+), 3 deletions(-)
 rename {Example_data => example_data}/test_data.fastq (100%)

diff --git a/README.md b/README.md
index 4001fb9..c712002 100644
--- a/README.md
+++ b/README.md
@@ -33,12 +33,12 @@ cargo build --release
 The executable is then located in folder `target/release`.
 
 ### Testing the installation
-Run `target/release/isONclust3 --fastq Example_data/test_data.fastq --mode ont --outfolder Example_out --seeding minimizer --post-cluster`.
+Run `target/release/isONclust3 --fastq example_data/test_data.fastq --mode ont --outfolder example_out --seeding minimizer --post-cluster`.
 
-This generates an output directory in the repository folder. The fastq_files folder inside clustering should now contain 94 fastq files(each representing one cluster).
+This generates an output directory in the repository folder. The fastq_files folder inside clustering should now contain 94 fastq files (each representing one cluster).
 
 # Running isONclust3
-IsONclust3 can be used on either Pacbio data or ONT data.
+IsONclust3 can be used on either PacBio data or ONT data.
``` isONclust3 --fastq {input.fastq} --mode ont --outfolder {outfolder} # Oxford Nanopore reads diff --git a/Example_data/test_data.fastq b/example_data/test_data.fastq similarity index 100% rename from Example_data/test_data.fastq rename to example_data/test_data.fastq From 8d29c86ea85f4082abe9e16b16698d0d37091dbe Mon Sep 17 00:00:00 2001 From: Oscar Aspelin Date: Sun, 3 Aug 2025 13:47:12 +0200 Subject: [PATCH 4/6] Move cpu native rustflag to config.toml and add log + simple_logger crates --- .cargo/config.toml | 2 + Cargo.lock | 177 ++++++++++++++++++++++++++++++++++++++++++--- Cargo.toml | 8 +- 3 files changed, 175 insertions(+), 12 deletions(-) create mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 0000000..ddff440 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,2 @@ +[build] +rustflags = ["-C", "target-cpu=native"] diff --git a/Cargo.lock b/Cargo.lock index b629e12..195db7d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -270,6 +270,16 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +[[package]] +name = "colored" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" +dependencies = [ + "lazy_static", + "windows-sys 0.59.0", +] + [[package]] name = "crossbeam-deque" version = "0.8.5" @@ -322,6 +332,15 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ef8ae57c4978a2acd8b869ce6b9ca1dfe817bff704c220209fdef2c0b75a01b9" +[[package]] +name = "deranged" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e" +dependencies = [ + "powerfmt", +] + [[package]] name = "derive-new" version = "0.6.0" @@ -445,11 +464,13 @@ dependencies = [ "bio-seq", "clap", "clap-cargo", + "log", "memory-stats", "minimizer-iter", "nohash-hasher", "rayon", "rustc-hash", + "simple_logger", ] [[package]] @@ -500,6 +521,12 @@ version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa" +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + [[package]] name = "matrixmultiply" version = "0.3.9" @@ -628,6 +655,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-integer" version = "0.1.46" @@ -657,6 +690,15 @@ dependencies = [ "libm", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + [[package]] name = "ordered-float" version = "3.9.2" @@ -682,6 +724,12 @@ dependencies = [ "indexmap", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] name = "ppv-lite86" version = "0.2.20" @@ -885,6 +933,18 @@ dependencies = [ 
"wide", ] +[[package]] +name = "simple_logger" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e8c5dfa5e08767553704aa0ffd9d9794d527103c736aba9854773851fd7497eb" +dependencies = [ + "colored", + "log", + "time", + "windows-sys 0.48.0", +] + [[package]] name = "statrs" version = "0.16.1" @@ -990,6 +1050,39 @@ dependencies = [ "syn 2.0.89", ] +[[package]] +name = "time" +version = "0.3.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" +dependencies = [ + "deranged", + "itoa", + "libc", + "num-conv", + "num_threads", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c" + +[[package]] +name = "time-macros" +version = "0.2.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49" +dependencies = [ + "num-conv", + "time-core", +] + [[package]] name = "triple_accel" version = "0.4.0" @@ -1039,13 +1132,22 @@ dependencies = [ "safe_arch", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", ] [[package]] @@ -1054,7 +1156,22 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" +dependencies = [ + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", ] [[package]] @@ -1063,28 +1180,46 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" + [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" + [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_i686_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" + [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -1097,24 +1232,48 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" + [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_x86_64_gnu" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" + [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" + [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_msvc" +version = "0.48.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + [[package]] name = "windows_x86_64_msvc" version = "0.52.6" diff --git a/Cargo.toml b/Cargo.toml index a570be2..2c9bb35 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,20 +1,22 @@ -rustflags = ["-Ctarget-cpu=native"] + [package] name = "isONclust3" version = "0.3.0" edition = "2021" repository = "https://github.com/aljpetri/isONclust3" -license="MIT" +license = "MIT" description = "Rust implementation of a novel de novo clustering algorithm. isONclust3 is a tool for clustering either PacBio Iso-Seq reads, or Oxford Nanopore reads into clusters, where each cluster represents all reads that came from a gene family. Output is a tsv file with each read assigned to a cluster-ID and a folder 'fastq' containing one fastq file per cluster generated. Detailed information is available in the isONclust3 paper." 
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] bio = "1" bio-seq = "0.10.0" -clap = { version = "4.4.3", features = ["derive","cargo"] } +clap = { version = "4.4.3", features = ["derive", "cargo"] } clap-cargo = "0.12.0" rayon = "1.7" memory-stats = "1.0.0" rustc-hash = "1.1.0" minimizer-iter = "1.2.1" nohash-hasher = "0.2.0" +log = "0.4.27" +simple_logger = "5.0.0" From 588b8fb1d9a508e84140fba4a9c84ed686a012f4 Mon Sep 17 00:00:00 2001 From: Oscar Aspelin Date: Sun, 3 Aug 2025 14:06:58 +0200 Subject: [PATCH 5/6] Add simple_logger init and replace println! with log macros. Set logging level based on --verbose argument. --- src/clustering.rs | 295 +++++++----- src/generate_sorted_fastq_for_cluster.rs | 197 +++++--- src/gff_handling.rs | 250 ++++++---- src/main.rs | 450 +++++++++++------- src/seeding_and_filtering_seeds.rs | 580 ++++++++++++----------- src/write_output.rs | 163 ++++--- 6 files changed, 1164 insertions(+), 771 deletions(-) diff --git a/src/clustering.rs b/src/clustering.rs index 19a22a3..4cda6f3 100644 --- a/src/clustering.rs +++ b/src/clustering.rs @@ -1,46 +1,56 @@ -use std::time::Instant; use crate::structs::Minimizer_hashed; +use std::time::Instant; -use rustc_hash::{FxHashMap, FxHashSet}; -use rayon::prelude::*; use crate::{Cluster_ID_Map, Seed_Map}; +use log::debug; +use rayon::prelude::*; +use rustc_hash::{FxHashMap, FxHashSet}; pub(crate) fn reverse_complement(dna: &str) -> String { - let reverse_complement: String = dna.chars() - .rev()//TODO: test whether into_par_iter works here + let reverse_complement: String = dna + .chars() + .rev() //TODO: test whether into_par_iter works here .map(|c| match c { 'A' => 'T', 'T' => 'A', 'C' => 'G', 'G' => 'C', _ => c, - }).clone() + }) + .clone() .collect(); reverse_complement } - -fn calculate_shared_perc(nr_sign_minis: usize,value: i32)->f64{ +fn calculate_shared_perc(nr_sign_minis: usize, value: i32) -> f64 { value as f64 / nr_sign_minis as f64 } - -fn new_Fx_hashset()->FxHashSet{ - let return_set:FxHashSet= FxHashSet::default(); +fn new_Fx_hashset() -> FxHashSet { + let return_set: FxHashSet = FxHashSet::default(); return_set } -fn detect_whether_shared(min_shared_minis:f64, shared_seed_infos: &FxHashMap, minimizers: &[Minimizer_hashed]) -> (bool, i32) { - let mut most_shared= 0.0; - let mut shared= false; - let mut most_shared_cluster= -1; - let nr_minis= minimizers.len(); +fn detect_whether_shared( + min_shared_minis: f64, + shared_seed_infos: &FxHashMap, + minimizers: &[Minimizer_hashed], +) -> (bool, i32) { + let mut most_shared = 0.0; + let mut shared = false; + let mut most_shared_cluster = -1; + let nr_minis = minimizers.len(); let mut shared_perc: f64; - for (key, nr_shared) in shared_seed_infos {//TODO: test whether into_par_iter works here + for (key, nr_shared) in shared_seed_infos { + //TODO: test whether into_par_iter works here //we have more shared minis with the cluster than our threshold and this is the cluster we share the most minimizers with shared_perc = calculate_shared_perc(nr_minis, *nr_shared); - //println!("shared percentage between read and cluster {} : {}",key, shared_perc); - if shared_perc > min_shared_minis && shared_perc > most_shared{//} && *nr_shared >=0 { + debug!( + "shared percentage between read and cluster {} : {}", + key, shared_perc + ); + if shared_perc > min_shared_minis && shared_perc > most_shared { + //} && *nr_shared >=0 { most_shared = shared_perc; most_shared_cluster = *key; if !shared { @@ -48,52 +58,64 @@ fn 
detect_whether_shared(min_shared_minis:f64, shared_seed_infos: &FxHashMapnr shared minimizers with clusters->not updated when cluster changes! //clustering method for the case that we do not have any annotation to compare the reads against //shared_seed_infos: hashmap that holds read_id->nr shared minimizers with clusters->not updated when cluster changes! //clustering method for the case that we do not have any annotation to compare the reads against -pub(crate) fn cluster(sign_minis: &Vec, min_shared_minis:f64, minimizers: &Vec, clusters:&mut Cluster_ID_Map, cluster_map: &mut Seed_Map, id: i32, shared_seed_infos_norm_vec: &mut [i32]){ +pub(crate) fn cluster( + sign_minis: &Vec, + min_shared_minis: f64, + minimizers: &Vec, + clusters: &mut Cluster_ID_Map, + cluster_map: &mut Seed_Map, + id: i32, + shared_seed_infos_norm_vec: &mut [i32], +) { //clusters contains the main result we are interested in: it will contain the cluster id as key and the read_ids of reads from the cluster as value //cluster_map contains a hashmap in which we have a hash_value for a minimizer as key and a vector of ids as a value - let cl_id:i32 = clusters.len() as i32; + let cl_id: i32 = clusters.len() as i32; //we do not yet have a cluster and therefore need to fill the first read into the first - if clusters.is_empty(){ + if clusters.is_empty() { for minimizer in sign_minis { //fill cluster_map with the minimizers that we found in the first read - cluster_map - .entry(minimizer.sequence) - .or_default(); + cluster_map.entry(minimizer.sequence).or_default(); let vect = cluster_map.get_mut(&minimizer.sequence).unwrap(); if !vect.contains(&cl_id) { vect.push(cl_id); } } - let id_vec=vec![id]; - clusters.insert(cl_id,id_vec); + let id_vec = vec![id]; + clusters.insert(cl_id, id_vec); } //entry represents a read in our data //if we already have at least one cluster: compare the new read to the cluster(s) else { - let mut most_shared_cluster= -1; - let mut shared= false; + let mut most_shared_cluster = -1; + let mut shared = false; // println!("shared_seed_infos_norm_vec BEFORE: {:?}", shared_seed_infos_norm_vec); - for minimizer in minimizers{//TODO: test whether into_par_iter works here + for minimizer in minimizers { + //TODO: test whether into_par_iter works here //if we find the minimizer hash in cluster_map: store the clusters in belongs_to if let Some(belongs_to) = cluster_map.get(&minimizer.sequence) { //iterate over belongs_to to update the counts of shared minimizers for each cluster - for &belong_cluster in belongs_to {//TODO: test whether into_par_iter works here + for &belong_cluster in belongs_to { + //TODO: test whether into_par_iter works here //iterate over belongs_to to update the counts of shared minimizers for each cluster shared_seed_infos_norm_vec[belong_cluster as usize] += 1; } } } - // we have found clusters to compare to - if let Some((max_cluster_id, max_shared)) = shared_seed_infos_norm_vec.iter().enumerate().max_by_key(|&(_, value)| value) { - let nr_minis= minimizers.len(); + // we have found clusters to compare to + if let Some((max_cluster_id, max_shared)) = shared_seed_infos_norm_vec + .iter() + .enumerate() + .max_by_key(|&(_, value)| value) + { + let nr_minis = minimizers.len(); let mut shared_perc: f64; //we have more shared minis with the cluster than our threshold and this is the cluster we share the most minimizers with shared_perc = calculate_shared_perc(nr_minis, *max_shared); @@ -110,7 +132,10 @@ pub(crate) fn cluster(sign_minis: &Vec, min_shared_minis:f64, read_list.push(id); } 
//the following for-loop updates cluster_map - //println!("PUSHING HERE most_shared_cluster: {} {}", most_shared_cluster, &most_shared_cluster); + debug!( + "PUSHING HERE most_shared_cluster: {} {}", + most_shared_cluster, &most_shared_cluster + ); for sign_mini in sign_minis { cluster_map //TODO: test whether into_par_iter works here .entry(sign_mini.sequence) @@ -118,13 +143,13 @@ pub(crate) fn cluster(sign_minis: &Vec, min_shared_minis:f64, let vect = cluster_map.get_mut(&sign_mini.sequence).unwrap(); if !vect.contains(&most_shared_cluster) { vect.push(most_shared_cluster); - //println!("vect: {:?}", vect); + debug!("vect: {:?}", vect); } } } //we did not find a cluster that we could put the read into-> generate a new cluster - else{ - if !sign_minis.is_empty(){ + else { + if !sign_minis.is_empty() { for sign_mini in sign_minis { cluster_map //TODO: test whether into_par_iter works here .entry(sign_mini.sequence) @@ -136,25 +161,26 @@ pub(crate) fn cluster(sign_minis: &Vec, min_shared_minis:f64, } } } - let id_vec=vec![id]; - clusters.insert(cl_id,id_vec); + let id_vec = vec![id]; + clusters.insert(cl_id, id_vec); } } } - //takes clusters_map as input and generates cl_set_map: a Hashmap containing the cluster id as key and a hashset of seedhashes as value. -fn generate_cluster_merging_ds(cl_set_map: &mut FxHashMap>, clusters_map: &mut Seed_Map){ +fn generate_cluster_merging_ds( + cl_set_map: &mut FxHashMap>, + clusters_map: &mut Seed_Map, +) { //cl_set_map is a hashmap with cl_id -> Hashset of seed hashes //iterate over clusters_map for (mini, vec_of_ids) in clusters_map { //iterate over the ids that we have stored in the value of each minimizer for id in vec_of_ids.iter() { //the cluster is already in the cluster_seeds_hash_map ->add the seed hash to the hashset, otherwise add new hashset with the seed hash - if cl_set_map.contains_key(id){ + if cl_set_map.contains_key(id) { cl_set_map.get_mut(id).unwrap().push(*mini); - } - else{ + } else { let this_set: Vec = vec![*mini]; cl_set_map.insert(*id, this_set); } @@ -162,22 +188,26 @@ fn generate_cluster_merging_ds(cl_set_map: &mut FxHashMap>, cluste } } - - //helper function for the post_clustering step: Updates the 'clusters' and 'clusters_map' data structures -fn update_clusters(clusters: &mut Cluster_ID_Map, clusters_map: &mut Seed_Map, small_hs: &[u64], large_cluster_id: &i32, small_cluster_id:&i32){ - //println!("attempt: {} into {}",small_cluster_id,large_cluster_id); +fn update_clusters( + clusters: &mut Cluster_ID_Map, + clusters_map: &mut Seed_Map, + small_hs: &[u64], + large_cluster_id: &i32, + small_cluster_id: &i32, +) { + debug!("attempt: {} into {}", small_cluster_id, large_cluster_id); //get the infos of clusters that belong to the two clusters we want to merge - let small_cl_info= clusters.remove(small_cluster_id).unwrap(); - let large_cl_info= clusters.get_mut(large_cluster_id).unwrap(); + let small_cl_info = clusters.remove(small_cluster_id).unwrap(); + let large_cl_info = clusters.get_mut(large_cluster_id).unwrap(); //add the reads of the small cluster into the large cluster large_cl_info.extend(small_cl_info); //clusters.remove_entry(small_cluster_id); //also add the hashes of the small cluster into the large cluster - if !small_hs.is_empty(){ - for seed_hash in small_hs{ - let mut cl_vec= clusters_map.get_mut(seed_hash).unwrap(); - if !cl_vec.contains(large_cluster_id){ + if !small_hs.is_empty() { + for seed_hash in small_hs { + let mut cl_vec = clusters_map.get_mut(seed_hash).unwrap(); + if 
!cl_vec.contains(large_cluster_id) { cl_vec.push(*large_cluster_id); } //delete small_cluster_id from the seed hashes, so we do not have any indication of the cluster in the ds @@ -186,15 +216,20 @@ fn update_clusters(clusters: &mut Cluster_ID_Map, clusters_map: &mut Seed_Map, s } } - - - -fn detect_overlaps( cl_set_map: &FxHashMap>, cluster_map: &mut Seed_Map, merge_into: &mut Vec<(i32,i32)>, min_shared_minis: f64, small_hs: &mut FxHashSet, shared_seed_infos_vec: &mut [i32], verbose:bool ){ +fn detect_overlaps( + cl_set_map: &FxHashMap>, + cluster_map: &mut Seed_Map, + merge_into: &mut Vec<(i32, i32)>, + min_shared_minis: f64, + small_hs: &mut FxHashSet, + shared_seed_infos_vec: &mut [i32], + verbose: bool, +) { //shared_seed_infos_vec: a vector - let mut cl_sorted: Vec<(&i32,&Vec)> = cl_set_map.iter().collect(); + let mut cl_sorted: Vec<(&i32, &Vec)> = cl_set_map.iter().collect(); cl_sorted.sort_by_key(|&(_, v)| v.len()); - for (cl_id,hashes) in cl_sorted{ - //for (cl_id, hashes) in cl_set_map{ + for (cl_id, hashes) in cl_sorted { + //for (cl_id, hashes) in cl_set_map{ //iterate over the hashes for each cl_id for hash in hashes.iter() { if let Some(belongs_to) = cluster_map.get(hash) { @@ -207,32 +242,42 @@ fn detect_overlaps( cl_set_map: &FxHashMap>, cluster_map: &mut Seed } } } - if let Some((max_cluster_id, max_shared)) = shared_seed_infos_vec.iter().enumerate().max_by_key(|&(_, value)| value) { - let nr_minis= hashes.len(); + if let Some((max_cluster_id, max_shared)) = shared_seed_infos_vec + .iter() + .enumerate() + .max_by_key(|&(_, value)| value) + { + let nr_minis = hashes.len(); let shared_perc: f64; - let most_shared_cluster_id= max_cluster_id as i32; + let most_shared_cluster_id = max_cluster_id as i32; //calculate the percentage of shared minimizers shared_perc = calculate_shared_perc(nr_minis, *max_shared); //We only merge if we share more than min_shared_minis if shared_perc > min_shared_minis { - if verbose{ + if verbose { println!("CL_ID {}, msc {}", cl_id, most_shared_cluster_id); - println!("nr_minis {}, max_shared {}, shared_perc {}",nr_minis, max_shared,shared_perc); + println!( + "nr_minis {}, max_shared {}, shared_perc {}", + nr_minis, max_shared, shared_perc + ); } - //println!("ENTERING MERGE"); + debug!("ENTERING MERGE"); //if this cluster has less minimizers than most_shared_cluster and most_shared_cluster is not in small_hs (does not get merged into another cluster) - if nr_minis < cl_set_map.get(&most_shared_cluster_id).unwrap().len() && !small_hs.contains(&most_shared_cluster_id) { - if !merge_into.contains(&(*cl_id,most_shared_cluster_id)){ + if nr_minis < cl_set_map.get(&most_shared_cluster_id).unwrap().len() + && !small_hs.contains(&most_shared_cluster_id) + { + if !merge_into.contains(&(*cl_id, most_shared_cluster_id)) { //add the info to merge_into that we want to merge cl_id into most_shared_cluster merge_into.push((*cl_id, most_shared_cluster_id)); small_hs.insert(*cl_id); } - } - - else {//the clusters have exactly the same number of seeds + } else { + //the clusters have exactly the same number of seeds //if cl_id is smaller than most_shared_cluster_id (we need some kind of merging same size clusters) - if *cl_id < most_shared_cluster_id && !small_hs.contains(&most_shared_cluster_id){ - if !merge_into.contains(&(*cl_id,most_shared_cluster_id)) { + if *cl_id < most_shared_cluster_id + && !small_hs.contains(&most_shared_cluster_id) + { + if !merge_into.contains(&(*cl_id, most_shared_cluster_id)) { //add the info to merge_into that we want to merge 
cl_id into most_shared_cluster merge_into.push((*cl_id, most_shared_cluster_id)); small_hs.insert(*cl_id); @@ -242,87 +287,116 @@ fn detect_overlaps( cl_set_map: &FxHashMap>, cluster_map: &mut Seed } } // clear count vector for next cluster - for item in shared_seed_infos_vec.iter_mut(){ + for item in shared_seed_infos_vec.iter_mut() { *item = 0; } } } - -fn merge_clusters_from_merge_into(merge_into: &mut Vec<(i32,i32)>, clusters_map: &mut Seed_Map, clusters: &mut Cluster_ID_Map, cl_set_map: &mut FxHashMap>, small_hs: &FxHashSet, not_large: &mut FxHashSet){ - //println!("Merge_into_len: {}",merge_into.len()); - for (id , value) in merge_into{ +fn merge_clusters_from_merge_into( + merge_into: &mut Vec<(i32, i32)>, + clusters_map: &mut Seed_Map, + clusters: &mut Cluster_ID_Map, + cl_set_map: &mut FxHashMap>, + small_hs: &FxHashSet, + not_large: &mut FxHashSet, +) { + debug!("Merge_into_len: {}", merge_into.len()); + for (id, value) in merge_into { let large_id = value; //we might already have deleted large_id from clusters during this iteration if clusters.contains_key(large_id) { //idea here: we merge the ids into larger clusters first, smaller clusters are still bound to merge into the new cluster later - if !small_hs.contains(large_id){ + if !small_hs.contains(large_id) { //merge_clusters( clusters, clusters_map, cl_set_map, large_id, id) let small_hs: &Vec = cl_set_map.get(id).unwrap(); update_clusters(clusters, clusters_map, small_hs, large_id, id); - } - else{ + } else { not_large.insert(*large_id); } } } } - -pub(crate) fn cluster_merging(clusters: &mut Cluster_ID_Map, cluster_map: &mut Seed_Map, min_shared_minis:f64, shared_seed_infos_vec: &mut Vec, verbose: bool){ +pub(crate) fn cluster_merging( + clusters: &mut Cluster_ID_Map, + cluster_map: &mut Seed_Map, + min_shared_minis: f64, + shared_seed_infos_vec: &mut Vec, + verbose: bool, +) { //let min_shared_minis_pc = 0.5; - println!("min_shared_minis_pc: {}",min_shared_minis); + println!("min_shared_minis_pc: {}", min_shared_minis); //cl_set_map is a hashmap with cl_id -> Hashset of seed hashes - let mut cl_set_map: FxHashMap> = FxHashMap::default(); - if verbose{ - println!("Cl_set_map {:?}",cl_set_map.len()); + let mut cl_set_map: FxHashMap> = FxHashMap::default(); + if verbose { + println!("Cl_set_map {:?}", cl_set_map.len()); } //merge_into is a vector of a tuple(cl_id1,cl_id2) - let mut merge_into: Vec<(i32,i32)> = vec![]; + let mut merge_into: Vec<(i32, i32)> = vec![]; //small_hs is a HashSet storing all cluster ids that were merged into other clusters during this iteration let mut small_hs: FxHashSet = FxHashSet::default(); //used to have do-while structure - let mut first_iter= true; - let mut not_large= FxHashSet::default(); + let mut first_iter = true; + let mut not_large = FxHashSet::default(); //continue merging as long as we still find clusters that we may merge while !merge_into.is_empty() || first_iter { //clear merge_into as this is the indicator how often we attempt to merge further (the while loop depends on it) merge_into.clear(); - println!("MI {:?}",merge_into); + println!("MI {:?}", merge_into); small_hs.clear(); //set first_iter to be false to not stay in a infinity loop first_iter = false; //merge_into contains the information about which clusters to merge into which //generate the data structure giving us merge infos let now_pc1 = Instant::now(); - generate_cluster_merging_ds(&mut cl_set_map, cluster_map); + generate_cluster_merging_ds(&mut cl_set_map, cluster_map); println!("{} s for creating the pc ds", 
now_pc1.elapsed().as_secs()); println!("Post_clustering_ds generated"); let now_pc2 = Instant::now(); - detect_overlaps(&mut cl_set_map, cluster_map, &mut merge_into, min_shared_minis, &mut small_hs, shared_seed_infos_vec, verbose); - println!("{} s for detection of overlaps", now_pc2.elapsed().as_secs()); - if verbose{ - println!("Merge_into {:?}",merge_into); + detect_overlaps( + &mut cl_set_map, + cluster_map, + &mut merge_into, + min_shared_minis, + &mut small_hs, + shared_seed_infos_vec, + verbose, + ); + println!( + "{} s for detection of overlaps", + now_pc2.elapsed().as_secs() + ); + if verbose { + println!("Merge_into {:?}", merge_into); } let now_pc3 = Instant::now(); - merge_clusters_from_merge_into(&mut merge_into, cluster_map, clusters, &mut cl_set_map, &small_hs, &mut not_large); + merge_clusters_from_merge_into( + &mut merge_into, + cluster_map, + clusters, + &mut cl_set_map, + &small_hs, + &mut not_large, + ); println!("{} s for merging of clusters", now_pc3.elapsed().as_secs()); let now_pc4 = Instant::now(); merge_into.retain(|&(_, second)| !not_large.contains(&second)); println!("{} s for retaining merge_into", now_pc4.elapsed().as_secs()); println!("{} s since create ds", now_pc2.elapsed().as_secs()); - + cl_set_map.clear(); } - println!("min_shared_minis_pc: {}",min_shared_minis); + println!("min_shared_minis_pc: {}", min_shared_minis); } - -pub(crate) fn generate_initial_cluster_map(this_minimizers: &Vec, init_cluster_map: &mut Seed_Map,identifier: i32){ - for minimizer in this_minimizers{ - init_cluster_map - .entry(minimizer.sequence) - .or_default(); +pub(crate) fn generate_initial_cluster_map( + this_minimizers: &Vec, + init_cluster_map: &mut Seed_Map, + identifier: i32, +) { + for minimizer in this_minimizers { + init_cluster_map.entry(minimizer.sequence).or_default(); // Check if id was retained (not a duplicate) and push it if needed let vec = init_cluster_map.get_mut(&minimizer.sequence).unwrap(); if !vec.contains(&identifier) { @@ -331,15 +405,14 @@ pub(crate) fn generate_initial_cluster_map(this_minimizers: &Vec String { @@ -20,20 +20,22 @@ fn compress_sequence(seq: &[u8]) -> String { let mut seq_hpol_comp = String::new(); let mut last_char: Option<&u8> = None; - for ch in seq{ - if last_char.is_none() || last_char.unwrap()!= ch{ + for ch in seq { + if last_char.is_none() || last_char.unwrap() != ch { seq_hpol_comp.push(*ch as char); } - last_char= Some(ch); + last_char = Some(ch); } seq_hpol_comp } - -fn expected_number_errornous_kmers(quality_string: &str, k: usize, d:&[f64;128]) -> f64 { +fn expected_number_errornous_kmers(quality_string: &str, k: usize, d: &[f64; 128]) -> f64 { //computes the expected number of errornous kmers for a read by analysing the quality entry - let prob_error: Vec = quality_string.chars().map(|char_| d[char_ as u8 as usize]).collect(); + let prob_error: Vec = quality_string + .chars() + .map(|char_| d[char_ as u8 as usize]) + .collect(); let mut sum_of_expectations = 0.0; let mut current_prob_no_error = 1.0; //holds k nucleotides to represent a kmer @@ -58,8 +60,6 @@ fn expected_number_errornous_kmers(quality_string: &str, k: usize, d:&[f64;128]) (quality_string.len() - k + 1) as f64 - sum_of_expectations } - - fn calculate_error_rate(qual: &[u8], d_no_min: &[f64; 128]) -> f64 { let mut counts = vec![0; 128]; let mut total_count = 0; @@ -69,16 +69,14 @@ fn calculate_error_rate(qual: &[u8], d_no_min: &[f64; 128]) -> f64 { counts[char_byte as usize] += 1; } - for (idx, cnt) in counts.iter().enumerate() { + for (idx, cnt) in 
counts.iter().enumerate() { poisson_mean += *cnt as f64 * d_no_min[idx]; total_count += *cnt; } poisson_mean / total_count as f64 - } - fn compute_d() -> [f64; 128] { let mut d = [0.0; 128]; for i in 0..128 { @@ -90,7 +88,6 @@ fn compute_d() -> [f64; 128] { d } - //D_no_min = {chr(i) : 10**( - (ord(chr(i)) - 33)/10.0 ) for i in range(128)} fn compute_d_no_min() -> [f64; 128] { let mut d = [0.0; 128]; @@ -104,20 +101,33 @@ fn compute_d_no_min() -> [f64; 128] { d } - -fn analyse_fastq_and_sort(k:usize, q_threshold:f64, in_file_path:&str, quality_threshold: &f64, window_size: usize, score_vec: &mut Vec<(i32,usize)>, id_map: &mut FxHashMap, seeding: &str, s: usize, t: usize, noncanonical_bool: bool, verbose:bool){ +fn analyse_fastq_and_sort( + k: usize, + q_threshold: f64, + in_file_path: &str, + quality_threshold: &f64, + window_size: usize, + score_vec: &mut Vec<(i32, usize)>, + id_map: &mut FxHashMap, + seeding: &str, + s: usize, + t: usize, + noncanonical_bool: bool, + verbose: bool, +) { /* Reads, filters and sorts reads from a fastq file so that we are left with reads having a reasonable quality score, that are sorted by score */ - let d_no_min= compute_d_no_min(); + let d_no_min = compute_d_no_min(); //read_id holds the internal id we appoint to a read let mut read_id = 0; //generate a Reader object that parses the fastq-file (taken from rust-bio) - let mut reader = fastq::Reader::from_file(Path::new(&in_file_path)).expect("We expect the file to exist"); + let mut reader = + fastq::Reader::from_file(Path::new(&in_file_path)).expect("We expect the file to exist"); //make sure that we have suitable values for k_size and w_size (w_size should be larger) let mut w; - if window_size > k{ - w = window_size - k+ 1; // the minimizer generator will panic if w is even to break ties + if window_size > k { + w = window_size - k + 1; // the minimizer generator will panic if w is even to break ties if w % 2 == 0 { w += 1; } @@ -127,107 +137,152 @@ fn analyse_fastq_and_sort(k:usize, q_threshold:f64, in_file_path:&str, quality_t w = 1; } //iterate over the records - for record in reader.records(){ + for record in reader.records() { let seq_rec = record.expect("invalid record"); let header_new = seq_rec.id(); let sequence = seq_rec.seq(); - let quality = seq_rec.qual();//.expect("We also should have a quality"); - //add the read id and the real header to id_map - if sequence.len() >= 2*k && compress_sequence(sequence).len() >= k{ + let quality = seq_rec.qual(); //.expect("We also should have a quality"); + //add the read id and the real header to id_map + if sequence.len() >= 2 * k && compress_sequence(sequence).len() >= k { let mut this_minimizers = vec![]; let mut filtered_minis = vec![]; if seeding == "minimizer" { - if noncanonical_bool{ + if noncanonical_bool { let min_iter = MinimizerBuilder::::new() .minimizer_size(k) .width((w) as u16) .iter(sequence); for (minimizer, position) in min_iter { - let mini = Minimizer_hashed { sequence: minimizer, position: position }; + let mini = Minimizer_hashed { + sequence: minimizer, + position: position, + }; this_minimizers.push(mini); } - } - else { + } else { let min_iter = MinimizerBuilder::::new() .canonical() .minimizer_size(k) .width((w) as u16) .iter(sequence); for (minimizer, position, _) in min_iter { - let mini = Minimizer_hashed {sequence: minimizer,position: position }; + let mini = Minimizer_hashed { + sequence: minimizer, + position: position, + }; this_minimizers.push(mini); } - //println!("minimizers NEW len: {:?}", this_minimizers.len()); + 
debug!("minimizers NEW len: {:?}", this_minimizers.len()); //generate_sorted_fastq_new_version::get_canonical_kmer_minimizers_hashed(sequence, k, window_size, &mut this_minimizers); - //println!("minimizers OLD len: {:?}", &this_minimizers.len()); - } - } - else if seeding =="syncmer"{ - if noncanonical_bool{ - seeding_and_filtering_seeds::get_kmer_syncmers(sequence, k, s, t, &mut this_minimizers); + debug!("minimizers OLD len: {:?}", &this_minimizers.len()); } - else { - seeding_and_filtering_seeds::syncmers_canonical(sequence, k, s, t, &mut this_minimizers); + } else if seeding == "syncmer" { + if noncanonical_bool { + seeding_and_filtering_seeds::get_kmer_syncmers( + sequence, + k, + s, + t, + &mut this_minimizers, + ); + } else { + seeding_and_filtering_seeds::syncmers_canonical( + sequence, + k, + s, + t, + &mut this_minimizers, + ); } - } - else{ + } else { println!("No seeding"); } - seeding_and_filtering_seeds::filter_seeds_by_quality(&this_minimizers, quality, k, d_no_min, &mut filtered_minis, quality_threshold, false); - let error_rate= calculate_error_rate(quality, &d_no_min); - if 10.0_f64 * - error_rate.log(10.0_f64) > q_threshold{ + seeding_and_filtering_seeds::filter_seeds_by_quality( + &this_minimizers, + quality, + k, + d_no_min, + &mut filtered_minis, + quality_threshold, + false, + ); + let error_rate = calculate_error_rate(quality, &d_no_min); + if 10.0_f64 * -error_rate.log(10.0_f64) > q_threshold { id_map.insert(read_id, header_new.to_string()); - let score_tuple=(read_id, filtered_minis.len()); + let score_tuple = (read_id, filtered_minis.len()); score_vec.push(score_tuple); read_id += 1; } } } //sort the score_vec by the number of high-confidence seeds (most to least) - score_vec.par_sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());//TODO: replace by par_sort_by - if verbose{ - let print_vec= &score_vec[0..5]; - for score_tup in print_vec{ - println!("ID {} count {}", &score_tup.0,score_tup.1); + score_vec.par_sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); //TODO: replace by par_sort_by + if verbose { + let print_vec = &score_vec[0..5]; + for score_tup in print_vec { + println!("ID {} count {}", &score_tup.0, score_tup.1); } } - println!("{} reads accepted",score_vec.len()); - //println!("{:?}",score_vec.pop()); + println!("{} reads accepted", score_vec.len()); + debug!("{:?}", score_vec.pop()); } - -fn print_statistics(fastq_records: &[FastqRecord_isoncl_init]){ +fn print_statistics(fastq_records: &[FastqRecord_isoncl_init]) { /* Prints the statistics for the resulting file TODO: add median and mean */ let min_e = fastq_records[0].get_err_rate(); - let max_e = fastq_records[fastq_records.len()-1].get_err_rate(); - println!("Lowest read error rate: {}",min_e); - println!("Highest read error rate: {}",max_e); + let max_e = fastq_records[fastq_records.len() - 1].get_err_rate(); + println!("Lowest read error rate: {}", min_e); + println!("Highest read error rate: {}", max_e); //logfile.write("Median read error rate:{0}\n".format(median_e)) //logfile.write("Mean read error rate:{0}\n".format(mean_e)) } - -pub(crate) fn sort_fastq_for_cluster(k:usize, q_threshold:f64, in_file_path:&str, outfolder: &String, quality_threshold:&f64, window_size: usize, seeding: &str, s: usize, t: usize, noncanonical_bool: bool, verbose: bool) { +pub(crate) fn sort_fastq_for_cluster( + k: usize, + q_threshold: f64, + in_file_path: &str, + outfolder: &String, + quality_threshold: &f64, + window_size: usize, + seeding: &str, + s: usize, + t: usize, + noncanonical_bool: bool, + verbose: bool, +) 
{ println!("Sorting the fastq_file"); let now = Instant::now(); //holds the internal ids and scores as tuples to be able to sort properly - let mut score_vec=vec![]; + let mut score_vec = vec![]; //holds the internal read id - let mut id_map=FxHashMap::default(); + let mut id_map = FxHashMap::default(); //the main step of the sort_fastq_for_cluster step: Gets the number of high-confidence seeds for each read and writes them into score_vec - analyse_fastq_and_sort(k, q_threshold, in_file_path,quality_threshold,window_size,&mut score_vec, &mut id_map, seeding,s,t, noncanonical_bool, verbose); + analyse_fastq_and_sort( + k, + q_threshold, + in_file_path, + quality_threshold, + window_size, + &mut score_vec, + &mut id_map, + seeding, + s, + t, + noncanonical_bool, + verbose, + ); let elapsed = now.elapsed(); println!("Elapsed: {:.2?}", elapsed); - if !path_exists(outfolder){ + if !path_exists(outfolder) { fs::create_dir(outfolder.clone()).expect("We should be able to create the directory"); } //write a fastq-file that contains the reordered reads - write_output::write_ordered_fastq(&score_vec, outfolder,&id_map,in_file_path); + write_output::write_ordered_fastq(&score_vec, outfolder, &id_map, in_file_path); println!("Wrote the sorted fastq file"); //print_statistics(fastq_records.borrow()); -} \ No newline at end of file +} diff --git a/src/gff_handling.rs b/src/gff_handling.rs index f883823..dfdfc95 100644 --- a/src/gff_handling.rs +++ b/src/gff_handling.rs @@ -1,32 +1,38 @@ -use crate::structs::{ Coord_obj,Minimizer_hashed}; +use crate::structs::{Coord_obj, Minimizer_hashed}; use std::fs::File; -use std::io::{BufReader, BufRead}; +use std::io::{BufRead, BufReader}; -use rustc_hash::{FxHashMap, FxHashSet}; use bio::io::gff; +use rustc_hash::{FxHashMap, FxHashSet}; use bio::io::gff::GffType::GFF3; use rayon::prelude::*; +use bio::io::fasta; use minimizer_iter::MinimizerBuilder; use std::path::Path; -use bio::io::fasta; extern crate rayon; -use crate::{Cluster_ID_Map, Seed_Map, seeding_and_filtering_seeds}; use crate::clustering; +use crate::{seeding_and_filtering_seeds, Cluster_ID_Map, Seed_Map}; +use log::debug; use std::time::Instant; //TODO: add overlap detection //TODO: remove multiple occasions of minimizers in the same gene if the exons overlap //TODO: possibly use the gene id for cluster_identification -fn detect_overlaps(gene_map: &FxHashMap>, this_gene_id:&i32, this_coords: &Vec, overlap_ctr: &mut i32){ - for (gene_id_other,coords) in gene_map{ - if gene_id_other > this_gene_id{ - for coord in coords{ - for this_coord in this_coords{ - if coord.overlaps_with(this_coord){ - *overlap_ctr +=1; +fn detect_overlaps( + gene_map: &FxHashMap>, + this_gene_id: &i32, + this_coords: &Vec, + overlap_ctr: &mut i32, +) { + for (gene_id_other, coords) in gene_map { + if gene_id_other > this_gene_id { + for coord in coords { + for this_coord in this_coords { + if coord.overlaps_with(this_coord) { + *overlap_ctr += 1; } } } @@ -34,85 +40,105 @@ fn detect_overlaps(gene_map: &FxHashMap>, this_gene_id:&i32, } } - - -fn parse_fasta_and_gen_clusters(fasta_path: Option<&str>, coords: FxHashMap>>, clusters: &mut Cluster_ID_Map, init_cluster_map: &mut Seed_Map,k: usize,w: usize){ +fn parse_fasta_and_gen_clusters( + fasta_path: Option<&str>, + coords: FxHashMap>>, + clusters: &mut Cluster_ID_Map, + init_cluster_map: &mut Seed_Map, + k: usize, + w: usize, +) { println!("parse_fasta"); - let path=fasta_path.unwrap(); - let mut reader = fasta::Reader::from_file(Path::new(path)).expect("We expect the file to 
exist"); + let path = fasta_path.unwrap(); + let mut reader = + fasta::Reader::from_file(Path::new(path)).expect("We expect the file to exist"); //let mut reader = parse_fastx_file(&filename).expect("valid path/file"); - reader.records().into_iter().for_each(|record|{ - - let mut internal_id=0; - let mut record_minis=vec![]; + reader.records().into_iter().for_each(|record| { + let mut internal_id = 0; + let mut record_minis = vec![]; //retreive the current record let seq_rec = record.expect("invalid record"); let sequence = std::str::from_utf8(seq_rec.seq()).unwrap().to_uppercase(); - let mut overlap_ctr= 0; + let mut overlap_ctr = 0; //in the next lines we make sure that we have a proper header and store it as string let id = seq_rec.id().to_string().split(' ').collect::>()[0].to_string(); println!("Now to the coords_ds"); if let Some(gene_map) = coords.get(id.as_str()) { //iterate over the genes in the gene_map - for (gene_id, exon_coords) in gene_map { //TODO:test whether into_par_iter works here - let mut coords_in_gene=FxHashSet::default(); - for exon_coord in exon_coords{ - if ! coords_in_gene.contains(exon_coord){ - let exon_seq = &sequence[exon_coord.startpos as usize..exon_coord.endpos as usize].to_string(); - let mut exon_minis=vec![]; - seeding_and_filtering_seeds::get_canonical_kmer_minimizers_hashed(exon_seq.as_bytes(), k, w, &mut exon_minis); - record_minis.append( &mut exon_minis); + for (gene_id, exon_coords) in gene_map { + //TODO:test whether into_par_iter works here + let mut coords_in_gene = FxHashSet::default(); + for exon_coord in exon_coords { + if !coords_in_gene.contains(exon_coord) { + let exon_seq = &sequence + [exon_coord.startpos as usize..exon_coord.endpos as usize] + .to_string(); + let mut exon_minis = vec![]; + seeding_and_filtering_seeds::get_canonical_kmer_minimizers_hashed( + exon_seq.as_bytes(), + k, + w, + &mut exon_minis, + ); + record_minis.append(&mut exon_minis); coords_in_gene.insert(exon_coord); } } - detect_overlaps(gene_map,gene_id,exon_coords, &mut overlap_ctr); - //println!("Record_seq {}: {}",id,record_minis); + detect_overlaps(gene_map, gene_id, exon_coords, &mut overlap_ctr); + debug!("Record_seq {}: {:?}", id, record_minis); clustering::generate_initial_cluster_map(&record_minis, init_cluster_map, *gene_id); - let id_vec= vec![]; - clusters.insert(*gene_id,id_vec); - //println!("{:?}",init_cluster_map); + let id_vec = vec![]; + clusters.insert(*gene_id, id_vec); + debug!("{:?}", init_cluster_map); internal_id += 1; } } - println!("{} overlaps between genes (their exons) ",overlap_ctr); + println!("{} overlaps between genes (their exons) ", overlap_ctr); }); } - -fn parse_gtf_and_collect_coords(gtf_path: Option<&str>, coords:&mut FxHashMap>>){ - let reader = gff::Reader::from_file(gtf_path.unwrap(),GFF3); - let mut gene_id=0; - let mut coords_in_gene=FxHashSet::default(); - let mut true_gene=false; +fn parse_gtf_and_collect_coords( + gtf_path: Option<&str>, + coords: &mut FxHashMap>>, +) { + let reader = gff::Reader::from_file(gtf_path.unwrap(), GFF3); + let mut gene_id = 0; + let mut coords_in_gene = FxHashSet::default(); + let mut true_gene = false; for record in reader.expect("The reader should find records").records() { let mut rec = record.ok().expect("Error reading record."); //we have a new gene - if rec.feature_type() == "gene" {//|| rec.feature_type() == "pseudogene"{ + if rec.feature_type() == "gene" { + //|| rec.feature_type() == "pseudogene"{ true_gene = true; //see if we are in a new chromosome/scaffold - if 
!coords.contains_key(rec.seqname()){ + if !coords.contains_key(rec.seqname()) { //we are in a new chromosome/scaffold - let sname= rec.seqname().to_string(); - coords.insert(sname,FxHashMap::default()); - + let sname = rec.seqname().to_string(); + coords.insert(sname, FxHashMap::default()); } //increase the gene_id by 1 gene_id += 1; coords_in_gene.clear(); - } - else if rec.feature_type()=="exon"{ + } else if rec.feature_type() == "exon" { //we only are interested in exons from true genes if true_gene { - //println!("{} {} {}",rec.seqname(),rec.feature_type(),gene_id); + debug!("{} {} {}", rec.seqname(), rec.feature_type(), gene_id); if let Some(gene_map) = coords.get_mut(rec.seqname()) { if let Some(coord_vec) = gene_map.get_mut(&gene_id) { - let coord_o = Coord_obj { startpos: *rec.start(), endpos: *rec.end() }; + let coord_o = Coord_obj { + startpos: *rec.start(), + endpos: *rec.end(), + }; if !coords_in_gene.contains(&coord_o) { coord_vec.push(coord_o.clone()); coords_in_gene.insert(coord_o); } } else { - let coord_o = Coord_obj { startpos: *rec.start(), endpos: *rec.end() }; + let coord_o = Coord_obj { + startpos: *rec.start(), + endpos: *rec.end(), + }; if !coords_in_gene.contains(&coord_o) { let mut coord_vec = vec![]; coord_vec.push(coord_o.clone()); @@ -124,30 +150,55 @@ fn parse_gtf_and_collect_coords(gtf_path: Option<&str>, coords:&mut FxHashMap, fasta_path: Option<&str>,clusters: &mut Cluster_ID_Map, cluster_map: &mut Seed_Map,k:usize ,w:usize) { +pub(crate) fn resolve_gff( + gff_path: Option<&str>, + fasta_path: Option<&str>, + clusters: &mut Cluster_ID_Map, + cluster_map: &mut Seed_Map, + k: usize, + w: usize, +) { println!("Resolving GFF file "); let now1 = Instant::now(); - let mut coords = FxHashMap::default();//: HashMap, BuildHasherDefault>, BuildHasherDefault> = FxHashMap::default(); + let mut coords = FxHashMap::default(); //: HashMap, BuildHasherDefault>, BuildHasherDefault> = FxHashMap::default(); parse_gtf_and_collect_coords(gff_path, &mut coords); - println!("{} s used for parsing the gff file", now1.elapsed().as_secs()); + println!( + "{} s used for parsing the gff file", + now1.elapsed().as_secs() + ); println!("First step done"); let now2 = Instant::now(); parse_fasta_and_gen_clusters(fasta_path, coords, clusters, cluster_map, k, w); - println!("Generated {} initial clusters from the reference", clusters.len()); - println!("{} s used for parsing the fasta file", now2.elapsed().as_secs()); + println!( + "Generated {} initial clusters from the reference", + clusters.len() + ); + println!( + "{} s used for parsing the fasta file", + now2.elapsed().as_secs() + ); println!("{} s for full GFF resolution", now1.elapsed().as_secs()); println!("GTF resolved"); } - -pub(crate) fn gff_based_clustering(gff_path: Option<&str>, fasta_path: Option<&str>, clusters: &mut Cluster_ID_Map, cluster_map: &mut Seed_Map, k:usize, w:usize, seeding: &str,s: usize,t: usize, noncanonical_bool: bool){ +pub(crate) fn gff_based_clustering( + gff_path: Option<&str>, + fasta_path: Option<&str>, + clusters: &mut Cluster_ID_Map, + cluster_map: &mut Seed_Map, + k: usize, + w: usize, + seeding: &str, + s: usize, + t: usize, + noncanonical_bool: bool, +) { // Read the FASTA file let fasta_reader = File::open(Path::new(fasta_path.unwrap())).unwrap(); let fasta_buf_reader = BufReader::new(fasta_reader); @@ -155,71 +206,93 @@ pub(crate) fn gff_based_clustering(gff_path: Option<&str>, fasta_path: Option<&s // Read the GFF file let gff_reader = File::open(Path::new(gff_path.unwrap())).unwrap(); let 
     gff_buf_reader = BufReader::new(gff_reader);
-    let mut binding=gff::Reader::new(gff_buf_reader,GFF3);
+    let mut binding = gff::Reader::new(gff_buf_reader, GFF3);
     let mut gff_records = binding.records();
-    let mut gene_id= 0;
-    let mut previous_genes= 0;
+    let mut gene_id = 0;
+    let mut previous_genes = 0;
     // Iterate through FASTA records
     while let Some(fasta_record) = fasta_records.next() {
         let fasta_record = fasta_record.expect("Error reading FASTA record");
         let scaffold_id = fasta_record.id().to_string();
-        let sequence = std::str::from_utf8(fasta_record.seq()).unwrap().to_uppercase();
-        let record_minis=vec![];
-        let mut is_gene= false;
-        //println!("scaffold {}",scaffold_id);
+        let sequence = std::str::from_utf8(fasta_record.seq())
+            .unwrap()
+            .to_uppercase();
+        let record_minis = vec![];
+        let mut is_gene = false;
+        debug!("scaffold {}", scaffold_id);
         // Process GFF records for the current scaffold ID
         while let Some(gff_record) = gff_records.next() {
             let gff_record = gff_record.expect("Error reading GFF record");
             let gff_scaffold_id = gff_record.seqname().to_string();
             // Check if the scaffold IDs match
             if scaffold_id == gff_scaffold_id {
-                if gff_record.feature_type() =="gene" && gff_record.attributes().get("gene_biotype").expect("This algorithm requires a gene_biotype to extract the coding genes") == "protein_coding"{
+                if gff_record.feature_type() == "gene"
+                    && gff_record.attributes().get("gene_biotype").expect(
+                        "This algorithm requires a gene_biotype to extract the coding genes",
+                    ) == "protein_coding"
+                {
                     gene_id += 1;
                     is_gene = true;
-                }
-                else if gff_record.feature_type()=="exon"{
-                    let exon_seq= &sequence[*gff_record.start() as usize..*gff_record.end() as usize];
-                    let mut this_minimizers=vec![];
-                    if seeding == "minimizer"{
+                } else if gff_record.feature_type() == "exon" {
+                    let exon_seq =
+                        &sequence[*gff_record.start() as usize..*gff_record.end() as usize];
+                    let mut this_minimizers = vec![];
+                    if seeding == "minimizer" {
                         if noncanonical_bool {
                             let min_iter = MinimizerBuilder::<u64>::new()
                                 .minimizer_size(k)
                                 .width((w) as u16)
                                 .iter(sequence.as_bytes());
                             for (minimizer, position) in min_iter {
-                                let mini = Minimizer_hashed { sequence: minimizer, position: position };
+                                let mini = Minimizer_hashed {
+                                    sequence: minimizer,
+                                    position: position,
+                                };
                                 this_minimizers.push(mini);
                             }
-                        }
-                        else{
+                        } else {
                             let min_iter = MinimizerBuilder::<u64>::new()
                                 .canonical()
                                 .minimizer_size(k)
                                 .width((w) as u16)
                                 .iter(sequence.as_bytes());
                             for (minimizer, position, _) in min_iter {
-                                let mini = Minimizer_hashed {sequence: minimizer,position: position };
+                                let mini = Minimizer_hashed {
+                                    sequence: minimizer,
+                                    position: position,
+                                };
                                 this_minimizers.push(mini);
                             }
                         }
-
-                    }
-                    else if seeding =="syncmer"{
-                        if exon_seq.len() > s{
-                            seeding_and_filtering_seeds::syncmers_canonical(exon_seq.as_bytes(), k, s, t, &mut this_minimizers);
+                    } else if seeding == "syncmer" {
+                        if exon_seq.len() > s {
+                            seeding_and_filtering_seeds::syncmers_canonical(
+                                exon_seq.as_bytes(),
+                                k,
+                                s,
+                                t,
+                                &mut this_minimizers,
+                            );
                         }
                     }
-                }
-                else if gff_record.feature_type() =="pseudogene"{
+                } else if gff_record.feature_type() == "pseudogene" {
                     is_gene = false;
                 }
                 clustering::generate_initial_cluster_map(&record_minis, cluster_map, gene_id);
-                let id_vec= vec![];
+                let id_vec = vec![];
                 clusters.insert(gene_id, id_vec);
             } else {
-                println!("found {} genes in {}",gene_id - previous_genes, scaffold_id);
+                println!(
+                    "found {} genes in {}",
+                    gene_id - previous_genes,
+                    scaffold_id
+                );
                 previous_genes = gene_id;
-                if gff_record.feature_type() =="gene" && gff_record.attributes().get("gene_biotype").expect("This algorithm requires a gene_biotype to extract the coding genes") == "protein_coding"{
+                if gff_record.feature_type() == "gene"
+                    && gff_record.attributes().get("gene_biotype").expect(
+                        "This algorithm requires a gene_biotype to extract the coding genes",
+                    ) == "protein_coding"
+                {
                     gene_id += 1;
                     is_gene = true;
                 }
@@ -227,6 +300,5 @@ pub(crate) fn gff_based_clustering(gff_path: Option<&str>, fasta_path: Option<&s
                 break;
             }
         }
-
     }
 }
diff --git a/src/main.rs b/src/main.rs
index 2c7bf21..a89775d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,44 +1,43 @@
 //#![allow(warnings)]
-extern crate rayon;
 extern crate clap;
 extern crate nohash_hasher;
+extern crate rayon;
 //use crate::generate_sorted_fastq_new_version::{filter_minimizers_by_quality, Minimizer,get_kmer_minimizers};
 //use clap::{arg, command, Command};
-
-pub mod file_actions;
 mod clustering;
-mod seeding_and_filtering_seeds;
+pub mod file_actions;
 mod generate_sorted_fastq_for_cluster;
 mod gff_handling;
-mod write_output;
+mod seeding_and_filtering_seeds;
 mod structs;
+mod write_output;
 mod Parallelization_side;
-
 //mod isONclust;
-use std::{collections::HashMap};
 use crate::clustering::cluster_merging;
-use crate::structs::{Minimizer_hashed, FastqRecord_isoncl_init};
+use crate::structs::{FastqRecord_isoncl_init, Minimizer_hashed};
+use std::collections::HashMap;
 use clap::Parser;
 use rayon::prelude::*;
 use std::collections::VecDeque;
 use std::time::Instant;
-use std::path::Path;
-use std::io::Read;
 use std::convert::TryFrom;
+use std::io::Read;
+use std::path::Path;
 use memory_stats::memory_stats;
-use rustc_hash::{FxHashMap,FxHashSet};
+use rustc_hash::{FxHashMap, FxHashSet};
 use bio::io::fastq;
+use log::info;
 use minimizer_iter::MinimizerBuilder;
-
+use simple_logger::SimpleLogger;
 type Seed_Map = FxHashMap<u64, Vec<i32>>; // Change here to any other hash table implementation, e.g., HashMap<u64, Vec<i32>, nohash_hasher::BuildNoHashHasher<u64>>;
 type Cluster_ID_Map = FxHashMap<i32, Vec<i32>>; // Change here to any other hash table implementation, e.g., HashMap<i32, Vec<i32>, nohash_hasher::BuildNoHashHasher<i32>>;
@@ -55,10 +54,12 @@ fn compute_d() -> [f64; 128] {
     d
 }
-
-fn expected_number_errornous_kmers(quality_string: &str, k: usize, d:&[f64;128]) -> f64 {
+fn expected_number_errornous_kmers(quality_string: &str, k: usize, d: &[f64; 128]) -> f64 {
     //computes the expeced number of errornous kmers for a read by analysing the quality entry
-    let prob_error: Vec<f64> = quality_string.chars().map(|char_| d[char_ as u8 as usize]).collect();
+    let prob_error: Vec<f64> = quality_string
+        .chars()
+        .map(|char_| d[char_ as u8 as usize])
+        .collect();
     let mut sum_of_expectations = 0.0;
     let mut qurrent_prob_no_error = 1.0;
     let mut window: VecDeque<f64> = VecDeque::with_capacity(k);
@@ -77,12 +78,11 @@ fn expected_number_errornous_kmers(quality_string: &str, k: usize, d:&[f64;128])
             window.push_back(1.0 - p_e);
         }
     }
-    println!("Quality string: {}",(quality_string.len() - k + 1)as f64);
-    println!("SoE: {}",sum_of_expectations);
+    info!("Quality string: {}", (quality_string.len() - k + 1) as f64);
+    info!("SoE: {}", sum_of_expectations);
     (quality_string.len() - k + 1) as f64 - sum_of_expectations
 }
-
 fn calculate_error_rate(qual: &str, d_no_min: &[f64; 128]) -> f64 {
     let mut poisson_mean = 0.0;
     let mut total_count = 0;
@@ -95,103 +95,148 @@ fn calculate_error_rate(qual: &str, d_no_min: &[f64; 128]) -> f64 {
     }
     poisson_mean / total_count as f64
-
 }
-
-fn get_sorted_entries(mini_map_filtered: FxHashMap<i32, Vec<Minimizer_hashed>>)->Vec<(i32, Vec<Minimizer_hashed>)>{
+fn get_sorted_entries(
+    mini_map_filtered: FxHashMap<i32, Vec<Minimizer_hashed>>,
+) -> Vec<(i32, Vec<Minimizer_hashed>)> {
     // Sort by the length of vectors in descending order
-    let mut sorted_entries: Vec<(i32, Vec<Minimizer_hashed>)> = mini_map_filtered
-        .into_iter()
-        .collect();
-    sorted_entries.sort_by(|a, b| {
-        b.1.len().cmp(&a.1.len()).then_with(|| a.0.cmp(&b.0))
-    });
-
+    let mut sorted_entries: Vec<(i32, Vec<Minimizer_hashed>)> =
+        mini_map_filtered.into_iter().collect();
+    sorted_entries.sort_by(|a, b| b.1.len().cmp(&a.1.len()).then_with(|| a.0.cmp(&b.0)));
     sorted_entries
 }
-
-fn filter_fastq_records(mut fastq_records:Vec<FastqRecord_isoncl_init>,d_no_min:[f64;128], q_threshold:f64,k:usize,d :[f64;128])->Vec<FastqRecord_isoncl_init>{
+fn filter_fastq_records(
+    mut fastq_records: Vec<FastqRecord_isoncl_init>,
+    d_no_min: [f64; 128],
+    q_threshold: f64,
+    k: usize,
+    d: [f64; 128],
+) -> Vec<FastqRecord_isoncl_init> {
     fastq_records.par_iter_mut().for_each(|fastq_record| {
-        //calculate the error rate and store it in vector errors
+        //calculate the error rate and store it in vector errors
         if fastq_record.sequence.len() > k {
-            fastq_record.set_error_rate(calculate_error_rate(fastq_record.get_quality(), &d_no_min));
-            let exp_errors_in_kmers = expected_number_errornous_kmers(fastq_record.get_quality(), k, &d);
-            let p_no_error_in_kmers = 1.0 - exp_errors_in_kmers / (fastq_record.get_sequence().len() - k + 1) as f64;
+            fastq_record
+                .set_error_rate(calculate_error_rate(fastq_record.get_quality(), &d_no_min));
+            let exp_errors_in_kmers =
+                expected_number_errornous_kmers(fastq_record.get_quality(), k, &d);
+            let p_no_error_in_kmers =
+                1.0 - exp_errors_in_kmers / (fastq_record.get_sequence().len() - k + 1) as f64;
             //calculate the final score and add it to fastq_record (we have a dedicated field for that that was initialized with 0.0)
-            fastq_record.set_score(p_no_error_in_kmers * ((fastq_record.get_sequence().len() - k + 1) as f64))
+            fastq_record.set_score(
+                p_no_error_in_kmers * ((fastq_record.get_sequence().len() - k + 1) as f64),
+            )
         }
     });
    //filter out records that have a too high error rate
-    fastq_records.retain(|record| 10.0_f64*-record.get_err_rate().log(10.0_f64) > q_threshold);
-    println!("Nr of records that passed the filtering: {}",fastq_records.len());
+    fastq_records.retain(|record| 10.0_f64 * -record.get_err_rate().log(10.0_f64) > q_threshold);
+    info!(
+        "Nr of records that passed the filtering: {}",
+        fastq_records.len()
+    );
     fastq_records
 }
-
 fn convert_cl_id(v: usize) -> Option<i32> {
     i32::try_from(v).ok()
 }
-
-#[derive(Parser,Debug)]
+#[derive(Parser, Debug)]
 #[command(name = "isONclust3")]
 #[command(author = "Alexander J. Petri ")]
 #[command(version = "0.0.2")]
-#[command(about = "Clustering of long-read sequencing data into gene families", long_about = "isONclust is a tool for clustering either PacBio Iso-Seq reads, or Oxford Nanopore reads into clusters, where each cluster represents all reads that came from a gene." )]
+#[command(
+    about = "Clustering of long-read sequencing data into gene families",
+    long_about = "isONclust is a tool for clustering either PacBio Iso-Seq reads, or Oxford Nanopore reads into clusters, where each cluster represents all reads that came from a gene."
+)]
 #[command(author, version, about, long_about = None)]
 struct Cli {
-    #[arg(long, short, help="Path to consensus fastq file(s)")]
+    #[arg(long, short, help = "Path to consensus fastq file(s)")]
     fastq: String,
-    #[arg(long, short,help="Path to initial clusters (stored in fasta format), which is required when --gff is set")]
+    #[arg(
+        long,
+        short,
+        help = "Path to initial clusters (stored in fasta format), which is required when --gff is set"
+    )]
     init_cl: Option<String>,
-    #[arg(short, help="Kmer length")]
+    #[arg(short, help = "Kmer length")]
     k: Option<usize>,
-    #[arg(short, help=" window size")]
+    #[arg(short, help = " window size")]
     w: Option<usize>,
-    #[arg(short, help=" syncmer length")]
+    #[arg(short, help = " syncmer length")]
     s: Option<usize>,
-    #[arg(short, help=" minimum syncmer position")]
+    #[arg(short, help = " minimum syncmer position")]
     t: Option<usize>,
-    #[arg(long, short, help="Path to outfolder")]
+    #[arg(long, short, help = "Path to outfolder")]
     outfolder: String,
-    #[arg(long,short,default_value_t= 1, help="Minimum number of reads for cluster")]
+    #[arg(
+        long,
+        short,
+        default_value_t = 1,
+        help = "Minimum number of reads for cluster"
+    )]
     n: usize,
-    #[arg(long, short,help="Path to gff3 file (optional parameter), requires a reference added by calling --init-cl ")]
+    #[arg(
+        long,
+        short,
+        help = "Path to gff3 file (optional parameter), requires a reference added by calling --init-cl "
+    )]
     gff: Option<String>,
-    #[arg(long,help="we do not want to use canonical seeds")]
+    #[arg(long, help = "we do not want to use canonical seeds")]
     noncanonical: bool,
-    #[arg(long,help="Run mode of isONclust (pacbio or ont")]
+    #[arg(long, help = "Run mode of isONclust (pacbio or ont")]
     mode: String,
     //TODO:add argument telling us whether we want to use syncmers instead of kmers, maybe also add argument determining whether we want to use canonical_minimizers
-    #[arg(long,help="seeding approach we choose")]
+    #[arg(long, help = "seeding approach we choose")]
     seeding: Option<String>,
-    #[arg(long,help="quality threshold used for the data (standard: 0.9) ")]
+    #[arg(long, help = "quality threshold used for the data (standard: 0.9) ")]
     quality_threshold: Option<f64>,
-    #[arg(long,help="print additional information")]
+    #[arg(long, help = "print additional information")]
     verbose: bool,
-    #[arg(long,help="Run the post clustering step during the analysis (small improvement for results but much higher runtime)")]
+    #[arg(
+        long,
+        help = "Run the post clustering step during the analysis (small improvement for results but much higher runtime)"
+    )]
     post_cluster: bool,
-    #[arg(long,help="Do not write the fastq_files (no write_fastq in isONclust1)")]
+    #[arg(
+        long,
+        help = "Do not write the fastq_files (no write_fastq in isONclust1)"
+    )]
     no_fastq: bool,
-    #[arg(long,help="Minimum overlap threshold for reads to be clustered together (Experimental parameter)")]
+    #[arg(
+        long,
+        help = "Minimum overlap threshold for reads to be clustered together (Experimental parameter)"
+    )]
     min_shared_minis: Option<f64>,
-    #[arg(long,help="Minimum thresholds of shared HCS for clusters to be merged during cluster merging")]
-    cm_mini: Option<f64>
+    #[arg(
+        long,
+        help = "Minimum thresholds of shared HCS for clusters to be merged during cluster merging"
+    )]
+    cm_mini: Option<f64>,
 }
-
 fn main() {
-
     //#################################################################################################
     //INITIALIZATION
     //#################################################################################################
     let cli = Cli::parse();
-    println!("n: {:?}", cli.n);
-    println!("outfolder {:?}", cli.outfolder);
+
+    // TODO - move later on.
+    let level = match cli.verbose {
+        true => log::LevelFilter::Debug,
+        false => log::LevelFilter::Info,
+    };
+
+    SimpleLogger::new()
+        .with_level(level)
+        .init()
+        .expect("Failed to initialize logger.");
+
+    info!("n: {:?}", cli.n);
+    info!("outfolder {:?}", cli.outfolder);

     let mode = cli.mode;
     let n = cli.n;
@@ -200,45 +245,45 @@ fn main() {
     let mut s;
     let mut t;
     let mut quality_threshold;
-    let mut min_shared_minis ;
+    let mut min_shared_minis;
     let mut cm_mini;
     //right now we only have two modes( custom settings for variables k, w, s, and t: 'ont' for reads with 3% error rate or more and 'pacbio' for reads with less than 3% error rate)
-    if mode=="ont"{
+    if mode == "ont" {
         k = 13;
         w = 21;
-        quality_threshold = 0.9_f64.powi(k as i32);//TODO: standard: 0.9_f64
+        quality_threshold = 0.9_f64.powi(k as i32); //TODO: standard: 0.9_f64
         min_shared_minis = 0.5;
         cm_mini = 0.5;
         s = 9;
         t = 2;
-    }
-    else if mode == "pacbio"{
+    } else if mode == "pacbio" {
         k = 15;
         w = 51;
-        quality_threshold = 0.98_f64.powi(k as i32);//TODO://standard: 0.97_f64
+        quality_threshold = 0.98_f64.powi(k as i32); //TODO://standard: 0.97_f64
         min_shared_minis = 0.5;
         cm_mini = 0.8;
         s = 9;
         t = 3;
-    }
-    else {
-        if cli.cm_mini.is_some(){
+    } else {
+        if cli.cm_mini.is_some() {
             let cm_mini = cli.cm_mini;
         }
-        if cli.quality_threshold.is_some(){
-            let qt=cli.quality_threshold.unwrap();
-            if cli.k.is_some(){
+        if cli.quality_threshold.is_some() {
+            let qt = cli.quality_threshold.unwrap();
+            if cli.k.is_some() {
                 k = cli.k.unwrap();
+            } else {
+                panic!("Please set k")
             }
-            else{ panic!("Please set k")}
             w = 0;
             t = 0;
             s = 0;
             quality_threshold = qt.powi(k as i32);
             min_shared_minis = 0.5;
             cm_mini = 0.5;
+        } else {
+            panic!("Please set the quality_threshold")
         }
-        else { panic!("Please set the quality_threshold") }
     }
     let verbose = cli.verbose;
     /*let mut verbose = false;
@@ -267,51 +312,45 @@ fn main() {
         noncanonical_bool = true;
     }*/
     let seeding_input = cli.seeding.as_deref();
-    let mut seeding= "minimizer";
+    let mut seeding = "minimizer";
     if let Some(seed) = seeding_input {
         seeding = seed;
     }
-    if cli.k.is_some(){
+    if cli.k.is_some() {
         k = cli.k.unwrap();
     }
-    if cli.min_shared_minis.is_some(){
+    if cli.min_shared_minis.is_some() {
         min_shared_minis = cli.min_shared_minis.unwrap()
     }
-    if seeding =="syncmer"{
-        if cli.s.is_some(){
+    if seeding == "syncmer" {
+        if cli.s.is_some() {
             s = cli.s.unwrap();
-            if (k - s + 1) % 2 != 0{
+            if (k - s + 1) % 2 != 0 {
                 panic!("Please set k and s so that (k-s)+1 yields an odd number")
             }
         }
-        if cli.t.is_some(){
+        if cli.t.is_some() {
             t = cli.t.unwrap();
-            if (k-s) / 2 != t{
+            if (k - s) / 2 != t {
                 panic!("Please set k,s and t to fulfill (k-s)/2=t")
             }
         }
-    }
-
-    else if seeding == "minimizer" {
-        if cli.w.is_some(){
+    } else if seeding == "minimizer" {
+        if cli.w.is_some() {
             w = cli.w.unwrap();
-            if w < k{
+            if w < k {
                 panic!("Please set w greater than k")
-            }
-            else if w%2==0{
+            } else if w % 2 == 0 {
                 panic!("Please choose w to be odd")
             }
         }
     }
-    println!("k: {:?}", k);
-    println!("w: {:?}", w);
-    println!("s: {:?}", s);
-    println!("t: {:?}", t);
-    println!("quality_threshold {:?}",quality_threshold);
-    println!("Min shared minis: {}",min_shared_minis);
+    info!("k: {}, w: {}, s: {}, t: {}", k, w, s, t);
+    info!("quality_threshold {:?}", quality_threshold);
+    info!("Min shared minis: {}", min_shared_minis);
+
     //let k = cli.k;
     let outfolder = cli.outfolder;
     let gff_path = cli.gff.as_deref();
@@ -319,28 +358,46 @@ fn main() {
     let mut id_map = FxHashMap::default();
     let mut clusters: Cluster_ID_Map = HashMap::default(); //FxHashMap<i32, Vec<i32>> = FxHashMap::default();
-    let mut annotation_based= false;
+    let mut annotation_based = false;
     let filename = outfolder.clone() + "/clustering/sorted.fastq";
-    println!("Using {}s as seeds", seeding);
+
+    // info or debug?
+    info!("Using {}s as seeds", seeding);
+
     let now1 = Instant::now();
-    {//main scope (holds all the data structures that we can delete when the clustering is done
+    {
+        //main scope (holds all the data structures that we can delete when the clustering is done
         //holds the mapping of which minimizer belongs to what clusters
         //let mut shared_seed_info: FxHashMap=FxHashMap::default();
         let mut cluster_map: Seed_Map = HashMap::default(); //FxHashMap<u64, Vec<i32>> = FxHashMap::default();
         let initial_clustering_path = cli.init_cl.as_deref();
-        if gff_path.is_some(){
-            gff_handling::gff_based_clustering(gff_path, initial_clustering_path, &mut clusters, &mut cluster_map, k, w, seeding, s, t, noncanonical_bool);
-            println!("{} s used for parsing the annotation information", now1.elapsed().as_secs());
-            print!("{:?}", clusters);
+        if gff_path.is_some() {
+            gff_handling::gff_based_clustering(
+                gff_path,
+                initial_clustering_path,
+                &mut clusters,
+                &mut cluster_map,
+                k,
+                w,
+                seeding,
+                s,
+                t,
+                noncanonical_bool,
+            );
+            info!(
+                "{} s used for parsing the annotation information",
+                now1.elapsed().as_secs()
+            );
+            info!("{:?}", clusters);
             annotation_based = true;
         }
-        if verbose{
+        if verbose {
             if let Some(usage) = memory_stats() {
-                println!("Current physical memory usage: {}", usage.physical_mem);
-                println!("Current virtual memory usage: {}", usage.virtual_mem);
+                info!("Current physical memory usage: {}", usage.physical_mem);
+                info!("Current virtual memory usage: {}", usage.virtual_mem);
             } else {
-                println!("Couldn't get the current memory usage :(");
+                info!("Couldn't get the current memory usage :(");
             }
         }
@@ -350,10 +407,10 @@ fn main() {
         if verbose {
             if let Some(usage) = memory_stats() {
-                println!("Current physical memory usage: {}", usage.physical_mem);
-                println!("Current virtual memory usage: {}", usage.virtual_mem);
+                info!("Current physical memory usage: {}", usage.physical_mem);
+                info!("Current virtual memory usage: {}", usage.virtual_mem);
             } else {
-                println!("Couldn't get the current memory usage :(");
+                info!("Couldn't get the current memory usage :(");
             }
         }
@@ -365,35 +422,49 @@ fn main() {
         //count the number of reads that were too short to be clustered
         //d_no_min contains a translation for chars into quality values
         let d_no_min = seeding_and_filtering_seeds::compute_d_no_min();
-        println!("{}", filename);
+        info!("{}", filename);
         let now2 = Instant::now();
-        generate_sorted_fastq_for_cluster::sort_fastq_for_cluster(k, q_threshold, &cli.fastq, &outfolder, &quality_threshold, w, seeding, s, t, noncanonical_bool,verbose);
+        generate_sorted_fastq_for_cluster::sort_fastq_for_cluster(
+            k,
+            q_threshold,
+            &cli.fastq,
+            &outfolder,
+            &quality_threshold,
+            w,
+            seeding,
+            s,
+            t,
+            noncanonical_bool,
+            verbose,
+        );
         let now3 = Instant::now();
         if verbose {
-            println!("{} s for sorting the fastq file", now2.elapsed().as_secs());
+            info!("{} s for sorting the fastq file", now2.elapsed().as_secs());
             if let Some(usage) = memory_stats() {
-                println!("Current physical memory usage: {}", usage.physical_mem);
-                println!("Current virtual memory usage: {}", usage.virtual_mem);
+                info!("Current physical memory usage: {}", usage.physical_mem);
+                info!("Current virtual memory usage: {}", usage.virtual_mem);
             } else {
-                println!("Couldn't get the current memory usage :(");
+                info!("Couldn't get the current memory usage :(");
             }
             let now3 = Instant::now();
-            println!("initial clusters {:?}", clusters);
+            info!("initial clusters {:?}", clusters);
         }
         //#################################################################################################
         //CLUSTERING STEP
         //#################################################################################################
-        {//Clustering scope ( we define a scope so that variables die that we do not use later on)
+        {
+            //Clustering scope ( we define a scope so that variables die that we do not use later on)
             //the read id stores an internal id to represent our read
             let mut read_id = 0;
             //this gives the percentage of high_confidence seeds that the read has to share with a cluster to be added to it
-            let reader = fastq::Reader::from_file(Path::new(&filename)).expect("We expect the file to exist");
-            for record in reader.records(){
+            let reader = fastq::Reader::from_file(Path::new(&filename))
+                .expect("We expect the file to exist");
+            for record in reader.records() {
                 let seq_rec = record.expect("invalid record");
                 let header_new = seq_rec.id();
-                if verbose{
-                    //println!("ID: {}",header_new);
+                if verbose {
+                    //info!("ID: {}",header_new);
                 }
                 let sequence = seq_rec.seq();
                 let quality = seq_rec.qual();
@@ -401,9 +472,9 @@ fn main() {
                 id_map.insert(read_id, header_new.to_string());
                 let mut this_minimizers = vec![];
                 let mut filtered_minis = vec![];
-                if seeding == "minimizer"{
-                    if w > k{
-                        w = w - k+ 1; // the minimizer generator will panic if w is even to break ties
+                if seeding == "minimizer" {
+                    if w > k {
+                        w = w - k + 1; // the minimizer generator will panic if w is even to break ties
                         if w % 2 == 0 {
                             w += 1;
                         }
@@ -414,90 +485,129 @@ fn main() {
                             .width((w) as u16)
                             .iter(sequence);
                         for (minimizer, position) in min_iter {
-                            let mini = Minimizer_hashed { sequence: minimizer, position: position };
+                            let mini = Minimizer_hashed {
+                                sequence: minimizer,
+                                position: position,
+                            };
                             this_minimizers.push(mini);
                         }
-                    }
-                    else{
+                    } else {
                         let min_iter = MinimizerBuilder::<u64>::new()
                             .canonical()
                             .minimizer_size(k)
                             .width((w) as u16)
                             .iter(sequence);
                         for (minimizer, position, _) in min_iter {
-                            let mini = Minimizer_hashed {sequence: minimizer,position: position };
+                            let mini = Minimizer_hashed {
+                                sequence: minimizer,
+                                position: position,
+                            };
                             this_minimizers.push(mini);
                         }
                     }
-                }
-                else if seeding == "syncmer"{
-                    if noncanonical_bool{
-                        seeding_and_filtering_seeds::get_kmer_syncmers(sequence, k, s, t, &mut this_minimizers);
-                    }
-                    else {
-                        seeding_and_filtering_seeds::syncmers_canonical(sequence, k, s, t, &mut this_minimizers);
+                } else if seeding == "syncmer" {
+                    if noncanonical_bool {
+                        seeding_and_filtering_seeds::get_kmer_syncmers(
+                            sequence,
+                            k,
+                            s,
+                            t,
+                            &mut this_minimizers,
+                        );
+                    } else {
+                        seeding_and_filtering_seeds::syncmers_canonical(
+                            sequence,
+                            k,
+                            s,
+                            t,
+                            &mut this_minimizers,
+                        );
                     }
                 }
-                seeding_and_filtering_seeds::filter_seeds_by_quality(&this_minimizers, quality, k, d_no_min, &mut filtered_minis, &quality_threshold, verbose);
+                seeding_and_filtering_seeds::filter_seeds_by_quality(
+                    &this_minimizers,
+                    quality,
+                    k,
+                    d_no_min,
+                    &mut filtered_minis,
+                    &quality_threshold,
+                    verbose,
+                );
                 // perform the clustering step
-                //println!("{} filtered_minis", filtered_minis.len());
-                //println!("{} this_minimizers", this_minimizers.len());
-                //println!(" ");
+                //info!("{} filtered_minis", filtered_minis.len());
+                //info!("{} this_minimizers", this_minimizers.len());
+                //info!(" ");
                 let mut shared_seed_infos_norm_vec: Vec<u32> = vec![0; clusters.len()];
-                clustering::cluster(&filtered_minis, min_shared_minis, &this_minimizers, &mut clusters, &mut cluster_map, read_id, &mut shared_seed_infos_norm_vec);
+                clustering::cluster(
+                    &filtered_minis,
+                    min_shared_minis,
+                    &this_minimizers,
+                    &mut clusters,
+                    &mut cluster_map,
+                    read_id,
+                    &mut shared_seed_infos_norm_vec,
+                );
                 read_id += 1;
-                if verbose{
+                if verbose {
                     if read_id % 1000000 == 0 {
-                        println!("{} reads processed", read_id);
+                        info!("{} reads processed", read_id);
                     }
                 }
             }
-            println!("Generated {} clusters from clustering",clusters.len());
-            println!("Finished clustering");
-            println!("{} reads used for clustering",read_id);
+            info!("Generated {} clusters from clustering", clusters.len());
+            info!("Finished clustering");
+            info!("{} reads used for clustering", read_id);
             if verbose {
-                //println!("{} s for reading the sorted fastq file and clustering of the reads", now3.elapsed().as_secs());
+                //info!("{} s for reading the sorted fastq file and clustering of the reads", now3.elapsed().as_secs());
             }
             if let Some(usage) = memory_stats() {
-                println!("Current physical memory usage: {}", usage.physical_mem);
-                println!("Current virtual memory usage: {}", usage.virtual_mem);
+                info!("Current physical memory usage: {}", usage.physical_mem);
+                info!("Current virtual memory usage: {}", usage.virtual_mem);
             } else {
-                println!("Couldn't get the current memory usage :(");
+                info!("Couldn't get the current memory usage :(");
            }
            //post_cluster: true -> run post_clustering
-            if post_cluster{
-                println!("Starting post-clustering to refine clusters");
+            if post_cluster {
+                info!("Starting post-clustering to refine clusters");
                 let now_pc = Instant::now();
                 let mut shared_seed_infos_vec: Vec<u32> = vec![0; clusters.len()];
-                cluster_merging(&mut clusters, &mut cluster_map, cm_mini, &mut shared_seed_infos_vec, verbose);
-                println!("{} s for post-clustering", now_pc.elapsed().as_secs());
-                println!("Got {} clusters from Post-clustering",clusters.len());
+                cluster_merging(
+                    &mut clusters,
+                    &mut cluster_map,
+                    cm_mini,
+                    &mut shared_seed_infos_vec,
+                    verbose,
+                );
+                info!("{} s for post-clustering", now_pc.elapsed().as_secs());
+                info!("Got {} clusters from Post-clustering", clusters.len());
                 if let Some(usage) = memory_stats() {
-                    println!("Current physical memory usage: {}", usage.physical_mem);
-                    println!("Current virtual memory usage: {}", usage.virtual_mem);
+                    info!("Current physical memory usage: {}", usage.physical_mem);
+                    info!("Current virtual memory usage: {}", usage.virtual_mem);
                 } else {
-                    println!("Couldn't get the current memory usage :(");
+                    info!("Couldn't get the current memory usage :(");
                 }
             }
         }
-        println!("{} s for clustering and post_clustering", now3.elapsed().as_secs());
+        info!(
+            "{} s for clustering and post_clustering",
+            now3.elapsed().as_secs()
+        );
     }
     //#################################################################################################
     //FILE OUTPUT STEP
     //#################################################################################################
     let now4 = Instant::now();
-    write_output::write_output(outfolder, &clusters, filename, id_map, n ,no_fastq);
-    println!("{} s for file output", now4.elapsed().as_secs());
+    write_output::write_output(outfolder, &clusters, filename, id_map, n, no_fastq);
+    info!("{} s for file output", now4.elapsed().as_secs());
     if let Some(usage) = memory_stats() {
-        println!("Current physical memory usage: {}", usage.physical_mem);
-        println!("Current virtual memory usage: {}", usage.virtual_mem);
+        info!("Current physical memory usage: {}", usage.physical_mem);
+        info!("Current virtual memory usage: {}", usage.virtual_mem);
     } else {
-        println!("Couldn't get the current memory usage :(");
+        info!("Couldn't get the current memory usage :(");
     }
-    println!("{} overall runtime", now1.elapsed().as_secs());
+    info!("{} overall runtime", now1.elapsed().as_secs());
 }
-
diff --git a/src/seeding_and_filtering_seeds.rs b/src/seeding_and_filtering_seeds.rs
index 94956b3..9cee186 100644
--- a/src/seeding_and_filtering_seeds.rs
+++ b/src/seeding_and_filtering_seeds.rs
@@ -1,22 +1,19 @@
-use std::collections::VecDeque;
-use rayon::prelude::*;
-use crate::structs::{ Minimizer_hashed};
 use crate::clustering::reverse_complement;
+use crate::structs::Minimizer_hashed;
+use log::debug;
+use rayon::prelude::*;
 use std::borrow::Cow;
 use std::collections::hash_map::DefaultHasher;
+use std::collections::VecDeque;
 use std::hash::{Hash, Hasher};
-
-
 //takes an object T and hashes it via DefaultHasher. Used to improve search for minimizers in the data
-pub fn calculate_hash<T:Hash>(t: &T) -> u64 {
+pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
     let mut s = DefaultHasher::new();
     t.hash(&mut s);
     s.finish()
 }
-
-
 pub fn compute_d_no_min() -> [f64; 128] {
     let mut d = [0.0; 128];
@@ -29,8 +26,6 @@ pub fn compute_d_no_min() -> [f64; 128] {
     d
 }
-
-
 fn cow_to_string(cow: Cow<'_, [u8]>) -> String {
     String::from_utf8(cow.into_owned()).unwrap_or_else(|e| {
         e.utf8_error().to_string()
     })
 }
-
-
-pub fn get_canonical_kmer_minimizers_hashed(seq: &[u8], k_size: usize, w_size: usize, this_minimizers: &mut Vec<Minimizer_hashed>) {
+pub fn get_canonical_kmer_minimizers_hashed(
+    seq: &[u8],
+    k_size: usize,
+    w_size: usize,
+    this_minimizers: &mut Vec<Minimizer_hashed>,
+) {
     //make sure that we have suitable values for k_size and w_size (w_size should be larger)
     let w;
-    if w_size > k_size{
+    if w_size > k_size {
         w = w_size - k_size + 1;
     }
     //k_size was chosen larger than w_size. To not fail we use every k-mer as minimizer (maybe have an error message?)
@@ -58,8 +56,8 @@ pub fn get_canonical_kmer_minimizers_hashed(seq: &[u8], k_size: usize, w_size: u
     let mut reverse_hash;
     //let full_seq=cow_to_string(seq.clone());
     //we can only get a minimizer if the sequence is longer than w + k_size - 1 (else we do not even cover one full window)
-    if w + k_size < seq.len() + 1{
-        for i in 0 .. w {
+    if w + k_size < seq.len() + 1 {
+        for i in 0..w {
             k_mer_str = std::str::from_utf8(&seq[i..i + k_size]).unwrap();
             rc_string = reverse_complement(k_mer_str);
             //generate the hashes of the kmers
@@ -68,19 +66,21 @@ pub fn get_canonical_kmer_minimizers_hashed(seq: &[u8], k_size: usize, w_size: u
             //we now want to find the canonical minimizer: we only push the smaller k-mer of k_mer_str and rc_String into the window
             if forward_hash <= reverse_hash {
                 window_kmers.push_back((forward_hash, i));
-            }
-            else{
+            } else {
                 window_kmers.push_back((reverse_hash, i))
             }
         }
     }
     //store the final positional minimizers in a vector
-    if !window_kmers.is_empty(){
+    if !window_kmers.is_empty() {
         // Find the initial minimizer (minimizer of initial window)
-        let mut binding= window_kmers.clone();
+        let mut binding = window_kmers.clone();
         let (curr_min, min_pos) = binding.iter().min_by_key(|&(kmer, _)| kmer).unwrap();
         //add the initial minimizer to the vector
-        let mut mini = Minimizer_hashed {sequence: *curr_min,position: *min_pos };
+        let mut mini = Minimizer_hashed {
+            sequence: *curr_min,
+            position: *min_pos,
+        };
         this_minimizers.push(mini.clone());
         //we always store the previous minimizer to compare to the newly found one
         let mut prev_minimizer = mini;
@@ -91,27 +91,30 @@ pub fn get_canonical_kmer_minimizers_hashed(seq: &[u8], k_size: usize, w_size: u
         let mut reverse_hash;
         //iterate further over the sequence and generate the minimizers thereof
         for (i, new_kmer) in seq[w..].windows(k_size).enumerate() {
-            new_kmer_pos = i + w;
+            new_kmer_pos = i + w;
             new_kmer_str = std::str::from_utf8(new_kmer).unwrap();
             rc_string = reverse_complement(new_kmer_str);
             // updating by removing first kmer from window
             window_kmers.pop_front().unwrap();
             forward_hash = calculate_hash(&new_kmer_str);
             reverse_hash = calculate_hash(&rc_string);
-            if reverse_hash > forward_hash{
+            if reverse_hash > forward_hash {
                 window_kmers.push_back((forward_hash, new_kmer_pos));
-            }
-            else {
-                window_kmers.push_back((reverse_hash,new_kmer_pos))
+            } else {
+                window_kmers.push_back((reverse_hash, new_kmer_pos))
             }
             // Find the new minimizer, we need a ds that was cloned from window_kmers to abide ownership rules in rust
             binding = window_kmers.clone();
             let (curr_min, min_pos) = *binding.iter().min_by_key(|&(kmer, _)| kmer).unwrap();
             //make sure that the minimal string is a new minimizer not just the previously found one
-            if min_pos != prev_minimizer.position{ //&& *curr_min != prev_minimizer.1 {
+            if min_pos != prev_minimizer.position {
+                //&& *curr_min != prev_minimizer.1 {
                 //add the minimizer into the vector and store the minimizer as previously detected minimizer
-                mini = Minimizer_hashed {sequence: curr_min,position: min_pos };
-                //println!("minimizer {:?}",mini);
+                mini = Minimizer_hashed {
+                    sequence: curr_min,
+                    position: min_pos,
+                };
+                debug!("minimizer {:?}", mini);
                 this_minimizers.push(mini.clone());
                 prev_minimizer = mini.clone();
             }
@@ -120,7 +123,6 @@ pub fn get_canonical_kmer_minimizers_hashed(seq: &[u8], k_size: usize, w_size: u
         }
     }
 }
-
 /// Generates positional minimizers from an input string.
 /// A positional minimizer is the lexicographically smallest substring of a given window size
 /// as the window slides through the input string.
 ///
 /// A vector containing `Minimizer` structs, each containing the lexicographically smallest
 ///substring and its starting position in the input string.
-pub fn get_kmer_minimizers_hashed(seq: &[u8], k_size: usize, w_size: usize, this_minimizers: &mut Vec<Minimizer_hashed>) {
+pub fn get_kmer_minimizers_hashed(
+    seq: &[u8],
+    k_size: usize,
+    w_size: usize,
+    this_minimizers: &mut Vec<Minimizer_hashed>,
+) {
     //make sure that we have suitable values for k_size and w_size (w_size should be larger)
-    let w;
-    if w_size > k_size{
+    let w;
+    if w_size > k_size {
         w = w_size - k_size + 1;
     }
     //k_size was chosen larger than w_size. To not fail we use every k-mer as minimizer (maybe have an error message?)
@@ -150,20 +157,23 @@ pub fn get_kmer_minimizers_hashed(seq: &[u8], k_size: usize, w_size: usize, this
     let mut k_mer_str: &str;
     let mut forward_hash;
     //we can only get a minimizer if the sequence is longer than w + k_size - 1 (else we do not even cover one full window)
-    if w + k_size < seq.len() + 1{
-        for i in 0 .. w {
+    if w + k_size < seq.len() + 1 {
+        for i in 0..w {
             k_mer_str = std::str::from_utf8(&seq[i..i + k_size]).unwrap();
             forward_hash = calculate_hash(&k_mer_str);
             window_kmers.push_back((forward_hash, i));
         }
     }
     //store the final positional minimizers in a vector
-    if !window_kmers.is_empty(){
+    if !window_kmers.is_empty() {
         // Find the initial minimizer (minimizer of initial window)
-        let mut binding= window_kmers.clone();
+        let mut binding = window_kmers.clone();
         let (curr_min, min_pos) = binding.iter().min_by_key(|&(kmer, _)| kmer).unwrap();
         //add the initial minimizer to the vector
-        let mut mini = Minimizer_hashed {sequence: *curr_min,position: *min_pos };
+        let mut mini = Minimizer_hashed {
+            sequence: *curr_min,
+            position: *min_pos,
+        };
         this_minimizers.push(mini.clone());
         //we always store the previous minimizer to compare to the newly found one
         let mut prev_minimizer = mini;
@@ -172,7 +182,7 @@ pub fn get_kmer_minimizers_hashed(seq: &[u8], k_size: usize, w_size: usize, this
         let mut forward_hash;
         //iterate further over the sequence and generate the minimizers thereof
         for (i, new_kmer) in seq[w..].windows(k_size).enumerate() {
-            new_kmer_pos = i + w;
+            new_kmer_pos = i + w;
             new_kmer_str = std::str::from_utf8(new_kmer).unwrap();
             // updating by removing first kmer from window
             window_kmers.pop_front().unwrap();
@@ -182,10 +192,14 @@ pub fn get_kmer_minimizers_hashed(seq: &[u8], k_size: usize, w_size: usize, this
             binding = window_kmers.clone();
             let (curr_min, min_pos) = *binding.iter().min_by_key(|&(kmer, _)| kmer).unwrap();
             //make sure that the minimal string is a new minimizer not just the previously found one
-            if min_pos != prev_minimizer.position{ //&& *curr_min != prev_minimizer.1 {
+            if min_pos != prev_minimizer.position {
+                //&& *curr_min != prev_minimizer.1 {
                 //add the minimizer into the vector and store the minimizer as previously detected minimizer
-                mini = Minimizer_hashed {sequence: curr_min,position: min_pos };
-                //println!("minimizer {:?}",mini);
+                mini = Minimizer_hashed {
+                    sequence: curr_min,
+                    position: min_pos,
+                };
+                debug!("minimizer {:?}", mini);
                 this_minimizers.push(mini.clone());
                 prev_minimizer = mini.clone();
             }
@@ -193,36 +207,46 @@ pub fn get_kmer_minimizers_hashed(seq: &[u8], k_size: usize, w_size: usize, this
         }
     }
 }
-
-
 //TODO: add neccessity that the user should give only valid combinations of s t and k
-pub(crate) fn syncmers_canonical(seq: &[u8], k: usize, s: usize, t: usize, syncmers: &mut Vec<Minimizer_hashed>) {
+pub(crate) fn syncmers_canonical(
+    seq: &[u8],
+    k: usize,
+    s: usize,
+    t: usize,
+    syncmers: &mut Vec<Minimizer_hashed>,
+) {
     // Calculate reverse complement
     let seq_rc = reverse_complement(std::str::from_utf8(seq).unwrap());
-    let seq_len= seq.len();
-    //println!("seqlen {}",seq_len);
-    //println!("seq_len {}", seq_len);
+    let seq_len = seq.len();
+    debug!("seqlen {}", seq_len);
+    debug!("seq_len {}", seq_len);
     // Initialize deques for forward and reverse complement sequences (stores the hashs of the smers)
     let mut window_smers_fw: VecDeque<u64> = (0..k - s + 1)
         .map(|i| calculate_hash(&seq[i..i + s]))
         .collect();
     //Initialize the reverse complement deque. We store the hash of the smer, however we have to generate the smer first. For this we take
     let mut window_smers_rc: VecDeque<u64> = (0..k - s + 1)
-        .map(|i| calculate_hash(&seq_rc[seq_len-(i+s)..seq_len-i]))
+        .map(|i| calculate_hash(&seq_rc[seq_len - (i + s)..seq_len - i]))
         .collect();
-    //println!("{} elements in our deque",window_smers_fw.len());
+    debug!("{} elements in our deque", window_smers_fw.len());
     // Find initial minimums (fw and rc)
     let mut curr_min_fw = *window_smers_fw.iter().min().unwrap();
     let mut curr_min_rc = *window_smers_rc.iter().min().unwrap();
     //find the minimum positions (fw and rc)
-    let pos_min_fw = window_smers_fw.iter().position(|&x| x == curr_min_fw).unwrap();
-    let pos_min_rc = window_smers_rc.iter().position(|&x| x == curr_min_rc).unwrap();
-    let rc_hash=calculate_hash(reverse_complement(std::str::from_utf8(&seq[0..k]).unwrap()).as_str());
+    let pos_min_fw = window_smers_fw
+        .iter()
+        .position(|&x| x == curr_min_fw)
+        .unwrap();
+    let pos_min_rc = window_smers_rc
+        .iter()
+        .position(|&x| x == curr_min_rc)
+        .unwrap();
+    let rc_hash =
+        calculate_hash(reverse_complement(std::str::from_utf8(&seq[0..k]).unwrap()).as_str());
     // Choose minimum position
     let (pos_min, seq_tmp) = if curr_min_fw < curr_min_rc {
         (pos_min_fw, calculate_hash(&seq[0..k]))
     } else {
-
         (pos_min_rc, rc_hash)
     };
@@ -235,12 +259,16 @@ pub(crate) fn syncmers_canonical(seq: &[u8], k: usize, s: usize, t: usize, syncm
     }
     // Iterate over the sequence
-    for i in (k - s) + 1 .. seq.len() - k {
-        //println!("i {}",i);
+    for i in (k - s) + 1..seq.len() - k {
+        debug!("i {}", i);
         let new_smer_fw = calculate_hash(&seq[i..i + s]);
-        let rev_slice=&seq_rc[seq_len-(i+s)..seq_len-i];
+        let rev_slice = &seq_rc[seq_len - (i + s)..seq_len - i];
         let new_smer_rc = calculate_hash(rev_slice);
-        //println!("fw_len: {}, rc_len: {}", &seq[i..i + s].len(),&seq_rc[seq_len-(i+1) - s ..seq_len-(i+1)].len());
+        debug!(
+            "fw_len: {}, rc_len: {}",
+            &seq[i..i + s].len(),
+            &seq_rc[seq_len - (i + 1) - s..seq_len - (i + 1)].len()
+        );
         // Update windows
         let _ = window_smers_fw.pop_front();
         window_smers_fw.push_back(new_smer_fw);
@@ -250,15 +278,27 @@ pub(crate) fn syncmers_canonical(seq: &[u8], k: usize, s: usize, t: usize, syncm
         // Update minimums and positions
         curr_min_fw = *window_smers_fw.iter().min().unwrap();
         curr_min_rc = *window_smers_rc.iter().min().unwrap();
-        let pos_min_fw = window_smers_fw.iter().position(|&x| x == curr_min_fw).unwrap();
-        let pos_min_rc = window_smers_rc.iter().position(|&x| x == curr_min_rc).unwrap();
+        let pos_min_fw = window_smers_fw
+            .iter()
+            .position(|&x| x == curr_min_fw)
+            .unwrap();
+        let pos_min_rc = window_smers_rc
+            .iter()
+            .position(|&x| x == curr_min_rc)
+            .unwrap();
         // Choose minimum position
-        //println!("startpos {} end {}",i-(k-s),i-(k-s)+k);
-        let rc_hash= calculate_hash(reverse_complement(std::str::from_utf8(&seq[i-(k-s)..i-(k-s)+k]).unwrap()).as_str());
+        debug!("startpos {} end {}", i - (k - s), i - (k - s) + k);
+        let rc_hash = calculate_hash(
+            reverse_complement(std::str::from_utf8(&seq[i - (k - s)..i - (k - s) + k]).unwrap())
+                .as_str(),
+        );
         let (pos_min, kmer) = if curr_min_fw < curr_min_rc {
-            (pos_min_fw, calculate_hash(&seq[i-(k-s)..i-(k-s)+k]))
+            (
+                pos_min_fw,
+                calculate_hash(&seq[i - (k - s)..i - (k - s) + k]),
+            )
         } else {
-            (pos_min_rc,rc_hash)
+            (pos_min_rc, rc_hash)
         };
         // Add syncmer to the list
@@ -271,20 +311,22 @@ pub(crate) fn syncmers_canonical(seq: &[u8], k: usize, s: usize, t: usize, syncm
         }
     }
 }
-
-
 //calculates the average of a list of f64s and returns it as f64
 fn average(numbers: &[f64]) -> f64 {
-    numbers.iter().sum::<f64>()/ numbers.len() as f64
+    numbers.iter().sum::<f64>() / numbers.len() as f64
 }
 ///Used to detect significant minimizers by checking the read qualities and estimating the overall quality of the area of the read
 /// Input: quality_interval: the quality values of the area we want to check
 /// Output: significance_indicator: a bool stating whether the minimizer is significant( true: yes, false: no)
 ///
-pub fn is_significant(quality_interval: &[u8], d_no_min:[f64;128],quality_threshold: &f64)->bool{
-    let mut significance_indicator= false;
-    let mut qualities :Vec<f64> = vec![];
+pub fn is_significant(
+    quality_interval: &[u8],
+    d_no_min: [f64; 128],
+    quality_threshold: &f64,
+) -> bool {
+    let mut significance_indicator = false;
+    let mut qualities: Vec<f64> = vec![];
     let mut quality = 1.0;
     let mut index;
     let mut q_value;
@@ -307,230 +349,236 @@ pub fn is_significant(quality_interval: &[u8], d_no_min:[f64;128],quality_thresh
     significance_indicator
 }
-
 //filter out minimizers for which the quality of the minimizer_impact range is too bad
-pub fn filter_seeds_by_quality(this_minimizers: &Vec<Minimizer_hashed>, fastq_quality:&[u8], k: usize, d_no_min:[f64;128], minimizers_filtered: &mut Vec<Minimizer_hashed>, quality_threshold: &f64, verbose:bool) {
+pub fn filter_seeds_by_quality(
+    this_minimizers: &Vec<Minimizer_hashed>,
+    fastq_quality: &[u8],
+    k: usize,
+    d_no_min: [f64; 128],
+    minimizers_filtered: &mut Vec<Minimizer_hashed>,
+    quality_threshold: &f64,
+    verbose: bool,
+) {
-    let mut skipped_cter= 0;
+    let mut skipped_cter = 0;
     let mut minimizer_range_start: usize;
     let mut significant;
-    //println!("Number of minimizers: {}",this_minimizers.len());
-    for mini in this_minimizers{//TODO: test whether into_par_iter works here
+    debug!("Number of minimizers: {}", this_minimizers.len());
+    for mini in this_minimizers {
+        //TODO: test whether into_par_iter works here
         minimizer_range_start = mini.position;
         let qualitiy_interval = &fastq_quality[minimizer_range_start..minimizer_range_start + k];
-        //println!("Quality_interval len {}",qualitiy_interval.len());
+        debug!("Quality_interval len {}", qualitiy_interval.len());
         significant = is_significant(qualitiy_interval, d_no_min, quality_threshold);
-        //println!("Quality intervallen {}",qualitiy_interval.len());
+        debug!("Quality intervallen {}", qualitiy_interval.len());
         if significant {
             minimizers_filtered.push(mini.clone())
-        }
-        else{
+        } else {
             skipped_cter += 1;
         }
     }
-    if verbose{
-        //println!("Number of insignificant seeds: {}",skipped_cter );
-        //println!("Number of significant seeds: {}",minimizers_filtered.len());
+    if verbose {
+        debug!("Number of insignificant seeds: {}", skipped_cter);
+        debug!("Number of significant seeds: {}", minimizers_filtered.len());
     }
 }
-
-
-
 ///Method used to generate syncmers from reads
 /// INPUT:  seq: a string reference to the original read sequence
 ///         k_size: The size of the k_mer used
 ///         s_size: The size of s
 ///         t: The size of parameter t
 ///OUTPUT:  syncmers: A vector storing all syncmers (we use the minimizer struct to store them as essentially the same infos)
-pub(crate) fn get_kmer_syncmers(seq: &[u8], k: usize, s: usize, t: usize, syncmers: &mut Vec<Minimizer_hashed>) {
+pub(crate) fn get_kmer_syncmers(
+    seq: &[u8],
+    k: usize,
+    s: usize,
+    t: usize,
+    syncmers: &mut Vec<Minimizer_hashed>,
+) {
     //TODO: add neccessity that the user should give only valid combinations of s t and k
-        // Calculate reverse complement
-        //let seq_rc = reverse_complement(std::str::from_utf8(seq).unwrap());
-        let seq_len= seq.len();
-        //println!("seqlen {}",seq_len);
-        //println!("seq_len {}", seq_len);
-        // Initialize deques for forward and reverse complement sequences (stores the hashs of the smers)
-        let mut window_smers_fw: VecDeque<u64> = (0..k - s + 1)
-            .map(|i| calculate_hash(&seq[i..i + s]))
-            .collect();
-        //Initialize the reverse complement deque. We store the hash of the smer, however we have to generate the smer first. For this we take
-        //let mut window_smers_rc: VecDeque<u64> = (0..k - s + 1)
-        //    .map(|i| calculate_hash(&seq_rc[seq_len-(i+s)..seq_len-i]))
-        //    .collect();
-        //println!("{} elements in our deque",window_smers_fw.len());
-        // Find initial minimums (fw and rc)
-        let mut curr_min_fw = *window_smers_fw.iter().min().unwrap();
-        //let mut curr_min_rc = *window_smers_rc.iter().min().unwrap();
-        //find the minimum positions (fw and rc)
-        let pos_min_fw = window_smers_fw.iter().position(|&x| x == curr_min_fw).unwrap();
-        //let pos_min_rc = window_smers_rc.iter().position(|&x| x == curr_min_rc).unwrap();
-        //let rc_hash=calculate_hash(reverse_complement(std::str::from_utf8(&seq[0..k]).unwrap()).as_str());
-        // Choose minimum position
-        //let (pos_min, seq_tmp) = if curr_min_fw < curr_min_rc {
-        //    (pos_min_fw, calculate_hash(&seq[0..k]))
-        //} else {
-        //
-        //    (pos_min_rc, rc_hash)
-        //};
-
-        // Initialize syncmers list
-        if pos_min_fw == t {
-            syncmers.push(Minimizer_hashed {
-                sequence: calculate_hash(&seq[0..k]),
-                position: 0,
-            });
-        }
-
-        // Iterate over the sequence
-        for i in (k - s) + 1 .. seq.len() - k {
-            //println!("i {}",i);
-            let new_smer_fw = calculate_hash(&seq[i..i + s]);
-            //let rev_slice=&seq_rc[seq_len-(i+s)..seq_len-i];
-            //let new_smer_rc = calculate_hash(rev_slice);
-            //println!("fw_len: {}, rc_len: {}", &seq[i..i + s].len(),&seq_rc[seq_len-(i+1) - s ..seq_len-(i+1)].len());
-            // Update windows
-            let _ = window_smers_fw.pop_front();
-            window_smers_fw.push_back(new_smer_fw);
-            //let _ = window_smers_rc.pop_front();
-            //window_smers_rc.push_back(new_smer_rc);
-
-            // Update minimums and positions
-            curr_min_fw = *window_smers_fw.iter().min().unwrap();
-            //curr_min_rc = *window_smers_rc.iter().min().unwrap();
-            let pos_min_fw = window_smers_fw.iter().position(|&x| x == curr_min_fw).unwrap();
-            //let pos_min_rc = window_smers_rc.iter().position(|&x| x == curr_min_rc).unwrap();
-            // Choose minimum position
-            //println!("startpos {} end {}",i-(k-s),i-(k-s)+k);
-            //let rc_hash= calculate_hash(reverse_complement(std::str::from_utf8(&seq[i-(k-s)..i-(k-s)+k]).unwrap()).as_str());
-            //let (pos_min, kmer) = if curr_min_fw < curr_min_rc {
-            //    (pos_min_fw, calculate_hash(&seq[i-(k-s)..i-(k-s)+k]))
-            //} else {
-            //    (pos_min_rc,rc_hash)
-            //};
-
-            // Add syncmer to the list
-            if pos_min_fw == t {
-                syncmers.push(Minimizer_hashed {
-                    sequence: calculate_hash(&seq[i-(k-s)..i-(k-s)+k]),
-                    position: i,
-                });
-            }
-        }
-    }
+    // Calculate reverse complement
+    //let seq_rc = reverse_complement(std::str::from_utf8(seq).unwrap());
+    let seq_len = seq.len();
+    debug!("seqlen {}", seq_len);
+    debug!("seq_len {}", seq_len);
+    // Initialize deques for forward and reverse complement sequences (stores the hashs of the smers)
+    let mut window_smers_fw: VecDeque<u64> = (0..k - s + 1)
+        .map(|i| calculate_hash(&seq[i..i + s]))
+        .collect();
+    //Initialize the reverse complement deque. We store the hash of the smer, however we have to generate the smer first. For this we take
+    //let mut window_smers_rc: VecDeque<u64> = (0..k - s + 1)
+    //    .map(|i| calculate_hash(&seq_rc[seq_len-(i+s)..seq_len-i]))
+    //    .collect();
+    debug!("{} elements in our deque", window_smers_fw.len());
+    // Find initial minimums (fw and rc)
+    let mut curr_min_fw = *window_smers_fw.iter().min().unwrap();
+    //let mut curr_min_rc = *window_smers_rc.iter().min().unwrap();
+    //find the minimum positions (fw and rc)
+    let pos_min_fw = window_smers_fw
+        .iter()
+        .position(|&x| x == curr_min_fw)
+        .unwrap();
+    //let pos_min_rc = window_smers_rc.iter().position(|&x| x == curr_min_rc).unwrap();
+    //let rc_hash=calculate_hash(reverse_complement(std::str::from_utf8(&seq[0..k]).unwrap()).as_str());
+    // Choose minimum position
+    //let (pos_min, seq_tmp) = if curr_min_fw < curr_min_rc {
+    //    (pos_min_fw, calculate_hash(&seq[0..k]))
+    //} else {
+    //
+    //    (pos_min_rc, rc_hash)
+    //};
+
+    // Initialize syncmers list
+    if pos_min_fw == t {
+        syncmers.push(Minimizer_hashed {
+            sequence: calculate_hash(&seq[0..k]),
+            position: 0,
+        });
+    }
+
+    // Iterate over the sequence
+    for i in (k - s) + 1..seq.len() - k {
+        debug!("i {}", i);
+        let new_smer_fw = calculate_hash(&seq[i..i + s]);
+        //let rev_slice=&seq_rc[seq_len-(i+s)..seq_len-i];
+        //let new_smer_rc = calculate_hash(rev_slice);
+        // Update windows
+        let _ = window_smers_fw.pop_front();
+        window_smers_fw.push_back(new_smer_fw);
+        //let _ = window_smers_rc.pop_front();
+        //window_smers_rc.push_back(new_smer_rc);
+
+        // Update minimums and positions
+        curr_min_fw = *window_smers_fw.iter().min().unwrap();
+        //curr_min_rc = *window_smers_rc.iter().min().unwrap();
+        let pos_min_fw = window_smers_fw
+            .iter()
+            .position(|&x| x == curr_min_fw)
+            .unwrap();
+        //let pos_min_rc = window_smers_rc.iter().position(|&x| x == curr_min_rc).unwrap();
+        // Choose minimum position
+        debug!("startpos {} end {}", i - (k - s), i - (k - s) + k);
+        //let rc_hash= calculate_hash(reverse_complement(std::str::from_utf8(&seq[i-(k-s)..i-(k-s)+k]).unwrap()).as_str());
+        //let (pos_min, kmer) = if curr_min_fw < curr_min_rc {
+        //    (pos_min_fw, calculate_hash(&seq[i-(k-s)..i-(k-s)+k]))
+        //} else {
+        //    (pos_min_rc,rc_hash)
+        //};
+
+        // Add syncmer to the list
+        if pos_min_fw == t {
+            syncmers.push(Minimizer_hashed {
+                sequence: calculate_hash(&seq[i - (k - s)..i - (k - s) + k]),
+                position: i,
+            });
+        }
+    }
+}
-
-
-
-
-
-
-
-
-
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    //#[test]
-    /*fn test_kmer_minimizers_0() {
-        let input = "ATGCTAGCATGCTAGCATGCTAGC";
-        let window_size = 8;
-        let k = 3;
-        let actual_minimizers = get_kmer_minimizers(input, k, window_size);
-        println!("Generated Minimizers: {:?}", actual_minimizers);
-        let expected_minimizers = vec![
-            Minimizer { sequence: "ATG".to_string(), position: 0 },
-            Minimizer { sequence: "AGC".to_string(), position: 5 },
-            Minimizer { sequence: "ATG".to_string(), position: 8 },
-            Minimizer { sequence: "AGC".to_string(), position: 13 },
-            Minimizer { sequence: "ATG".to_string(), position: 16 },
-            Minimizer { sequence: "AGC".to_string(), position: 21 },
-        ];
-        assert_eq!(actual_minimizers, expected_minimizers);
-    }
-    #[test]
-    fn test_kmer_minimizers_1() {
-        let input = "CAATTTAAGGCCCGGG";
-        let window_size = 10;
-        let k = 5;
-        let actual_minimizers = get_kmer_minimizers(input, k, window_size);
-        println!("Generated Minimizers: {:?}", actual_minimizers);
-        let expected_minimizers = vec![
-            Minimizer { sequence: "AATTT".to_string(), position: 1 },
-            Minimizer { sequence: "AAGGC".to_string(), position: 6 },
-            Minimizer { sequence: "AGGCC".to_string(), position: 7 },
-        ];
-        assert_eq!(actual_minimizers, expected_minimizers);
-    }
-
-    #[test]
-    fn test_kmer_minimizers_2() {
-        let input = "CAAAGTAAGGCCCTCC";
-        let window_size = 10;
-        let k = 5;
-        let actual_minimizers = get_kmer_minimizers(input, k, window_size);
-        println!("Generated Minimizers: {:?}", actual_minimizers);
-        let expected_minimizers = vec![
-            Minimizer { sequence: "AAAGT".to_string(), position: 1 },
-            Minimizer { sequence: "AAGGC".to_string(), position: 6 },
-            Minimizer { sequence: "AGGCC".to_string(), position: 7 },
-        ];
-        assert_eq!(actual_minimizers, expected_minimizers);
-    }
-    #[test]
-    fn test_kmer_minimizers_3() {
-        let input = "CAATGA";
-        let window_size = 10;
-        let k = 5;
-        let actual_minimizers = get_kmer_minimizers(input, k, window_size);
-        println!("Generated Minimizers: {:?}", actual_minimizers);
-        let expected_minimizers = vec![
-            Minimizer { sequence: "AATGA".to_string(), position: 1 },
-        ];
-        assert_eq!(actual_minimizers, expected_minimizers);
-    }
-    #[test]
-    fn test_canonical_minis_1(){
-        let input ="GGGTAACTTTTCA";
-        let window_size=12;
-        let k =6;
-        let actual_minimizers=get_canonical_kmer_minimizers(input,k,window_size);
-        println!("Generated Minimizers: {:?}", actual_minimizers);
-        let expected_minimizers = vec![
-            Minimizer { sequence: "AAAAGT".to_string(), position: 5 },
-        ];
-        assert_eq!(actual_minimizers, expected_minimizers);
-    }
-    /*#[test]
-    fn test_kmer_syncmers(){
-        let input ="CATTCAGGAATC";
-        let k=5;
-        let s=2;
-        let t=2;
-        let retreived_syncmers=get_kmer_syncmers(input,k,s,t);
-        let expected_syncmers=vec![
-            Minimizer { sequence: "TCAGG".to_string(), position: 3 },
-            Minimizer { sequence: "GGAAT".to_string(),position: 6},
-
-        ];
-        assert_eq!(retreived_syncmers, expected_syncmers);
-    }*/
-    #[test]
-    fn test_average_0(){
-        let mut input=vec![];
-        input.push(0.5);
-        input.push(0.75);
-        input.push(0.25);
-        let average_res=average(&*input);
-        assert_eq!(average_res,0.5);
-    }
-    #[test]
-    fn test_average_1(){
-        let mut input=vec![];
-        input.push(1.0);
-        input.push(2.0);
-        input.push(3.0);
-        let average_res=average(&*input);
-        assert_eq!(average_res,2.0);
-    }*/
-}
\ No newline at end of file
+// #[cfg(test)]
+// mod tests {
+//     use super::*;

+//     //#[test]
+//     /*fn test_kmer_minimizers_0() {
+//         let input = "ATGCTAGCATGCTAGCATGCTAGC";
+//         let window_size = 8;
+//         let k = 3;
+//         let actual_minimizers = get_kmer_minimizers(input, k, window_size);
+//         println!("Generated Minimizers: {:?}", actual_minimizers);
+//         let expected_minimizers = vec![
+//             Minimizer { sequence: "ATG".to_string(), position: 0 },
+//             Minimizer { sequence: "AGC".to_string(), position: 5 },
+//             Minimizer { sequence: "ATG".to_string(), position: 8 },
+//             Minimizer { sequence: "AGC".to_string(), position: 13 },
+//             Minimizer { sequence: "ATG".to_string(), position: 16 },
+//             Minimizer { sequence: "AGC".to_string(), position: 21 },
+//         ];
+//         assert_eq!(actual_minimizers, expected_minimizers);
+//     }
+//     #[test]
+//     fn test_kmer_minimizers_1() {
+//         let input = "CAATTTAAGGCCCGGG";
+//         let window_size = 10;
+//         let k = 5;
+//         let actual_minimizers = get_kmer_minimizers(input, k, window_size);
+//         println!("Generated Minimizers: {:?}", actual_minimizers);
+//         let expected_minimizers = vec![
+//             Minimizer { sequence: "AATTT".to_string(), position: 1 },
+//             Minimizer { sequence: "AAGGC".to_string(), position: 6 },
+//             Minimizer { sequence: "AGGCC".to_string(), position: 7 },
+//         ];
+//         assert_eq!(actual_minimizers, expected_minimizers);
+//     }

+//     #[test]
+//     fn test_kmer_minimizers_2() {
+//         let input = "CAAAGTAAGGCCCTCC";
+//         let window_size = 10;
+//         let k = 5;
+//         let actual_minimizers = get_kmer_minimizers(input, k, window_size);
+//         println!("Generated Minimizers: {:?}", actual_minimizers);
+//         let expected_minimizers = vec![
+//             Minimizer { sequence: "AAAGT".to_string(), position: 1 },
+//             Minimizer { sequence: "AAGGC".to_string(), position: 6 },
+//             Minimizer { sequence: "AGGCC".to_string(), position: 7 },
+//         ];
+//         assert_eq!(actual_minimizers, expected_minimizers);
+//     }
+//     #[test]
+//     fn test_kmer_minimizers_3() {
+//         let input = "CAATGA";
+//         let window_size = 10;
+//         let k = 5;
+//         let actual_minimizers = get_kmer_minimizers(input, k, window_size);
+//         println!("Generated Minimizers: {:?}", actual_minimizers);
+//         let expected_minimizers = vec![
+//             Minimizer { sequence: "AATGA".to_string(), position: 1 },
+//         ];
+//         assert_eq!(actual_minimizers, expected_minimizers);
+//     }
+//     #[test]
+//     fn test_canonical_minis_1(){
+//         let input ="GGGTAACTTTTCA";
+//         let window_size=12;
+//         let k =6;
+//         let actual_minimizers=get_canonical_kmer_minimizers(input,k,window_size);
+//         println!("Generated Minimizers: {:?}", actual_minimizers);
+//         let expected_minimizers = vec![
+//             Minimizer { sequence: "AAAAGT".to_string(), position: 5 },
+//         ];
+//         assert_eq!(actual_minimizers, expected_minimizers);
+//     }
+//     /*#[test]
+//     fn test_kmer_syncmers(){
+//         let input ="CATTCAGGAATC";
+//         let k=5;
+//         let s=2;
+//         let t=2;
+//         let retreived_syncmers=get_kmer_syncmers(input,k,s,t);
+//         let expected_syncmers=vec![
+//             Minimizer { sequence: "TCAGG".to_string(), position: 3 },
+//             Minimizer { sequence: "GGAAT".to_string(),position: 6},

+//         ];
+//         assert_eq!(retreived_syncmers, expected_syncmers);
+//     }*/
+//     #[test]
+//     fn test_average_0(){
+//         let mut input=vec![];
+//         input.push(0.5);
+//         input.push(0.75);
+//         input.push(0.25);
+//         let average_res=average(&*input);
+//         assert_eq!(average_res,0.5);
+//     }
+//     #[test]
+//     fn test_average_1(){
+//         let mut input=vec![];
+//         input.push(1.0);
+//         input.push(2.0);
+//         input.push(3.0);
+//         let average_res=average(&*input);
+//         assert_eq!(average_res,2.0);
+//     }*/
+// }
diff --git a/src/write_output.rs b/src/write_output.rs
index c3438d2..1a2aa13 100644
--- a/src/write_output.rs
+++ b/src/write_output.rs
@@ -1,74 +1,95 @@
-use std::path::PathBuf;
-use std::fs::File;
-use std::io::{Write, BufWriter};
-use std::path::Path;
-use std::fs;
 use crate::structs::FastqRecord_isoncl_init;
-use rustc_hash::FxHashMap;
-use crate::{Cluster_ID_Map, file_actions};
+use crate::{file_actions, Cluster_ID_Map};
+use log::debug;
 use rayon::prelude::*;
+use rustc_hash::FxHashMap;
+use std::fs;
+use std::fs::File;
+use std::io::{BufWriter, Write};
+use std::path::Path;
+use std::path::PathBuf;
-
-pub(crate) fn write_ordered_fastq(score_vec: &[(i32,usize)], outfolder: &String,id_map: &FxHashMap<i32, String>,fastq: &str){
+pub(crate) fn write_ordered_fastq(
+    score_vec: &[(i32, usize)],
+    outfolder: &String,
+    id_map: &FxHashMap<i32, String>,
+    fastq: &str,
+) {
     //writes the fastq file
     let _ = fs::create_dir_all(PathBuf::from(outfolder).join("clustering"));
     let fastq_file = File::open(fastq).unwrap();
-    let mut fastq_records= FxHashMap::default();
-    file_actions::parse_fastq_hashmap(fastq_file,&mut fastq_records);
-    let f = File::create(outfolder.to_owned()+"/clustering/sorted.fastq").expect("Unable to create file");
+    let mut fastq_records = FxHashMap::default();
+    file_actions::parse_fastq_hashmap(fastq_file, &mut fastq_records);
+    let f = File::create(outfolder.to_owned() + "/clustering/sorted.fastq")
+        .expect("Unable to create file");
     let mut buf_write = BufWriter::new(&f);
     //for record in fastq_records {
-    for score_tup in score_vec.iter(){
+    for score_tup in score_vec.iter() {
         let this_header = id_map.get(&score_tup.0).unwrap();
-        let record= fastq_records.get(this_header).unwrap();
-        if &record.header == this_header{
-            write!(buf_write, "@{}\n{}\n+\n{}\n", record.get_header(), record.get_sequence(),record.get_quality()).expect("Could not write file");
+        let record = fastq_records.get(this_header).unwrap();
+        if &record.header == this_header {
+            write!(
+                buf_write,
+                "@{}\n{}\n+\n{}\n",
+                record.get_header(),
+                record.get_sequence(),
+                record.get_quality()
+            )
+            .expect("Could not write file");
         }
     }
     buf_write.flush().expect("Failed to flush the buffer");
 }
-
-fn write_final_clusters_tsv(outfolder: &Path, clusters: Cluster_ID_Map, id_map:FxHashMap<i32, String>, header_cluster_map:&mut FxHashMap<String, i32>){
+fn write_final_clusters_tsv(
+    outfolder: &Path,
+    clusters: Cluster_ID_Map,
+    id_map: FxHashMap<i32, String>,
+    header_cluster_map: &mut FxHashMap<String, i32>,
+) {
     let file_path = PathBuf::from(outfolder).join("final_clusters.tsv");
     let f = File::create(file_path).expect("unable to create file");
     let mut buf_write = BufWriter::new(&f);
-    let mut nr_reads= 0;
-    println!("{} different clusters identified",clusters.len());
+    let mut nr_reads = 0;
+    println!("{} different clusters identified", clusters.len());
     //let nr_clusters=clusters.len();
-    for (cl_id, r_int_ids) in clusters.into_iter(){
-        //println!("cl_id {}, nr_reads {:?}",cl_id,nr_reads);
-        for r_int_id in r_int_ids{
+    for (cl_id, r_int_ids) in clusters.into_iter() {
+        debug!("cl_id {}, nr_reads {:?}", cl_id, nr_reads);
+        for r_int_id in r_int_ids {
             let read_id = id_map.get(&r_int_id).unwrap();
             nr_reads += 1;
-            let _ = writeln!(buf_write ,"{}\t{}", cl_id, read_id);
-            header_cluster_map.insert(read_id.to_string(),cl_id);
+            let _ = writeln!(buf_write, "{}\t{}", cl_id, read_id);
+            header_cluster_map.insert(read_id.to_string(), cl_id);
         }
     }
     // Flush the buffer to ensure all data is written to the underlying file
     buf_write.flush().expect("Failed to flush the buffer");
-    //println!("{} different clusters identified",nr_clusters);
-    //println!("HCLM {:?}",header_cluster_map);
+    // debug!("{} different clusters identified", nr_clusters);
+    debug!("HCLM {:?}", header_cluster_map);
     println!("{} reads added to tsv file", nr_reads);
 }
-
 //TODO: this is the current RAM bottleneck: we read the whole file to then have the reads when we write the output
 //Outline: sort the fastq file by cluster and then write the entries from the sorted fastq file to not having to read the full file
-fn create_final_ds(header_cluster_map: FxHashMap<String, i32>, fastq: String, cluster_map: &mut FxHashMap<i32, Vec<FastqRecord_isoncl_init>>){
+fn create_final_ds(
+    header_cluster_map: FxHashMap<String, i32>,
+    fastq: String,
+    cluster_map: &mut FxHashMap<i32, Vec<FastqRecord_isoncl_init>>,
+) {
     let fastq_file = File::open(fastq).unwrap();
-    let mut fastq_vec= vec![];
+    let mut fastq_vec = vec![];
     //parse the fastq file to store the data in fastq_vec
-    file_actions::parse_fastq(fastq_file,&mut fastq_vec);
+    file_actions::parse_fastq(fastq_file, &mut fastq_vec);
     //iterate over fastq_vec and add the reads to cluster_map
-    for read in fastq_vec{
+    for read in fastq_vec {
         let id = read.header.clone();
         if header_cluster_map.contains_key(&id) {
             let cluster_id = header_cluster_map.get(&id).unwrap();
             if cluster_map.contains_key(cluster_id) {
-                let mut id_vec: &mut Vec<FastqRecord_isoncl_init> = cluster_map.get_mut(cluster_id).unwrap();
+                let mut id_vec: &mut Vec<FastqRecord_isoncl_init> =
+                    cluster_map.get_mut(cluster_id).unwrap();
                 id_vec.push(read)
             } else {
                 let id_vec = vec![read];
@@ -78,66 +99,80 @@ fn create_final_ds(header_cluster_map: FxHashMap, fastq: String, clu
         }
     }
 }
-
-
-fn write_fastq_files(outfolder: &Path, cluster_map: FxHashMap<i32, Vec<FastqRecord_isoncl_init>>, n: usize){
+fn write_fastq_files(
+    outfolder: &Path,
+    cluster_map: FxHashMap<i32, Vec<FastqRecord_isoncl_init>>,
+    n: usize,
+) {
     let mut new_cl_id = 0;
-    let mut read_cter= 0;
+    let mut read_cter = 0;
     //fs::create_dir_all(PathBuf::from(outfolder).join("fastq_files"));
-    let fastq_outfolder= PathBuf::from(outfolder);
+    let fastq_outfolder = PathBuf::from(outfolder);
     //Writes the fastq files using the data structure cluster_map HashMap<i32, Vec<FastqRecord_isoncl_init>>
-    for (cl_id, records) in cluster_map.into_iter(){
-        if records.len() >= n { //only write the records if we have n or more reads supporting the cluster
-            let filename = new_cl_id.to_string()+".fastq";
+    for (cl_id, records) in cluster_map.into_iter() {
+        if records.len() >= n {
+            //only write the records if we have n or more reads supporting the cluster
+            let filename = new_cl_id.to_string() + ".fastq";
             let file_path = fastq_outfolder.join(filename);
             let f = File::create(file_path).expect("unable to create file");
             let mut buf_write = BufWriter::new(&f);
-            for record in records{
-                write!(buf_write ,"@{}\n{}\n+\n{}\n", record.header, record.sequence,record.quality).expect("We should be able to write the entries");
+            for record in records {
+                write!(
+                    buf_write,
+                    "@{}\n{}\n+\n{}\n",
+                    record.header, record.sequence, record.quality
+                )
+                .expect("We should be able to write the entries");
                 read_cter += 1;
             }
             buf_write.flush().expect("Failed to flush the buffer");
             new_cl_id += 1; //this is the new cl_id as we skip some on the way
         }
-
-        //println!("cl id for writing: {}, {}",cl_id,read_cter);
+        debug!("cl id for writing: {}, {}", cl_id, read_cter);
     }
-    println!("{} reads written",read_cter);
+    println!("{} reads written", read_cter);
 }
-
-
 pub fn path_exists(path: &str) -> bool {
     fs::metadata(path).is_ok()
 }
-
-
-pub(crate) fn write_output(outfolder: String, clusters: &Cluster_ID_Map, fastq: String, id_map: FxHashMap<i32, String>, n: usize, no_fastq: bool){
-
-    if !path_exists(&outfolder){
+pub(crate) fn write_output(
+    outfolder: String,
+    clusters: &Cluster_ID_Map,
+    fastq: String,
+    id_map: FxHashMap<i32, String>,
+    n: usize,
+    no_fastq: bool,
+) {
+    if !path_exists(&outfolder) {
         fs::create_dir(outfolder.clone()).expect("We should be able to create the directory");
     }
     //clustering_path: the outfolder of isONclust3
-    let clustering_path= Path::new(&outfolder).join("clustering");
-    if !clustering_path.exists(){
+    let clustering_path = Path::new(&outfolder).join("clustering");
+    if !clustering_path.exists() {
         let _ = fs::create_dir(clustering_path.clone());
     }
     //the subfolder of clustering in which we write the fastq_files ( as done in isONclust1)
-    let fastq_path= clustering_path.join("fastq_files");
-    if !fastq_path.exists(){
+    let fastq_path = clustering_path.join("fastq_files");
+    if !fastq_path.exists() {
         let _ = fs::create_dir(fastq_path.clone());
     }
-    let mut cluster_hashmap_fastq_record= FxHashMap::default();
-    let mut header_cluster_map= FxHashMap::default();
-    write_final_clusters_tsv(&clustering_path, clusters.clone(), id_map.clone(), &mut header_cluster_map);
+    let mut cluster_hashmap_fastq_record = FxHashMap::default();
+    let mut header_cluster_map = FxHashMap::default();
+    write_final_clusters_tsv(
+        &clustering_path,
+        clusters.clone(),
+        id_map.clone(),
+        &mut header_cluster_map,
+    );
     //no_fastq: true -> we do not want to write the fastq files
-    if !no_fastq{
+    if !no_fastq {
         //create a data structure that we use to generate the proper fastq files
-        create_final_ds(header_cluster_map, fastq,&mut cluster_hashmap_fastq_record);
-        //println!("Cluster_hashmap: {}",cluster_hashmap_fastq_record.len());
+        create_final_ds(header_cluster_map, fastq, &mut cluster_hashmap_fastq_record);
+        debug!("Cluster_hashmap: {}", cluster_hashmap_fastq_record.len());
         println!("Writing the fastq files");
         write_fastq_files(&fastq_path, cluster_hashmap_fastq_record, n);
     }
-}
\ No newline at end of file
+}

From 0bc07b36b5091c85eb10640d5095e40c405110a2 Mon Sep 17 00:00:00 2001
From: Oscar Aspelin
Date: Sun, 3 Aug 2025 15:44:09 +0200
Subject: [PATCH 6/6] Replace remaining println! macros with info!/debug! calls that were missed in the first pass.
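
With this change, all user-facing progress messages go through info! and
verbose-only diagnostics go through debug!, so output verbosity is controlled
entirely by the logger level configured in main.rs instead of by scattered
print statements. Below is a minimal sketch of the intended behaviour,
assuming the log and simple_logger crates added earlier in this series
(init_logging is a hypothetical helper used only for illustration, not part
of this codebase):

    use log::{debug, info, LevelFilter};
    use simple_logger::SimpleLogger;

    // Hypothetical helper: mirrors the level selection done in main.rs.
    fn init_logging(verbose: bool) {
        let level = if verbose {
            LevelFilter::Debug
        } else {
            LevelFilter::Info
        };
        SimpleLogger::new()
            .with_level(level)
            .init()
            .expect("Failed to initialize logger.");
    }

    fn main() {
        init_logging(false);
        info!("always shown");               // progress message
        debug!("shown only with --verbose"); // diagnostic trace, filtered out here
    }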
---
 src/clustering.rs                        | 33 +++++++++++++-----------
 src/generate_sorted_fastq_for_cluster.rs | 18 ++++++-------
 src/gff_handling.rs                      | 23 +++++++++--------
 src/write_output.rs                      |  9 ++++---
 4 files changed, 44 insertions(+), 39 deletions(-)

diff --git a/src/clustering.rs b/src/clustering.rs
index 4cda6f3..171ead7 100644
--- a/src/clustering.rs
+++ b/src/clustering.rs
@@ -95,7 +95,10 @@ pub(crate) fn cluster(
         else {
             let mut most_shared_cluster = -1;
             let mut shared = false;
-            // println!("shared_seed_infos_norm_vec BEFORE: {:?}", shared_seed_infos_norm_vec);
+            debug!(
+                "shared_seed_infos_norm_vec BEFORE: {:?}",
+                shared_seed_infos_norm_vec
+            );
             for minimizer in minimizers {
                 //TODO: test whether into_par_iter works here
@@ -156,7 +159,7 @@ pub(crate) fn cluster(
                 .or_default();
             let vect = cluster_map.get_mut(&sign_mini.sequence).unwrap();
             if !vect.contains(&cl_id) {
-                // println!(" cl_id: {} &cl_id {}", cl_id, &cl_id );
+                debug!(" cl_id: {} &cl_id {}", cl_id, &cl_id);
                 vect.push(cl_id);
             }
         }
@@ -255,8 +258,8 @@ fn detect_overlaps(
         //We only merge if we share more than min_shared_minis
         if shared_perc > min_shared_minis {
             if verbose {
-                println!("CL_ID {}, msc {}", cl_id, most_shared_cluster_id);
-                println!(
+                debug!("CL_ID {}, msc {}", cl_id, most_shared_cluster_id);
+                debug!(
                     "nr_minis {}, max_shared {}, shared_perc {}",
                     nr_minis, max_shared, shared_perc
                 );
@@ -326,11 +329,11 @@ pub(crate) fn cluster_merging(
     verbose: bool,
 ) {
     //let min_shared_minis_pc = 0.5;
-    println!("min_shared_minis_pc: {}", min_shared_minis);
+    debug!("min_shared_minis_pc: {}", min_shared_minis);
     //cl_set_map is a hashmap with cl_id -> Hashset of seed hashes
     let mut cl_set_map: FxHashMap<i32, FxHashSet<u64>> = FxHashMap::default();
     if verbose {
-        println!("Cl_set_map {:?}", cl_set_map.len());
+        debug!("Cl_set_map {:?}", cl_set_map.len());
     }
     //merge_into is a vector of a tuple(cl_id1,cl_id2)
     let mut merge_into: Vec<(i32, i32)> = vec![];
@@ -343,7 +346,7 @@ pub(crate) fn cluster_merging(
     while !merge_into.is_empty() || first_iter {
         //clear merge_into as this is the indicator how often we attempt to merge further (the while loop depends on it)
         merge_into.clear();
-        println!("MI {:?}", merge_into);
+        debug!("MI {:?}", merge_into);
         small_hs.clear();
         //set first_iter to be false to not stay in a infinity loop
         first_iter = false;
@@ -351,8 +354,8 @@ pub(crate) fn cluster_merging(
         //generate the data structure giving us merge infos
         let now_pc1 = Instant::now();
         generate_cluster_merging_ds(&mut cl_set_map, cluster_map);
-        println!("{} s for creating the pc ds", now_pc1.elapsed().as_secs());
-        println!("Post_clustering_ds generated");
+        debug!("{} s for creating the pc ds", now_pc1.elapsed().as_secs());
+        debug!("Post_clustering_ds generated");
         let now_pc2 = Instant::now();
         detect_overlaps(
             &mut cl_set_map,
@@ -363,12 +366,12 @@ pub(crate) fn cluster_merging(
             shared_seed_infos_vec,
             verbose,
         );
-        println!(
+        debug!(
             "{} s for detection of overlaps",
             now_pc2.elapsed().as_secs()
         );
         if verbose {
-            println!("Merge_into {:?}", merge_into);
+            debug!("Merge_into {:?}", merge_into);
         }
         let now_pc3 = Instant::now();
         merge_clusters_from_merge_into(
@@ -379,15 +382,15 @@ pub(crate) fn cluster_merging(
             &small_hs,
             &mut not_large,
         );
-        println!("{} s for merging of clusters", now_pc3.elapsed().as_secs());
+        debug!("{} s for merging of clusters", now_pc3.elapsed().as_secs());
         let now_pc4 = Instant::now();
         merge_into.retain(|&(_, second)| !not_large.contains(&second));
-        println!("{} s for retaining merge_into", now_pc4.elapsed().as_secs());
-        println!("{} s since create ds", now_pc2.elapsed().as_secs());
ds", now_pc2.elapsed().as_secs()); + debug!("{} s for retaining merge_into", now_pc4.elapsed().as_secs()); + debug!("{} s since create ds", now_pc2.elapsed().as_secs()); cl_set_map.clear(); } - println!("min_shared_minis_pc: {}", min_shared_minis); + debug!("min_shared_minis_pc: {}", min_shared_minis); } pub(crate) fn generate_initial_cluster_map( diff --git a/src/generate_sorted_fastq_for_cluster.rs b/src/generate_sorted_fastq_for_cluster.rs index 1ea4985..19f89b0 100644 --- a/src/generate_sorted_fastq_for_cluster.rs +++ b/src/generate_sorted_fastq_for_cluster.rs @@ -9,7 +9,7 @@ use std::path::Path; use std::time::Instant; //use crate::bio_rust_file_read; use bio::io::fastq; -use log::debug; +use log::{debug, info}; use minimizer_iter::MinimizerBuilder; use rustc_hash::FxHashMap; use std::fs; @@ -197,7 +197,7 @@ fn analyse_fastq_and_sort( ); } } else { - println!("No seeding"); + debug!("No seeding"); } seeding_and_filtering_seeds::filter_seeds_by_quality( &this_minimizers, @@ -222,11 +222,11 @@ fn analyse_fastq_and_sort( if verbose { let print_vec = &score_vec[0..5]; for score_tup in print_vec { - println!("ID {} count {}", &score_tup.0, score_tup.1); + debug!("ID {} count {}", &score_tup.0, score_tup.1); } } - println!("{} reads accepted", score_vec.len()); + info!("{} reads accepted", score_vec.len()); debug!("{:?}", score_vec.pop()); } @@ -236,8 +236,8 @@ fn print_statistics(fastq_records: &[FastqRecord_isoncl_init]) { */ let min_e = fastq_records[0].get_err_rate(); let max_e = fastq_records[fastq_records.len() - 1].get_err_rate(); - println!("Lowest read error rate: {}", min_e); - println!("Highest read error rate: {}", max_e); + info!("Lowest read error rate: {}", min_e); + info!("Highest read error rate: {}", max_e); //logfile.write("Median read error rate:{0}\n".format(median_e)) //logfile.write("Mean read error rate:{0}\n".format(mean_e)) } @@ -255,7 +255,7 @@ pub(crate) fn sort_fastq_for_cluster( noncanonical_bool: bool, verbose: bool, ) { - println!("Sorting the fastq_file"); + info!("Sorting the fastq_file"); let now = Instant::now(); //holds the internal ids and scores as tuples to be able to sort properly let mut score_vec = vec![]; @@ -277,12 +277,12 @@ pub(crate) fn sort_fastq_for_cluster( verbose, ); let elapsed = now.elapsed(); - println!("Elapsed: {:.2?}", elapsed); + info!("Elapsed: {:.2?}", elapsed); if !path_exists(outfolder) { fs::create_dir(outfolder.clone()).expect("We should be able to create the directory"); } //write a fastq-file that contains the reordered reads write_output::write_ordered_fastq(&score_vec, outfolder, &id_map, in_file_path); - println!("Wrote the sorted fastq file"); + info!("Wrote the sorted fastq file"); //print_statistics(fastq_records.borrow()); } diff --git a/src/gff_handling.rs b/src/gff_handling.rs index dfdfc95..b0e213d 100644 --- a/src/gff_handling.rs +++ b/src/gff_handling.rs @@ -6,6 +6,7 @@ use bio::io::gff; use rustc_hash::{FxHashMap, FxHashSet}; use bio::io::gff::GffType::GFF3; +use log::info; use rayon::prelude::*; use bio::io::fasta; @@ -48,7 +49,7 @@ fn parse_fasta_and_gen_clusters( k: usize, w: usize, ) { - println!("parse_fasta"); + info!("parse_fasta"); let path = fasta_path.unwrap(); let mut reader = fasta::Reader::from_file(Path::new(path)).expect("We expect the file to exist"); @@ -62,7 +63,7 @@ fn parse_fasta_and_gen_clusters( let mut overlap_ctr = 0; //in the next lines we make sure that we have a proper header and store it as string let id = seq_rec.id().to_string().split(' ').collect::>()[0].to_string(); - 
println!("Now to the coords_ds"); + info!("Now to the coords_ds"); if let Some(gene_map) = coords.get(id.as_str()) { //iterate over the genes in the gene_map for (gene_id, exon_coords) in gene_map { @@ -93,7 +94,7 @@ fn parse_fasta_and_gen_clusters( internal_id += 1; } } - println!("{} overlaps between genes (their exons) ", overlap_ctr); + info!("{} overlaps between genes (their exons) ", overlap_ctr); }); } @@ -164,27 +165,27 @@ pub(crate) fn resolve_gff( k: usize, w: usize, ) { - println!("Resolving GFF file "); + info!("Resolving GFF file "); let now1 = Instant::now(); let mut coords = FxHashMap::default(); //: HashMap, BuildHasherDefault>, BuildHasherDefault> = FxHashMap::default(); parse_gtf_and_collect_coords(gff_path, &mut coords); - println!( + info!( "{} s used for parsing the gff file", now1.elapsed().as_secs() ); - println!("First step done"); + info!("First step done"); let now2 = Instant::now(); parse_fasta_and_gen_clusters(fasta_path, coords, clusters, cluster_map, k, w); - println!( + info!( "Generated {} initial clusters from the reference", clusters.len() ); - println!( + info!( "{} s used for parsing the fasta file", now2.elapsed().as_secs() ); - println!("{} s for full GFF resolution", now1.elapsed().as_secs()); - println!("GTF resolved"); + info!("{} s for full GFF resolution", now1.elapsed().as_secs()); + info!("GTF resolved"); } pub(crate) fn gff_based_clustering( @@ -282,7 +283,7 @@ pub(crate) fn gff_based_clustering( let id_vec = vec![]; clusters.insert(gene_id, id_vec); } else { - println!( + info!( "found {} genes in {}", gene_id - previous_genes, scaffold_id diff --git a/src/write_output.rs b/src/write_output.rs index 1a2aa13..2e941cf 100644 --- a/src/write_output.rs +++ b/src/write_output.rs @@ -1,6 +1,7 @@ use crate::structs::FastqRecord_isoncl_init; use crate::{file_actions, Cluster_ID_Map}; use log::debug; +use log::info; use rayon::prelude::*; use rustc_hash::FxHashMap; use std::fs; @@ -53,7 +54,7 @@ fn write_final_clusters_tsv( let f = File::create(file_path).expect("unable to create file"); let mut buf_write = BufWriter::new(&f); let mut nr_reads = 0; - println!("{} different clusters identified", clusters.len()); + info!("{} different clusters identified", clusters.len()); //let nr_clusters=clusters.len(); for (cl_id, r_int_ids) in clusters.into_iter() { debug!("cl_id {}, nr_reads {:?}", cl_id, nr_reads); @@ -68,7 +69,7 @@ fn write_final_clusters_tsv( buf_write.flush().expect("Failed to flush the buffer"); // debug!("{} different clusters identified", nr_clusters); debug!("HCLM {:?}", header_cluster_map); - println!("{} reads added to tsv file", nr_reads); + info!("{} reads added to tsv file", nr_reads); } //TODO: this is the current RAM bottleneck: we read the whole file to then have the reads when we write the output @@ -131,7 +132,7 @@ fn write_fastq_files( debug!("cl id for writing: {}, {}", cl_id, read_cter); } - println!("{} reads written", read_cter); + info!("{} reads written", read_cter); } pub fn path_exists(path: &str) -> bool { @@ -172,7 +173,7 @@ pub(crate) fn write_output( //create a data structure that we use to generate the proper fastq files create_final_ds(header_cluster_map, fastq, &mut cluster_hashmap_fastq_record); debug!("Cluster_hashmap: {}", cluster_hashmap_fastq_record.len()); - println!("Writing the fastq files"); + info!("Writing the fastq files"); write_fastq_files(&fastq_path, cluster_hashmap_fastq_record, n); } }