From c8c8202ea228c83c9b4b01e7bf61f14f2c671b36 Mon Sep 17 00:00:00 2001 From: stijndcl Date: Sat, 13 Jan 2024 00:16:58 +0100 Subject: [PATCH 1/4] Re-write WriteToChunk.JS in Rust --- .../src/bin/write-to-chunk.rs | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs new file mode 100644 index 00000000..7703ee03 --- /dev/null +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs @@ -0,0 +1,80 @@ +use std::fs::File; +use std::io::{BufRead, BufWriter, Write}; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; +use clap::Parser; + +use unipept_database::utils::files::open_sin; + +fn main() -> Result<()> { + let args = Cli::parse(); + + let mut file_streams: Vec> = Vec::with_capacity(TAXA_BOUNDS.len()); + + // Create writers for the output files + for (idx, bound) in TAXA_BOUNDS.iter().take(TAXA_BOUNDS.len() - 1).enumerate() { + let next = TAXA_BOUNDS[idx + 1]; + let file_name = format!("unipept.{bound}-{next}.chunk"); + let file_path = Path::new(&args.output_dir).join(file_name); + let file_handler = File::create(file_path).with_context(|| format!("Unable to create output file {bound}-{next}"))?; + let writer = BufWriter::new(file_handler); + file_streams.push(writer); + } + + let mut reader = open_sin(); + + // First read the header + let mut header: String = String::new(); + reader.read_line(&mut header).context("Error reading header from stdin")?; + write_header(&args.output_dir, header)?; + + // Then the rest of the data + for line in reader.lines() { + let line = line.context("Error reading line from stdin")?; + + if args.verbose { + eprintln!("INFO VERBOSE: writing line to chunk: {line}"); + } + + let spl: Vec<&str> = line.split('\t').collect(); + let taxon_id = spl[8].trim().parse::().with_context(|| format!("Error parsing {} as an integer", spl[8]))?; + + // Find the index of this taxon id in the array + // Note that this can be sped up using binary search (see Python's bisect.bisect_left), + // but this tool is near-instant so we favour readability + let mut index: usize = 0; + while taxon_id > TAXA_BOUNDS[index] { + index += 1; + } + + writeln!(&mut file_streams[index], "{line}").context("Error writing to output file")?; + } + + Ok(()) +} + +#[derive(Parser, Debug)] +struct Cli { + #[clap(short, long)] + output_dir: PathBuf, + #[clap(short, long)] + verbose: bool, +} + +const TAXA_BOUNDS: [usize; 45] = [ + 0, 550, 1352, 3047, 5580, 8663, 11676, 32473, 40214, 52774, 66656, 86630, 116960, 162147, 210225, 267979, 334819, + 408172, 470868, 570509, 673318, 881260, 1046115, 1136135, 1227077, 1300307, 1410620, 1519492, 1650438, 1756149, + 1820614, 1871070, 1898104, 1922217, 1978231, 2024617, 2026757, 2035430, 2070414, 2202732, 2382165, 2527964, 2601669, + 2706029, 10000000 +]; + +fn write_header(output_dir: &PathBuf, header: String) -> Result<()> { + let file_path = Path::new(output_dir).join("db.header"); + let file_handler = File::create(file_path).with_context(|| format!("Unable to create header output file"))?; + let mut writer = BufWriter::new(file_handler); + + writeln!(&mut writer, "{}", header).context("Error writing header")?; + + Ok(()) +} From 20fb5c1277a233fadf8a6233572b56f5b3117148 Mon Sep 17 00:00:00 2001 From: stijndcl Date: Sat, 13 Jan 2024 15:34:21 +0100 Subject: [PATCH 2/4] Fix bug --- .../unipept-database-rs/src/bin/taxa-by-chunk.rs | 0 .../unipept-database-rs/src/bin/write-to-chunk.rs | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) create mode 100644 scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs new file mode 100644 index 00000000..e69de29b diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs index 7703ee03..a796ece8 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs @@ -48,7 +48,7 @@ fn main() -> Result<()> { index += 1; } - writeln!(&mut file_streams[index], "{line}").context("Error writing to output file")?; + writeln!(&mut file_streams[index - 1], "{line}").context("Error writing to output file")?; } Ok(()) @@ -74,7 +74,7 @@ fn write_header(output_dir: &PathBuf, header: String) -> Result<()> { let file_handler = File::create(file_path).with_context(|| format!("Unable to create header output file"))?; let mut writer = BufWriter::new(file_handler); - writeln!(&mut writer, "{}", header).context("Error writing header")?; + write!(&mut writer, "{}", header).context("Error writing header")?; Ok(()) } From c39ef6aa4c41a285050c5c079904fbd15c61875c Mon Sep 17 00:00:00 2001 From: stijndcl Date: Sat, 13 Jan 2024 16:15:54 +0100 Subject: [PATCH 3/4] Fix bugs in taxa by chunk --- .../src/bin/taxa-by-chunk.rs | 87 +++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs index e69de29b..537d9701 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs @@ -0,0 +1,87 @@ +use std::fs::{File, read_dir}; +use std::io::{BufRead, BufWriter, Write}; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; +use clap::Parser; +use regex::Regex; + +use unipept_database::utils::files::open_sin; + +fn main() -> Result<()> { + let args = Cli::parse(); + + let mut all_taxa: Vec = Vec::new(); + + let reader = open_sin(); + + // Read all taxa ids from stdin + for line in reader.lines() { + let line = line.context("Error reading line from stdin")?; + + // Ignore empty lines + if line.trim().is_empty() { + continue; + } + + let taxa_id: u64 = line.trim().parse().with_context(|| format!("Error parsing {line} as an integer"))?; + all_taxa.push(taxa_id); + } + + let chunk_file_regex = Regex::new(r"unipept\..*\.gz").context("Error creating regex")?; + + for entry in read_dir(&args.chunk_dir).context("Error reading chunk directory")? { + let entry = entry.context("Error reading entry from chunk directory")?; + let path = entry.path(); + if !path.is_file() { + continue; + } + + let base_name = match path.file_name() { + None => {continue;} + Some(n) => n.to_str().context("Error creating string from file path")? + }; + + if !chunk_file_regex.is_match(base_name) { + continue; + } + + // Parse the taxa range out of the filename + let replaced_name = base_name.replace("unipept.", "").replace(".chunk.gz", ""); + let range = replaced_name.split_once("-"); + let range = range.with_context(|| format!("Unable to split {replaced_name} on '-'"))?; + let start: u64 = range.0.parse().with_context(|| format!("Error parsing {} as an integer", range.0))?; + let end: u64 = range.1.parse().with_context(|| format!("Error parsing {} as an integer", range.1))?; + + let matching_taxa: Vec<&u64> = all_taxa.iter().filter(|&t| start <= *t && *t <= end).collect(); + + // Write matches to a temporary output file + if !matching_taxa.is_empty() { + let mapped_taxa: Vec = matching_taxa.iter().map(|&t| format!("\t{t}$")).collect(); + let joined_taxa = mapped_taxa.join("\n"); + + let temp_file_path = Path::new(&args.temp_dir).join(format!("{base_name}.pattern")); + let temp_file = File::create(&temp_file_path).context("Error creating temporary pattern file")?; + let mut writer = BufWriter::new(temp_file); + write!( + &mut writer, + "{joined_taxa}", + ).context("Error writing to temporary pattern file")?; + + // The two unwraps here can't be handled using the ? operator + println!("{}", temp_file_path.into_os_string().into_string().unwrap()); + println!("{}", path.into_os_string().into_string().unwrap()); + } + } + + Ok(()) +} + +#[derive(Parser, Debug)] +struct Cli { + #[clap(long)] + chunk_dir: PathBuf, + + #[clap(long)] + temp_dir: PathBuf +} \ No newline at end of file From fa9f0aa2a883bc021c9cdc19e0e38b38c4fdde10 Mon Sep 17 00:00:00 2001 From: stijndcl Date: Sat, 13 Jan 2024 16:29:50 +0100 Subject: [PATCH 4/4] Fix linting and formatting --- .../src/bin/taxa-by-chunk.rs | 48 ++++++++++++------- .../src/bin/write-to-chunk.rs | 22 +++++---- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs index 537d9701..64a20225 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs @@ -1,4 +1,4 @@ -use std::fs::{File, read_dir}; +use std::fs::{read_dir, File}; use std::io::{BufRead, BufWriter, Write}; use std::path::{Path, PathBuf}; @@ -24,7 +24,10 @@ fn main() -> Result<()> { continue; } - let taxa_id: u64 = line.trim().parse().with_context(|| format!("Error parsing {line} as an integer"))?; + let taxa_id: u64 = line + .trim() + .parse() + .with_context(|| format!("Error parsing {line} as an integer"))?; all_taxa.push(taxa_id); } @@ -38,8 +41,10 @@ fn main() -> Result<()> { } let base_name = match path.file_name() { - None => {continue;} - Some(n) => n.to_str().context("Error creating string from file path")? + None => { + continue; + } + Some(n) => n.to_str().context("Error creating string from file path")?, }; if !chunk_file_regex.is_match(base_name) { @@ -48,25 +53,34 @@ fn main() -> Result<()> { // Parse the taxa range out of the filename let replaced_name = base_name.replace("unipept.", "").replace(".chunk.gz", ""); - let range = replaced_name.split_once("-"); + let range = replaced_name.split_once('-'); let range = range.with_context(|| format!("Unable to split {replaced_name} on '-'"))?; - let start: u64 = range.0.parse().with_context(|| format!("Error parsing {} as an integer", range.0))?; - let end: u64 = range.1.parse().with_context(|| format!("Error parsing {} as an integer", range.1))?; - - let matching_taxa: Vec<&u64> = all_taxa.iter().filter(|&t| start <= *t && *t <= end).collect(); + let start: u64 = range + .0 + .parse() + .with_context(|| format!("Error parsing {} as an integer", range.0))?; + let end: u64 = range + .1 + .parse() + .with_context(|| format!("Error parsing {} as an integer", range.1))?; + + let matching_taxa: Vec<&u64> = all_taxa + .iter() + .filter(|&t| start <= *t && *t <= end) + .collect(); // Write matches to a temporary output file if !matching_taxa.is_empty() { - let mapped_taxa: Vec = matching_taxa.iter().map(|&t| format!("\t{t}$")).collect(); + let mapped_taxa: Vec = + matching_taxa.iter().map(|&t| format!("\t{t}$")).collect(); let joined_taxa = mapped_taxa.join("\n"); let temp_file_path = Path::new(&args.temp_dir).join(format!("{base_name}.pattern")); - let temp_file = File::create(&temp_file_path).context("Error creating temporary pattern file")?; + let temp_file = + File::create(&temp_file_path).context("Error creating temporary pattern file")?; let mut writer = BufWriter::new(temp_file); - write!( - &mut writer, - "{joined_taxa}", - ).context("Error writing to temporary pattern file")?; + write!(&mut writer, "{joined_taxa}",) + .context("Error writing to temporary pattern file")?; // The two unwraps here can't be handled using the ? operator println!("{}", temp_file_path.into_os_string().into_string().unwrap()); @@ -83,5 +97,5 @@ struct Cli { chunk_dir: PathBuf, #[clap(long)] - temp_dir: PathBuf -} \ No newline at end of file + temp_dir: PathBuf, +} diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs index a796ece8..43bdaba2 100644 --- a/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs +++ b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs @@ -17,7 +17,8 @@ fn main() -> Result<()> { let next = TAXA_BOUNDS[idx + 1]; let file_name = format!("unipept.{bound}-{next}.chunk"); let file_path = Path::new(&args.output_dir).join(file_name); - let file_handler = File::create(file_path).with_context(|| format!("Unable to create output file {bound}-{next}"))?; + let file_handler = File::create(file_path) + .with_context(|| format!("Unable to create output file {bound}-{next}"))?; let writer = BufWriter::new(file_handler); file_streams.push(writer); } @@ -26,7 +27,9 @@ fn main() -> Result<()> { // First read the header let mut header: String = String::new(); - reader.read_line(&mut header).context("Error reading header from stdin")?; + reader + .read_line(&mut header) + .context("Error reading header from stdin")?; write_header(&args.output_dir, header)?; // Then the rest of the data @@ -38,7 +41,10 @@ fn main() -> Result<()> { } let spl: Vec<&str> = line.split('\t').collect(); - let taxon_id = spl[8].trim().parse::().with_context(|| format!("Error parsing {} as an integer", spl[8]))?; + let taxon_id = spl[8] + .trim() + .parse::() + .with_context(|| format!("Error parsing {} as an integer", spl[8]))?; // Find the index of this taxon id in the array // Note that this can be sped up using binary search (see Python's bisect.bisect_left), @@ -63,15 +69,15 @@ struct Cli { } const TAXA_BOUNDS: [usize; 45] = [ - 0, 550, 1352, 3047, 5580, 8663, 11676, 32473, 40214, 52774, 66656, 86630, 116960, 162147, 210225, 267979, 334819, - 408172, 470868, 570509, 673318, 881260, 1046115, 1136135, 1227077, 1300307, 1410620, 1519492, 1650438, 1756149, - 1820614, 1871070, 1898104, 1922217, 1978231, 2024617, 2026757, 2035430, 2070414, 2202732, 2382165, 2527964, 2601669, - 2706029, 10000000 + 0, 550, 1352, 3047, 5580, 8663, 11676, 32473, 40214, 52774, 66656, 86630, 116960, 162147, + 210225, 267979, 334819, 408172, 470868, 570509, 673318, 881260, 1046115, 1136135, 1227077, + 1300307, 1410620, 1519492, 1650438, 1756149, 1820614, 1871070, 1898104, 1922217, 1978231, + 2024617, 2026757, 2035430, 2070414, 2202732, 2382165, 2527964, 2601669, 2706029, 10000000, ]; fn write_header(output_dir: &PathBuf, header: String) -> Result<()> { let file_path = Path::new(output_dir).join("db.header"); - let file_handler = File::create(file_path).with_context(|| format!("Unable to create header output file"))?; + let file_handler = File::create(file_path).context("Unable to create header output file")?; let mut writer = BufWriter::new(file_handler); write!(&mut writer, "{}", header).context("Error writing header")?;