From c8c8202ea228c83c9b4b01e7bf61f14f2c671b36 Mon Sep 17 00:00:00 2001
From: stijndcl <declercq.stijn@outlook.com>
Date: Sat, 13 Jan 2024 00:16:58 +0100
Subject: [PATCH 1/4] Re-write WriteToChunk.JS in Rust

---
 .../src/bin/write-to-chunk.rs                 | 80 +++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs
diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs
new file mode 100644
index 00000000..7703ee03
--- /dev/null
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs
@@ -0,0 +1,80 @@
+use std::fs::File;
+use std::io::{BufRead, BufWriter, Write};
+use std::path::{Path, PathBuf};
+
+use anyhow::{Context, Result};
+use clap::Parser;
+
+use unipept_database::utils::files::open_sin;
+
+fn main() -> Result<()> {
+    let args = Cli::parse();
+
+    let mut file_streams: Vec<BufWriter<File>> = Vec::with_capacity(TAXA_BOUNDS.len());
+
+    // Create writers for the output files
+    for (idx, bound) in TAXA_BOUNDS.iter().take(TAXA_BOUNDS.len() - 1).enumerate() {
+        let next = TAXA_BOUNDS[idx + 1];
+        let file_name = format!("unipept.{bound}-{next}.chunk");
+        let file_path = Path::new(&args.output_dir).join(file_name);
+        let file_handler = File::create(file_path).with_context(|| format!("Unable to create output file {bound}-{next}"))?;
+        let writer = BufWriter::new(file_handler);
+        file_streams.push(writer);
+    }
+
+    let mut reader = open_sin();
+
+    // First read the header
+    let mut header: String = String::new();
+    reader.read_line(&mut header).context("Error reading header from stdin")?;
+    write_header(&args.output_dir, header)?;
+
+    // Then the rest of the data
+    for line in reader.lines() {
+        let line = line.context("Error reading line from stdin")?;
+
+        if args.verbose {
+            eprintln!("INFO VERBOSE: writing line to chunk: {line}");
+        }
+
+        let spl: Vec<&str> = line.split('\t').collect();
+        let taxon_id = spl[8].trim().parse::<usize>().with_context(|| format!("Error parsing {} as an integer", spl[8]))?;
+
+        // Find the index of this taxon id in the array
+        // Note that this can be sped up using binary search (see Python's bisect.bisect_left),
+        // but this tool is near-instant so we favour readability
+        let mut index: usize = 1; // TAXA_BOUNDS[0] only marks the lower edge of the first range
+        while taxon_id > *TAXA_BOUNDS.get(index).context("Taxon id out of range")? {
+            index += 1;
+        }
+
+        writeln!(&mut file_streams[index], "{line}").context("Error writing to output file")?;
+    }
+
+    Ok(())
+}
+
+#[derive(Parser, Debug)]
+struct Cli {
+    #[clap(short, long)]
+    output_dir: PathBuf,
+    #[clap(short, long)]
+    verbose: bool,
+}
+
+const TAXA_BOUNDS: [usize; 45] = [
+    0, 550, 1352, 3047, 5580, 8663, 11676, 32473, 40214, 52774, 66656, 86630, 116960, 162147, 210225, 267979, 334819,
+    408172, 470868, 570509, 673318, 881260, 1046115, 1136135, 1227077, 1300307, 1410620, 1519492, 1650438, 1756149,
+    1820614, 1871070, 1898104, 1922217, 1978231, 2024617, 2026757, 2035430, 2070414, 2202732, 2382165, 2527964, 2601669,
+    2706029, 10000000
+];
+
+fn write_header(output_dir: &PathBuf, header: String) -> Result<()> {
+    let file_path = Path::new(output_dir).join("db.header");
+    let file_handler = File::create(file_path).with_context(|| format!("Unable to create header output file"))?;
+    let mut writer = BufWriter::new(file_handler);
+
+    writeln!(&mut writer, "{}", header).context("Error writing header")?;
+
+    Ok(())
+}

From 20fb5c1277a233fadf8a6233572b56f5b3117148 Mon Sep 17 00:00:00 2001
From: stijndcl <declercq.stijn@outlook.com>
Date: Sat, 13 Jan 2024 15:34:21 +0100
Subject: [PATCH 2/4] Fix chunk index off-by-one and doubled header newline

---
 .../unipept-database-rs/src/bin/taxa-by-chunk.rs              | 0
 .../unipept-database-rs/src/bin/write-to-chunk.rs             | 4 ++--
 2 files changed, 2 insertions(+), 2 deletions(-)
 create mode 100644 scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs

diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs
index 7703ee03..a796ece8 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs
@@ -48,7 +48,7 @@ fn main() -> Result<()> {
             index += 1;
         }
 
-        writeln!(&mut file_streams[index], "{line}").context("Error writing to output file")?;
+        writeln!(&mut file_streams[index - 1], "{line}").context("Error writing to output file")?;
     }
 
     Ok(())
@@ -74,7 +74,7 @@ fn write_header(output_dir: &PathBuf, header: String) -> Result<()> {
     let file_handler = File::create(file_path).with_context(|| format!("Unable to create header output file"))?;
     let mut writer = BufWriter::new(file_handler);
 
-    writeln!(&mut writer, "{}", header).context("Error writing header")?;
+    write!(&mut writer, "{}", header).context("Error writing header")?;
 
     Ok(())
 }

From c39ef6aa4c41a285050c5c079904fbd15c61875c Mon Sep 17 00:00:00 2001
From: stijndcl <declercq.stijn@outlook.com>
Date: Sat, 13 Jan 2024 16:15:54 +0100
Subject: [PATCH 3/4] Implement taxa-by-chunk in Rust

---
 .../src/bin/taxa-by-chunk.rs                  | 87 +++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
index e69de29b..537d9701 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
@@ -0,0 +1,87 @@
+use std::fs::{File, read_dir};
+use std::io::{BufRead, BufWriter, Write};
+use std::path::{Path, PathBuf};
+
+use anyhow::{Context, Result};
+use clap::Parser;
+use regex::Regex;
+
+use unipept_database::utils::files::open_sin;
+
+fn main() -> Result<()> {
+    let args = Cli::parse();
+
+    let mut all_taxa: Vec<u64> = Vec::new();
+
+    let reader = open_sin();
+
+    // Read all taxa ids from stdin
+    for line in reader.lines() {
+        let line = line.context("Error reading line from stdin")?;
+
+        // Ignore empty lines
+        if line.trim().is_empty() {
+            continue;
+        }
+
+        let taxa_id: u64 = line.trim().parse().with_context(|| format!("Error parsing {line} as an integer"))?;
+        all_taxa.push(taxa_id);
+    }
+
+    let chunk_file_regex = Regex::new(r"unipept\..*\.gz").context("Error creating regex")?;
+
+    for entry in read_dir(&args.chunk_dir).context("Error reading chunk directory")? {
+        let entry = entry.context("Error reading entry from chunk directory")?;
+        let path = entry.path();
+        if !path.is_file() {
+            continue;
+        }
+
+        let base_name = match path.file_name() {
+            None => {continue;}
+            Some(n) => n.to_str().context("Error creating string from file path")?
+        };
+
+        if !chunk_file_regex.is_match(base_name) {
+            continue;
+        }
+
+        // Parse the taxa range out of the filename
+        let replaced_name = base_name.replace("unipept.", "").replace(".chunk.gz", "");
+        let range = replaced_name.split_once("-");
+        let range = range.with_context(|| format!("Unable to split {replaced_name} on '-'"))?;
+        let start: u64 = range.0.parse().with_context(|| format!("Error parsing {} as an integer", range.0))?;
+        let end: u64 = range.1.parse().with_context(|| format!("Error parsing {} as an integer", range.1))?;
+
+        let matching_taxa: Vec<&u64> = all_taxa.iter().filter(|&t| start <= *t && *t <= end).collect();
+
+        // Write matches to a temporary output file
+        if !matching_taxa.is_empty() {
+            let mapped_taxa: Vec<String> = matching_taxa.iter().map(|&t| format!("\t{t}$")).collect();
+            let joined_taxa = mapped_taxa.join("\n");
+
+            let temp_file_path = Path::new(&args.temp_dir).join(format!("{base_name}.pattern"));
+            let temp_file = File::create(&temp_file_path).context("Error creating temporary pattern file")?;
+            let mut writer = BufWriter::new(temp_file);
+            write!(
+                &mut writer,
+                "{joined_taxa}",
+            ).context("Error writing to temporary pattern file")?;
+
+            // The two unwraps here can't be handled using the ? operator
+            println!("{}", temp_file_path.into_os_string().into_string().unwrap());
+            println!("{}", path.into_os_string().into_string().unwrap());
+        }
+    }
+
+    Ok(())
+}
+
+#[derive(Parser, Debug)]
+struct Cli {
+    #[clap(long)]
+    chunk_dir: PathBuf,
+
+    #[clap(long)]
+    temp_dir: PathBuf
+}
\ No newline at end of file

From fa9f0aa2a883bc021c9cdc19e0e38b38c4fdde10 Mon Sep 17 00:00:00 2001
From: stijndcl <declercq.stijn@outlook.com>
Date: Sat, 13 Jan 2024 16:29:50 +0100
Subject: [PATCH 4/4] Fix linting and formatting

---
 .../src/bin/taxa-by-chunk.rs                  | 48 ++++++++++++-------
 .../src/bin/write-to-chunk.rs                 | 22 +++++----
 2 files changed, 45 insertions(+), 25 deletions(-)

diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
index 537d9701..64a20225 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/taxa-by-chunk.rs
@@ -1,4 +1,4 @@
-use std::fs::{File, read_dir};
+use std::fs::{read_dir, File};
 use std::io::{BufRead, BufWriter, Write};
 use std::path::{Path, PathBuf};
 
@@ -24,7 +24,10 @@ fn main() -> Result<()> {
             continue;
         }
 
-        let taxa_id: u64 = line.trim().parse().with_context(|| format!("Error parsing {line} as an integer"))?;
+        let taxa_id: u64 = line
+            .trim()
+            .parse()
+            .with_context(|| format!("Error parsing {line} as an integer"))?;
         all_taxa.push(taxa_id);
     }
 
@@ -38,8 +41,10 @@ fn main() -> Result<()> {
         }
 
         let base_name = match path.file_name() {
-            None => {continue;}
-            Some(n) => n.to_str().context("Error creating string from file path")?
+            None => {
+                continue;
+            }
+            Some(n) => n.to_str().context("Error creating string from file path")?,
         };
 
         if !chunk_file_regex.is_match(base_name) {
@@ -48,25 +53,34 @@ fn main() -> Result<()> {
 
         // Parse the taxa range out of the filename
         let replaced_name = base_name.replace("unipept.", "").replace(".chunk.gz", "");
-        let range = replaced_name.split_once("-");
+        let range = replaced_name.split_once('-');
         let range = range.with_context(|| format!("Unable to split {replaced_name} on '-'"))?;
-        let start: u64 = range.0.parse().with_context(|| format!("Error parsing {} as an integer", range.0))?;
-        let end: u64 = range.1.parse().with_context(|| format!("Error parsing {} as an integer", range.1))?;
-
-        let matching_taxa: Vec<&u64> = all_taxa.iter().filter(|&t| start <= *t && *t <= end).collect();
+        let start: u64 = range
+            .0
+            .parse()
+            .with_context(|| format!("Error parsing {} as an integer", range.0))?;
+        let end: u64 = range
+            .1
+            .parse()
+            .with_context(|| format!("Error parsing {} as an integer", range.1))?;
+
+        let matching_taxa: Vec<&u64> = all_taxa
+            .iter()
+            .filter(|&t| start <= *t && *t <= end)
+            .collect();
 
         // Write matches to a temporary output file
         if !matching_taxa.is_empty() {
-            let mapped_taxa: Vec<String> = matching_taxa.iter().map(|&t| format!("\t{t}$")).collect();
+            let mapped_taxa: Vec<String> =
+                matching_taxa.iter().map(|&t| format!("\t{t}$")).collect();
             let joined_taxa = mapped_taxa.join("\n");
 
             let temp_file_path = Path::new(&args.temp_dir).join(format!("{base_name}.pattern"));
-            let temp_file = File::create(&temp_file_path).context("Error creating temporary pattern file")?;
+            let temp_file =
+                File::create(&temp_file_path).context("Error creating temporary pattern file")?;
             let mut writer = BufWriter::new(temp_file);
-            write!(
-                &mut writer,
-                "{joined_taxa}",
-            ).context("Error writing to temporary pattern file")?;
+            let flushed = write!(&mut writer, "{joined_taxa}").and_then(|()| writer.flush());
+            flushed.context("Error writing to temporary pattern file")?;
 
             // The two unwraps here can't be handled using the ? operator
             println!("{}", temp_file_path.into_os_string().into_string().unwrap());
@@ -83,5 +97,5 @@ struct Cli {
     chunk_dir: PathBuf,
 
     #[clap(long)]
-    temp_dir: PathBuf
-}
\ No newline at end of file
+    temp_dir: PathBuf,
+}
diff --git a/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs
index a796ece8..43bdaba2 100644
--- a/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs
+++ b/scripts/helper_scripts/unipept-database-rs/src/bin/write-to-chunk.rs
@@ -17,7 +17,8 @@ fn main() -> Result<()> {
         let next = TAXA_BOUNDS[idx + 1];
         let file_name = format!("unipept.{bound}-{next}.chunk");
         let file_path = Path::new(&args.output_dir).join(file_name);
-        let file_handler = File::create(file_path).with_context(|| format!("Unable to create output file {bound}-{next}"))?;
+        let file_handler = File::create(file_path)
+            .with_context(|| format!("Unable to create output file {bound}-{next}"))?;
         let writer = BufWriter::new(file_handler);
         file_streams.push(writer);
     }
@@ -26,7 +27,9 @@ fn main() -> Result<()> {
 
     // First read the header
     let mut header: String = String::new();
-    reader.read_line(&mut header).context("Error reading header from stdin")?;
+    reader
+        .read_line(&mut header)
+        .context("Error reading header from stdin")?;
     write_header(&args.output_dir, header)?;
 
     // Then the rest of the data
@@ -38,7 +41,10 @@ fn main() -> Result<()> {
         }
 
         let spl: Vec<&str> = line.split('\t').collect();
-        let taxon_id = spl[8].trim().parse::<usize>().with_context(|| format!("Error parsing {} as an integer", spl[8]))?;
+        // Checked lookup: a row with too few columns becomes an error instead of a panic
+        let field = spl.get(8).context("Line has fewer than 9 columns")?;
+        let taxon_id = field.trim().parse::<usize>();
+        let taxon_id = taxon_id.with_context(|| format!("Error parsing {field} as an integer"))?;
 
         // Find the index of this taxon id in the array
         // Note that this can be sped up using binary search (see Python's bisect.bisect_left),
@@ -63,15 +69,15 @@ struct Cli {
 }
 
 const TAXA_BOUNDS: [usize; 45] = [
-    0, 550, 1352, 3047, 5580, 8663, 11676, 32473, 40214, 52774, 66656, 86630, 116960, 162147, 210225, 267979, 334819,
-    408172, 470868, 570509, 673318, 881260, 1046115, 1136135, 1227077, 1300307, 1410620, 1519492, 1650438, 1756149,
-    1820614, 1871070, 1898104, 1922217, 1978231, 2024617, 2026757, 2035430, 2070414, 2202732, 2382165, 2527964, 2601669,
-    2706029, 10000000
+    0, 550, 1352, 3047, 5580, 8663, 11676, 32473, 40214, 52774, 66656, 86630, 116960, 162147,
+    210225, 267979, 334819, 408172, 470868, 570509, 673318, 881260, 1046115, 1136135, 1227077,
+    1300307, 1410620, 1519492, 1650438, 1756149, 1820614, 1871070, 1898104, 1922217, 1978231,
+    2024617, 2026757, 2035430, 2070414, 2202732, 2382165, 2527964, 2601669, 2706029, 10000000,
 ];
 
 fn write_header(output_dir: &PathBuf, header: String) -> Result<()> {
     let file_path = Path::new(output_dir).join("db.header");
-    let file_handler = File::create(file_path).with_context(|| format!("Unable to create header output file"))?;
+    let file_handler = File::create(file_path).context("Unable to create header output file")?;
     let mut writer = BufWriter::new(file_handler);
 
     write!(&mut writer, "{}", header).context("Error writing header")?;